summaryrefslogtreecommitdiff
path: root/source/Syntax/Token.hs
diff options
context:
space:
mode:
authoradelon <22380201+adelon@users.noreply.github.com>2024-02-10 02:22:14 +0100
committeradelon <22380201+adelon@users.noreply.github.com>2024-02-10 02:22:14 +0100
commit442d732696ad431b84f6e5c72b6ee785be4fd968 (patch)
treeb476f395e7e91d67bacb6758bc84914b8711593f /source/Syntax/Token.hs
Initial commit
Diffstat (limited to 'source/Syntax/Token.hs')
-rw-r--r--source/Syntax/Token.hs438
1 files changed, 438 insertions, 0 deletions
diff --git a/source/Syntax/Token.hs b/source/Syntax/Token.hs
new file mode 100644
index 0000000..7c8606c
--- /dev/null
+++ b/source/Syntax/Token.hs
@@ -0,0 +1,438 @@
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+-- |
+-- This module defines the lexer and its associated data types.
+-- The lexer takes `Text` as input and produces a stream of tokens
+-- annotated with positional information. This information is bundled
+-- together with the original raw input for producing error messages.
+--
+-- The lexer perfoms some normalizations to make describing the grammar easier.
+-- Words outside of math environments are case-folded. Some commands are analysed
+-- as variable tokens and are equivalent to their respective unicode variants
+-- (α, β, γ, ..., 𝔸, 𝔹, ℂ, ...). Similarly, @\\begin{...}@ and @\\end{...}@ commands
+-- are each parsed as single tokens.
+--
+module Syntax.Token
+ ( Token(..)
+ , tokToString
+ , tokToText
+ , TokStream(..)
+ , Located(..)
+ , runLexer
+ ) where
+
+
+import Base hiding (many)
+
+import Control.Monad.Combinators
+import Control.Monad.State.Strict
+import Data.Char (isAsciiLower)
+import Data.List.NonEmpty qualified as NonEmpty
+import Data.Text qualified as Text
+import Prettyprinter (Pretty(..))
+import Text.Megaparsec hiding (Token, Label, label)
+import Text.Megaparsec.Char qualified as Char
+import Text.Megaparsec.Char.Lexer qualified as Lexer
+
+
+runLexer :: String -> Text -> Either (ParseErrorBundle Text Void) [Located Token]
+runLexer file raw = runParser (evalStateT toks initLexerState) file raw
+
+
+type Lexer = StateT LexerState (Parsec Void Text)
+
+
+data LexerState = LexerState
+ { textNesting :: !Int
+ -- ^ Represents nesting of braces inside of the @\text{...}@
+ -- command. When we encounter @\text@ the token mode switches
+ -- to text tokens. In order to switch back to math mode correctly
+ -- we need to count the braces.
+ , mode :: !Mode
+ } deriving (Show, Eq)
+
+initLexerState :: LexerState
+initLexerState = LexerState 0 TextMode
+
+incrNesting, decrNesting :: LexerState -> LexerState
+incrNesting (LexerState n m) = LexerState (succ n) m
+decrNesting (LexerState n m) = LexerState (pred n) m
+
+data Mode = TextMode | MathMode deriving (Show, Eq)
+
+isTextMode, isMathMode :: Lexer Bool
+isTextMode = do
+ m <- gets mode
+ pure (m == TextMode)
+isMathMode = do
+ m <- gets mode
+ pure (m == MathMode)
+
+setTextMode, setMathMode :: Lexer ()
+setTextMode = do
+ st <- get
+ put st{mode = TextMode}
+setMathMode = do
+ st <- get
+ put st{mode = MathMode}
+
+-- |
+-- A token stream as input stream for a parser. Contains the raw input
+-- before tokenization as 'Text' for showing error messages.
+--
+data TokStream = TokStream
+ { rawInput :: !Text
+ , unTokStream :: ![Located Token]
+ } deriving (Show, Eq)
+
+instance Semigroup TokStream where
+ TokStream raw1 toks1 <> TokStream raw2 toks2 = TokStream (raw1 <> raw2) (toks1 <> toks2)
+
+instance Monoid TokStream where
+ mempty = TokStream mempty mempty
+
+-- | A LaTeX token.
+-- Invisible delimiters 'InvisibleBraceL' and 'InvisibleBraceR' are
+-- unescaped braces used for grouping in TEX (@{@),
+-- visibles braces are escaped braces (@\\{@).
+data Token
+ = Word !Text
+ | Variable !Text
+ | Symbol !Text
+ | Integer !Int
+ | Command !Text
+ | Label Text -- ^ A /@\\label{...}@/ command (case-sensitive).
+ | Ref (NonEmpty Text) -- ^ A /@\\ref{...}@/ command (case-sensitive).
+ | BeginEnv !Text
+ | EndEnv !Text
+ | ParenL | ParenR
+ | BracketL | BracketR
+ | VisibleBraceL | VisibleBraceR
+ | InvisibleBraceL | InvisibleBraceR
+ deriving (Show, Eq, Ord, Generic, Hashable)
+
+instance IsString Token where
+ fromString w = Word (Text.pack w)
+
+tokToText :: Token -> Text
+tokToText = \case
+ Word w -> w
+ Variable v -> v
+ Symbol s -> s
+ Integer n -> Text.pack (show n)
+ Command cmd -> Text.cons '\\' cmd
+ Label m -> "\\label{" <> m <> "}"
+ Ref ms -> "\\ref{" <> Text.intercalate ", " (toList ms) <> "}"
+ BeginEnv "math" -> "$"
+ EndEnv "math" -> "$"
+ BeginEnv env -> "\\begin{" <> env <> "}"
+ EndEnv env -> "\\end{" <> env <> "}"
+ ParenL -> "("
+ ParenR -> ")"
+ BracketL -> "["
+ BracketR -> "]"
+ VisibleBraceL -> "\\{"
+ VisibleBraceR -> "\\}"
+ InvisibleBraceL -> "{"
+ InvisibleBraceR -> "}"
+
+tokToString :: Token -> String
+tokToString = Text.unpack . tokToText
+
+instance Pretty Token where
+ pretty = \case
+ Word w -> pretty w
+ Variable v -> pretty v
+ Symbol s -> pretty s
+ Integer n -> pretty n
+ Command cmd -> "\\" <> pretty cmd
+ Label m -> "\\label{" <> pretty m <> "}"
+ Ref m -> "\\ref{" <> pretty m <> "}"
+ BeginEnv env -> "\\begin{" <> pretty env <> "}"
+ EndEnv env -> "\\end{" <> pretty env <> "}"
+ ParenL -> "("
+ ParenR -> ")"
+ BracketL -> "["
+ BracketR -> "]"
+ VisibleBraceL -> "\\{"
+ VisibleBraceR -> "\\}"
+ InvisibleBraceL -> "{"
+ InvisibleBraceR -> "}"
+
+
+data Located a = Located
+ { startPos :: !SourcePos
+ , unLocated :: !a
+ } deriving (Show)
+
+instance Eq a => Eq (Located a) where (==) = (==) `on` unLocated
+instance Ord a => Ord (Located a) where compare = compare `on` unLocated
+
+
+-- | Parses tokens, switching tokenizing mode when encountering math environments.
+toks :: Lexer [Located Token]
+toks = whitespace *> goNormal id <* eof
+ where
+ goNormal f = do
+ r <- optional tok
+ case r of
+ Nothing -> pure (f [])
+ Just t@(Located _ (BeginEnv "math")) -> goMath (f . (t:))
+ Just t@(Located _ (BeginEnv "align*")) -> goMath (f . (t:))
+ Just t -> goNormal (f . (t:))
+ goText f = do
+ r <- optional textToken
+ case r of
+ Nothing -> pure (f [])
+ Just t@(Located _ (BeginEnv "math")) -> goMathInText (f . (t:))
+ Just t@(Located _ (EndEnv "text")) -> goMath (f . (t:))
+ Just t@(Located _ (EndEnv "explanation")) -> goMath (f . (t:))
+ Just t -> goText (f . (t:))
+ goMath f = do
+ r <- optional mathToken
+ case r of
+ Nothing -> pure (f [])
+ Just t@(Located _ (EndEnv "math")) -> goNormal (f . (t:))
+ Just t@(Located _ (EndEnv "align*")) -> goNormal (f . (t:))
+ Just t@(Located _ (BeginEnv "text")) -> goText (f . (t:))
+ Just t@(Located _ (BeginEnv "explanation")) -> goText (f . (t:))
+ Just t -> goMath (f . (t:))
+ goMathInText f = do
+ r <- optional mathToken
+ case r of
+ Nothing -> pure (f [])
+ Just t@(Located _ (EndEnv "math")) -> goText (f . (t:))
+ Just t@(Located _ (BeginEnv "text")) -> goText (f . (t:))
+ Just t -> goMathInText (f . (t:))
+{-# INLINE toks #-}
+
+-- | Parses a single normal mode token.
+tok :: Lexer (Located Token)
+tok =
+ word <|> var <|> symbol <|> mathBegin <|> alignBegin <|> begin <|> end <|> opening <|> closing <|> label <|> ref <|> command
+
+-- | Parses a single math mode token.
+mathToken :: Lexer (Located Token)
+mathToken =
+ var <|> symbol <|> number <|> begin <|> alignEnd <|> end <|> opening <|> closing <|> beginText <|> beginExplanation <|> mathEnd <|> command
+
+beginText :: Lexer (Located Token)
+beginText = lexeme do
+ Char.string "\\text{"
+ setTextMode
+ pure (BeginEnv "text")
+
+-- | Same as text modulo spacing, so we treat it synonymously
+beginExplanation :: Lexer (Located Token)
+beginExplanation = lexeme do
+ Char.string "\\explanation{"
+ setTextMode
+ pure (BeginEnv "text")
+
+-- | Normal mode embedded into math mode via @\text{...}@.
+textToken :: Lexer (Located Token)
+textToken = word <|> symbol <|> begin <|> end <|> textEnd <|> mathBegin <|> alignBegin <|> opening' <|> closing' <|> ref <|> command
+ where
+ textEnd = lexeme do
+ 0 <- gets textNesting -- Otherwise fail.
+ Char.char '}'
+ setMathMode
+ pure (EndEnv "text")
+
+ opening' = lexeme (brace <|> group <|> paren <|> bracket)
+ where
+ brace = VisibleBraceL <$ lexeme (Char.string "\\{")
+ group = InvisibleBraceL <$ lexeme (Char.char '{') <* modify' incrNesting
+ paren = ParenL <$ lexeme (Char.char '(')
+ bracket = BracketL <$ lexeme (Char.char '[')
+
+ closing' = lexeme (brace <|> group <|> paren <|> bracket)
+ where
+ brace = VisibleBraceR <$ lexeme (Char.string "\\}")
+ group = InvisibleBraceR <$ lexeme (Char.char '}') <* modify' decrNesting
+ paren = ParenR <$ lexeme (Char.char ')')
+ bracket = BracketR <$ lexeme (Char.char ']')
+
+
+-- | Parses a single begin math token.
+mathBegin :: Lexer (Located Token)
+mathBegin = guardM isTextMode *> lexeme do
+ Char.string "\\(" <|> Char.string "\\[" <|> Char.string "$"
+ setMathMode
+ pure (BeginEnv "math")
+
+alignBegin :: Lexer (Located Token)
+alignBegin = guardM isTextMode *> lexeme do
+ Char.string "\\begin{align*}"
+ setMathMode
+ pure (BeginEnv "align*")
+
+
+-- | Parses a single end math token.
+mathEnd :: Lexer (Located Token)
+mathEnd = guardM isMathMode *> lexeme do
+ Char.string "\\)" <|> Char.string "\\]" <|> Char.string "$"
+ setTextMode
+ pure (EndEnv "math")
+
+alignEnd :: Lexer (Located Token)
+alignEnd = guardM isMathMode *> lexeme do
+ Char.string "\\end{align*}"
+ setTextMode
+ pure (EndEnv "align*")
+
+
+-- | Parses a word. Words are returned casefolded, since we want to ignore their case later on.
+word :: Lexer (Located Token)
+word = guardM isTextMode *> lexeme do
+ w <- some (Char.letterChar <|> Char.char '\'' <|> Char.char '-')
+ let t = Word (Text.toCaseFold (Text.pack w))
+ pure t
+
+number :: Lexer (Located Token)
+number = lexeme $ Integer <$> Lexer.decimal
+
+
+var :: Lexer (Located Token)
+var = guardM isMathMode *> lexeme (fmap Variable var')
+ where
+ var' = do
+ alphabeticPart <- letter <|> bb <|> greek
+ variationPart <- subscriptNumber <|> ticked <|> pure ""
+ pure (alphabeticPart <> variationPart)
+
+ subscriptNumber :: Lexer Text
+ subscriptNumber = do
+ Char.char '_'
+ n <- some Char.digitChar
+ pure (Text.pack n)
+
+ -- Temporary hack to fit the TPTP format.
+ ticked :: Lexer Text
+ ticked = do
+ ticks <- some $ Char.char '\''
+ let ticks' = "prime" <$ ticks :: [Text]
+ pure (Text.concat ticks')
+
+ letter :: Lexer Text
+ letter = fmap Text.singleton Char.letterChar
+
+ greek :: Lexer Text
+ greek = try do
+ Char.char '\\'
+ l <- symbolParser greeks
+ notFollowedBy Char.letterChar
+ pure l
+
+ greeks :: [Text]
+ greeks =
+ [ "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta"
+ , "iota", "kappa", "lambda", "mu", "nu", "xi", "pi", "rho", "sigma"
+ , "tau", "upsilon", "phi", "chi", "psi", "omega"
+ , "Gamma", "Delta", "Theta", "Lambda", "Xi", "Pi", "Sigma", "Upsilon"
+ , "Phi", "Psi", "Omega"
+ ]
+
+ bb :: Lexer Text
+ bb = do
+ Char.string "\\mathbb{"
+ l <- symbolParser bbs
+ Char.char '}'
+ pure $ "bb" <> l
+
+ bbs :: [Text]
+ bbs = Text.singleton <$> ['A'..'Z']
+
+
+ symbolParser :: [Text] -> Lexer Text
+ symbolParser symbols = asum (fmap Char.string symbols)
+
+
+symbol :: Lexer (Located Token)
+symbol = lexeme do
+ symb <- some (satisfy (`elem` symbols))
+ pure (Symbol (Text.pack symb))
+ where
+ symbols :: [Char]
+ symbols = ".,:;!?@=≠+-/|^><≤≥*&≈⊂⊃⊆⊇∈“”‘’"
+
+-- | Parses a TEX-style command.
+command :: Lexer (Located Token)
+command = lexeme do
+ Char.char '\\'
+ cmd <- some Char.letterChar
+ pure (Command (Text.pack cmd))
+
+-- | Parses the beginning of an environment.
+-- Commits only after having seen "\begin{".
+begin :: Lexer (Located Token)
+begin = lexeme do
+ Char.string "\\begin{"
+ env <- some (Char.letterChar <|> Char.char '*')
+ Char.char '}'
+ pure (BeginEnv (Text.pack env))
+
+-- | Parses a label command and extracts its marker.
+label :: Lexer (Located Token)
+label = lexeme do
+ Char.string "\\label{"
+ m <- marker
+ Char.char '}'
+ pure (Label m)
+
+-- | Parses a label command and extracts its marker.
+ref :: Lexer (Located Token)
+ref = lexeme do
+ -- @\\cref@ is from @cleveref@ and @\\hyperref@ is from @hyperref@
+ cmd <- Char.string "\\ref{" <|> Char.string "\\cref{" <|> Char.string "\\hyperref["
+ ms <- NonEmpty.fromList <$> marker `sepBy1` Char.char ','
+ case cmd of
+ "\\hyperref[" -> Char.string "]{" *> some (satisfy (/= '}')) *> Char.char '}' *> pure (Ref ms)
+ _ -> Char.char '}' *> pure (Ref ms)
+
+marker :: Lexer Text
+marker = takeWhile1P Nothing (\x -> isAsciiLower x || x == '_')
+
+-- | Parses the end of an environment.
+-- Commits only after having seen "\end{".
+end :: Lexer (Located Token)
+end = lexeme do
+ Char.string "\\end{"
+ env <- some (Char.letterChar <|> Char.char '*')
+ Char.char '}'
+ pure (EndEnv (Text.pack env))
+
+-- | Parses an opening delimiter.
+opening :: Lexer (Located Token)
+opening = lexeme (paren <|> brace <|> group <|> bracket)
+ where
+ brace = VisibleBraceL <$ lexeme (Char.string "\\{")
+ group = InvisibleBraceL <$ lexeme (Char.char '{')
+ paren = ParenL <$ lexeme (Char.char '(')
+ bracket = BracketL <$ lexeme (Char.char '[')
+
+-- | Parses a closing delimiter.
+closing :: Lexer (Located Token)
+closing = lexeme (paren <|> brace <|> group <|> bracket)
+ where
+ brace = VisibleBraceR <$ lexeme (Char.string "\\}")
+ group = InvisibleBraceR <$ lexeme (Char.char '}')
+ paren = ParenR <$ lexeme (Char.char ')')
+ bracket = BracketR <$ lexeme (Char.char ']')
+
+-- | Turns a Lexer into one that tracks the source position of the token
+-- and consumes trailing whitespace.
+lexeme :: Lexer a -> Lexer (Located a)
+lexeme p = do
+ start <- getSourcePos
+ t <- p
+ whitespace
+ pure (Located start t)
+
+space :: Lexer ()
+space = void (Char.char ' ' <|> Char.char '\n')
+ <|> void (Char.string "\\ " <|> Char.string "\\\\" <|> Char.string "\\!" <|> Char.string "\\," <|> Char.string "\\:" <|> Char.string "\\;" <|> Char.string "\\;")
+
+whitespace :: Lexer ()
+whitespace = Lexer.space (void (some space)) (Lexer.skipLineComment "%") empty