Initial commit

author: adelon <22380201+adelon@users.noreply.github.com> 2024-02-10 02:22:14 +0100
committer: adelon <22380201+adelon@users.noreply.github.com> 2024-02-10 02:22:14 +0100
commit: 442d732696ad431b84f6e5c72b6ee785be4fd968 (patch)
tree: b476f395e7e91d67bacb6758bc84914b8711593f /source/Syntax/Token.hs
1 files changed, 438 insertions, 0 deletions
diff --git a/source/Syntax/Token.hs b/source/Syntax/Token.hs
new file mode 100644
index 0000000..7c8606c
--- /dev/null
+++ b/source/Syntax/Token.hs
@@ -0,0 +1,438 @@
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+-- |
+-- This module defines the lexer and its associated data types.
+-- The lexer takes `Text` as input and produces a stream of tokens
+-- annotated with positional information. This information is bundled
+-- together with the original raw input for producing error messages.
+--
+-- The lexer perfoms some normalizations to make describing the grammar easier.
+-- Words outside of math environments are case-folded. Some commands are analysed
+-- as variable tokens and are equivalent to their respective unicode variants
+-- (α, β, γ, ..., 𝔸, 𝔹, ℂ, ...). Similarly, @\\begin{...}@ and @\\end{...}@ commands
+-- are each parsed as single tokens.
+--
+module Syntax.Token
+    ( Token(..)
+    , tokToString
+    , tokToText
+    , TokStream(..)
+    , Located(..)
+    , runLexer
+    ) where
+
+
+import Base hiding (many)
+
+import Control.Monad.Combinators
+import Control.Monad.State.Strict
+import Data.Char (isAsciiLower)
+import Data.List.NonEmpty qualified as NonEmpty
+import Data.Text qualified as Text
+import Prettyprinter (Pretty(..))
+import Text.Megaparsec hiding (Token, Label, label)
+import Text.Megaparsec.Char qualified as Char
+import Text.Megaparsec.Char.Lexer qualified as Lexer
+
+
+runLexer :: String -> Text -> Either (ParseErrorBundle Text Void) [Located Token]
+runLexer file raw = runParser (evalStateT toks initLexerState) file raw
+
+
+type Lexer = StateT LexerState (Parsec Void Text)
+
+
+data LexerState = LexerState
+    { textNesting :: !Int
+    -- ^ Represents nesting of braces inside of the @\text{...}@
+    -- command. When we encounter @\text@ the token mode switches
+    -- to text tokens. In order to switch back to math mode correctly
+    -- we need to count the braces.
+    , mode :: !Mode
+    } deriving (Show, Eq)
+
+initLexerState :: LexerState
+initLexerState = LexerState 0 TextMode
+
+incrNesting, decrNesting :: LexerState -> LexerState
+incrNesting (LexerState n m) = LexerState (succ n) m
+decrNesting (LexerState n m) = LexerState (pred n) m
+
+data Mode = TextMode | MathMode deriving (Show, Eq)
+
+isTextMode, isMathMode :: Lexer Bool
+isTextMode = do
+    m <- gets mode
+    pure (m == TextMode)
+isMathMode = do
+    m <- gets mode
+    pure (m == MathMode)
+
+setTextMode, setMathMode :: Lexer ()
+setTextMode = do
+    st <- get
+    put st{mode = TextMode}
+setMathMode = do
+    st <- get
+    put st{mode = MathMode}
+
+-- |
+-- A token stream as input stream for a parser. Contains the raw input
+-- before tokenization as 'Text' for showing error messages.
+--
+data TokStream = TokStream
+    { rawInput :: !Text
+    , unTokStream :: ![Located Token]
+    } deriving (Show, Eq)
+
+instance Semigroup TokStream where
+    TokStream raw1 toks1 <> TokStream raw2 toks2 = TokStream (raw1 <> raw2) (toks1 <> toks2)
+
+instance Monoid TokStream where
+    mempty = TokStream mempty mempty
+
+-- | A LaTeX token.
+-- Invisible delimiters 'InvisibleBraceL' and 'InvisibleBraceR' are
+-- unescaped braces used for grouping in TEX (@{@),
+-- visibles braces are escaped braces (@\\{@).
+data Token
+    = Word !Text
+    | Variable !Text
+    | Symbol !Text
+    | Integer !Int
+    | Command !Text
+    | Label Text -- ^ A /@\\label{...}@/ command (case-sensitive).
+    | Ref (NonEmpty Text) -- ^ A /@\\ref{...}@/ command (case-sensitive).
+    | BeginEnv !Text
+    | EndEnv !Text
+    | ParenL | ParenR
+    | BracketL | BracketR
+    | VisibleBraceL | VisibleBraceR
+    | InvisibleBraceL | InvisibleBraceR
+    deriving (Show, Eq, Ord, Generic, Hashable)
+
+instance IsString Token where
+    fromString w = Word (Text.pack w)
+
+tokToText :: Token -> Text
+tokToText = \case
+    Word w -> w
+    Variable v -> v
+    Symbol s -> s
+    Integer n -> Text.pack (show n)
+    Command cmd -> Text.cons '\\' cmd
+    Label m -> "\\label{" <> m <> "}"
+    Ref ms -> "\\ref{" <> Text.intercalate ", " (toList ms) <> "}"
+    BeginEnv "math" -> "$"
+    EndEnv "math" -> "$"
+    BeginEnv env -> "\\begin{" <> env <> "}"
+    EndEnv env -> "\\end{" <> env <> "}"
+    ParenL -> "("
+    ParenR -> ")"
+    BracketL -> "["
+    BracketR -> "]"
+    VisibleBraceL -> "\\{"
+    VisibleBraceR -> "\\}"
+    InvisibleBraceL -> "{"
+    InvisibleBraceR -> "}"
+
+tokToString :: Token -> String
+tokToString = Text.unpack . tokToText
+
+instance Pretty Token where
+    pretty = \case
+        Word w -> pretty w
+        Variable v -> pretty v
+        Symbol s -> pretty s
+        Integer n -> pretty n
+        Command cmd -> "\\" <> pretty cmd
+        Label m -> "\\label{" <> pretty m <> "}"
+        Ref m -> "\\ref{" <> pretty m <> "}"
+        BeginEnv env -> "\\begin{" <> pretty env <> "}"
+        EndEnv env -> "\\end{" <> pretty env <> "}"
+        ParenL -> "("
+        ParenR -> ")"
+        BracketL -> "["
+        BracketR -> "]"
+        VisibleBraceL -> "\\{"
+        VisibleBraceR -> "\\}"
+        InvisibleBraceL -> "{"
+        InvisibleBraceR -> "}"
+
+
+data Located a = Located
+    { startPos :: !SourcePos
+    , unLocated :: !a
+    } deriving (Show)
+
+instance Eq a  => Eq  (Located a) where (==) = (==) `on` unLocated
+instance Ord a => Ord (Located a) where compare = compare `on` unLocated
+
+
+-- | Parses tokens, switching tokenizing mode when encountering math environments.
+toks :: Lexer [Located Token]
+toks = whitespace *> goNormal id <* eof
+    where
+        goNormal f = do
+            r <- optional tok
+            case r of
+                Nothing -> pure (f [])
+                Just t@(Located _ (BeginEnv "math")) -> goMath (f . (t:))
+                Just t@(Located _ (BeginEnv "align*")) -> goMath (f . (t:))
+                Just t -> goNormal (f . (t:))
+        goText f = do
+            r <- optional textToken
+            case r of
+                Nothing -> pure (f [])
+                Just t@(Located _ (BeginEnv "math")) -> goMathInText (f . (t:))
+                Just t@(Located _ (EndEnv "text")) -> goMath (f . (t:))
+                Just t@(Located _ (EndEnv "explanation")) -> goMath (f . (t:))
+                Just t -> goText (f . (t:))
+        goMath f = do
+            r <- optional mathToken
+            case r of
+                Nothing -> pure (f [])
+                Just t@(Located _ (EndEnv "math")) -> goNormal (f . (t:))
+                Just t@(Located _ (EndEnv "align*")) -> goNormal (f . (t:))
+                Just t@(Located _ (BeginEnv "text")) -> goText (f . (t:))
+                Just t@(Located _ (BeginEnv "explanation")) -> goText (f . (t:))
+                Just t -> goMath (f . (t:))
+        goMathInText f = do
+            r <- optional mathToken
+            case r of
+                Nothing -> pure (f [])
+                Just t@(Located _ (EndEnv "math")) -> goText (f . (t:))
+                Just t@(Located _ (BeginEnv "text")) -> goText (f . (t:))
+                Just t -> goMathInText (f . (t:))
+{-# INLINE toks #-}
+
+-- | Parses a single normal mode token.
+tok :: Lexer (Located Token)
+tok =
+    word <|> var <|> symbol <|> mathBegin <|> alignBegin <|> begin <|> end <|> opening <|> closing <|> label <|> ref <|> command
+
+-- | Parses a single math mode token.
+mathToken :: Lexer (Located Token)
+mathToken =
+    var <|> symbol <|> number <|> begin <|> alignEnd <|> end <|> opening <|> closing <|> beginText <|> beginExplanation <|> mathEnd <|> command
+
+beginText :: Lexer (Located Token)
+beginText = lexeme do
+    Char.string "\\text{"
+    setTextMode
+    pure (BeginEnv "text")
+
+-- | Same as text modulo spacing, so we treat it synonymously
+beginExplanation :: Lexer (Located Token)
+beginExplanation = lexeme do
+    Char.string "\\explanation{"
+    setTextMode
+    pure (BeginEnv "text")
+
+-- | Normal mode embedded into math mode via @\text{...}@.
+textToken :: Lexer (Located Token)
+textToken = word <|> symbol <|> begin <|> end <|> textEnd <|> mathBegin <|> alignBegin <|> opening' <|> closing' <|> ref <|> command
+    where
+        textEnd = lexeme do
+            0 <- gets textNesting -- Otherwise fail.
+            Char.char '}'
+            setMathMode
+            pure (EndEnv "text")
+
+        opening' = lexeme (brace <|> group <|> paren <|> bracket)
+            where
+                brace = VisibleBraceL <$ lexeme (Char.string "\\{")
+                group = InvisibleBraceL <$ lexeme (Char.char '{') <* modify' incrNesting
+                paren = ParenL <$ lexeme (Char.char '(')
+                bracket = BracketL <$ lexeme (Char.char '[')
+
+        closing' = lexeme (brace <|> group <|> paren <|> bracket)
+            where
+                brace = VisibleBraceR <$ lexeme (Char.string "\\}")
+                group = InvisibleBraceR <$ lexeme (Char.char '}') <* modify' decrNesting
+                paren = ParenR <$ lexeme (Char.char ')')
+                bracket = BracketR <$ lexeme (Char.char ']')
+
+
+-- | Parses a single begin math token.
+mathBegin :: Lexer (Located Token)
+mathBegin = guardM isTextMode *> lexeme do
+    Char.string "\\(" <|> Char.string "\\[" <|> Char.string "$"
+    setMathMode
+    pure (BeginEnv "math")
+
+alignBegin :: Lexer (Located Token)
+alignBegin = guardM isTextMode *> lexeme do
+    Char.string "\\begin{align*}"
+    setMathMode
+    pure (BeginEnv "align*")
+
+
+-- | Parses a single end math token.
+mathEnd :: Lexer (Located Token)
+mathEnd = guardM isMathMode *> lexeme do
+    Char.string "\\)" <|> Char.string "\\]" <|> Char.string "$"
+    setTextMode
+    pure (EndEnv "math")
+
+alignEnd :: Lexer (Located Token)
+alignEnd = guardM isMathMode *> lexeme do
+    Char.string "\\end{align*}"
+    setTextMode
+    pure (EndEnv "align*")
+
+
+-- | Parses a word. Words are returned casefolded, since we want to ignore their case later on.
+word :: Lexer (Located Token)
+word = guardM isTextMode *> lexeme do
+    w <- some (Char.letterChar <|> Char.char '\'' <|> Char.char '-')
+    let t = Word (Text.toCaseFold (Text.pack w))
+    pure t
+
+number :: Lexer (Located Token)
+number = lexeme $ Integer <$> Lexer.decimal
+
+
+var :: Lexer (Located Token)
+var = guardM isMathMode *> lexeme (fmap Variable var')
+    where
+    var' = do
+        alphabeticPart <- letter <|> bb <|> greek
+        variationPart  <- subscriptNumber <|> ticked <|> pure ""
+        pure (alphabeticPart <> variationPart)
+
+    subscriptNumber :: Lexer Text
+    subscriptNumber = do
+        Char.char '_'
+        n <- some Char.digitChar
+        pure (Text.pack n)
+
+    -- Temporary hack to fit the TPTP format.
+    ticked :: Lexer Text
+    ticked = do
+        ticks <- some $ Char.char '\''
+        let ticks' = "prime" <$ ticks :: [Text]
+        pure (Text.concat ticks')
+
+    letter :: Lexer Text
+    letter = fmap Text.singleton Char.letterChar
+
+    greek :: Lexer Text
+    greek = try do
+        Char.char '\\'
+        l <- symbolParser greeks
+        notFollowedBy Char.letterChar
+        pure l
+
+    greeks :: [Text]
+    greeks =
+        [ "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta"
+        , "iota", "kappa", "lambda", "mu", "nu", "xi", "pi", "rho", "sigma"
+        , "tau", "upsilon", "phi", "chi", "psi", "omega"
+        , "Gamma", "Delta", "Theta", "Lambda", "Xi", "Pi", "Sigma", "Upsilon"
+        , "Phi", "Psi", "Omega"
+        ]
+
+    bb :: Lexer Text
+    bb = do
+        Char.string "\\mathbb{"
+        l <-  symbolParser bbs
+        Char.char '}'
+        pure $ "bb" <> l
+
+    bbs :: [Text]
+    bbs = Text.singleton <$> ['A'..'Z']
+
+
+    symbolParser :: [Text] -> Lexer Text
+    symbolParser symbols = asum (fmap Char.string symbols)
+
+
+symbol :: Lexer (Located Token)
+symbol = lexeme do
+    symb <- some (satisfy (`elem` symbols))
+    pure (Symbol (Text.pack symb))
+        where
+        symbols :: [Char]
+        symbols = ".,:;!?@=≠+-/|^><≤≥*&≈⊂⊃⊆⊇∈“”‘’"
+
+-- | Parses a TEX-style command.
+command :: Lexer (Located Token)
+command = lexeme do
+    Char.char '\\'
+    cmd <- some Char.letterChar
+    pure (Command (Text.pack cmd))
+
+-- | Parses the beginning of an environment.
+-- Commits only after having seen "\begin{".
+begin :: Lexer (Located Token)
+begin = lexeme do
+    Char.string "\\begin{"
+    env <- some (Char.letterChar <|> Char.char '*')
+    Char.char '}'
+    pure (BeginEnv (Text.pack env))
+
+-- | Parses a label command and extracts its marker.
+label :: Lexer (Located Token)
+label = lexeme do
+    Char.string "\\label{"
+    m <- marker
+    Char.char '}'
+    pure (Label m)
+
+-- | Parses a label command and extracts its marker.
+ref :: Lexer (Located Token)
+ref = lexeme do
+    -- @\\cref@ is from @cleveref@ and @\\hyperref@ is from @hyperref@
+    cmd <- Char.string "\\ref{" <|> Char.string "\\cref{" <|> Char.string "\\hyperref["
+    ms <- NonEmpty.fromList <$> marker `sepBy1` Char.char ','
+    case cmd of
+        "\\hyperref[" -> Char.string "]{" *> some (satisfy (/= '}')) *> Char.char '}' *> pure (Ref ms)
+        _ -> Char.char '}' *> pure (Ref ms)
+
+marker :: Lexer Text
+marker = takeWhile1P Nothing (\x -> isAsciiLower x || x == '_')
+
+-- | Parses the end of an environment.
+-- Commits only after having seen "\end{".
+end :: Lexer (Located Token)
+end = lexeme do
+    Char.string "\\end{"
+    env <- some (Char.letterChar <|> Char.char '*')
+    Char.char '}'
+    pure (EndEnv (Text.pack env))
+
+-- | Parses an opening delimiter.
+opening :: Lexer (Located Token)
+opening = lexeme (paren <|> brace <|> group <|> bracket)
+    where
+        brace = VisibleBraceL <$ lexeme (Char.string "\\{")
+        group = InvisibleBraceL <$ lexeme (Char.char '{')
+        paren = ParenL <$ lexeme (Char.char '(')
+        bracket = BracketL <$ lexeme (Char.char '[')
+
+-- | Parses a closing delimiter.
+closing :: Lexer (Located Token)
+closing = lexeme (paren <|> brace <|> group <|> bracket)
+    where
+        brace = VisibleBraceR <$ lexeme (Char.string "\\}")
+        group = InvisibleBraceR <$ lexeme (Char.char '}')
+        paren = ParenR <$ lexeme (Char.char ')')
+        bracket = BracketR <$ lexeme (Char.char ']')
+
+-- | Turns a Lexer into one that tracks the source position of the token
+-- and consumes trailing whitespace.
+lexeme :: Lexer a -> Lexer (Located a)
+lexeme p = do
+    start <- getSourcePos
+    t <- p
+    whitespace
+    pure (Located start t)
+
+space :: Lexer ()
+space = void (Char.char ' ' <|> Char.char '\n')
+    <|> void (Char.string "\\ " <|> Char.string "\\\\" <|> Char.string "\\!" <|> Char.string "\\," <|> Char.string "\\:" <|> Char.string "\\;" <|> Char.string "\\;")
+
+whitespace :: Lexer ()
+whitespace = Lexer.space (void (some space)) (Lexer.skipLineComment "%") empty
author	adelon <22380201+adelon@users.noreply.github.com>	2024-02-10 02:22:14 +0100
committer	adelon <22380201+adelon@users.noreply.github.com>	2024-02-10 02:22:14 +0100
commit	442d732696ad431b84f6e5c72b6ee785be4fd968 (patch)
tree	b476f395e7e91d67bacb6758bc84914b8711593f /source/Syntax/Token.hs