summaryrefslogtreecommitdiff
path: root/source/Syntax/Adapt.hs
diff options
context:
space:
mode:
Diffstat (limited to 'source/Syntax/Adapt.hs')
-rw-r--r--source/Syntax/Adapt.hs382
1 files changed, 382 insertions, 0 deletions
diff --git a/source/Syntax/Adapt.hs b/source/Syntax/Adapt.hs
new file mode 100644
index 0000000..0896c08
--- /dev/null
+++ b/source/Syntax/Adapt.hs
@@ -0,0 +1,382 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE ApplicativeDo #-}
+{-# LANGUAGE RecordWildCards #-}
+
+module Syntax.Adapt where
+
+import Base
+import Syntax.Abstract
+import Syntax.Lexicon
+import Syntax.Token
+
+import Data.Map.Strict qualified as Map
+import Data.Maybe (catMaybes)
+import Data.Set qualified as Set
+import Data.Sequence qualified as Seq
+import Data.HashSet qualified as HS
+import Data.HashMap.Strict qualified as HM
+import Data.Text qualified as Text
+import Text.Earley.Mixfix (Associativity(..))
+import Text.Regex.Applicative
+import Text.Megaparsec.Pos
+
+
+scanChunk :: [Located Token] -> [ScannedLexicalItem]
+scanChunk ltoks =
+ let toks = unLocated <$> ltoks
+ matchOrErr re env pos = match re toks ?? error ("could not find lexical pattern in " <> env <> " at " <> sourcePosPretty pos)
+ in case ltoks of
+ Located pos (BeginEnv "definition") : _ -> matchOrErr definition "definition" pos
+ Located pos (BeginEnv "abbreviation") : _ -> matchOrErr abbreviation "abbreviation" pos
+ Located pos (BeginEnv "struct") :_ -> matchOrErr struct "struct definition" pos
+ Located pos (BeginEnv "inductive") :_ -> matchOrErr inductive "inductive definition" pos
+ _ -> []
+
+adaptChunks :: [[Located Token]] -> Lexicon -> Lexicon
+adaptChunks = extendLexicon . concatMap scanChunk
+
+data ScannedLexicalItem
+ = ScanAdj LexicalPhrase Marker
+ | ScanFun LexicalPhrase Marker
+ | ScanNoun LexicalPhrase Marker
+ | ScanStructNoun LexicalPhrase Marker
+ | ScanVerb LexicalPhrase Marker
+ | ScanRelationSymbol RelationSymbol Marker
+ | ScanFunctionSymbol FunctionSymbol Marker
+ | ScanPrefixPredicate PrefixPredicate Marker
+ | ScanStructOp Text -- we an use the command text as export name.
+ deriving (Show, Eq)
+
+skipUntilNextLexicalEnv :: RE Token [Token]
+skipUntilNextLexicalEnv = many (psym otherToken)
+ where
+ otherToken tok = tok /= BeginEnv "definition" && tok /= BeginEnv "struct" && tok /= BeginEnv "abbreviation"
+
+notEndOfLexicalEnvToken :: RE Token Token
+notEndOfLexicalEnvToken = psym innerToken
+ where
+ innerToken tok = tok /= EndEnv "definition" && tok /= EndEnv "struct" && tok /= EndEnv "abbreviation"
+
+definition :: RE Token [ScannedLexicalItem]
+definition = do
+ sym (BeginEnv "definition")
+ few notEndOfLexicalEnvToken
+ m <- label
+ few anySym
+ lexicalItem <- head
+ few anySym
+ sym (EndEnv "definition")
+ skipUntilNextLexicalEnv
+ pure [lexicalItem m]
+
+abbreviation :: RE Token [ScannedLexicalItem]
+abbreviation = do
+ sym (BeginEnv "abbreviation")
+ few anySym
+ m <- label
+ few anySym
+ lexicalItem <- head
+ few anySym
+ sym (EndEnv "abbreviation")
+ skipUntilNextLexicalEnv
+ pure [lexicalItem m]
+
+label :: RE Token Marker
+label = msym \case
+ Label m -> Just (Marker m)
+ _ -> Nothing
+
+-- | 'RE' that matches the head of a definition.
+head :: RE Token (Marker -> ScannedLexicalItem)
+-- Note that @<|>@ is left biased for 'RE', so we can just
+-- place 'adj' before 'verb' and do not have to worry about
+-- overlapping patterns.
+head = ScanNoun <$> noun
+ <|> ScanAdj <$> adj
+ <|> ScanVerb <$> verb
+ <|> ScanFun <$> fun
+ <|> ScanRelationSymbol <$> relationSymbol
+ <|> ScanFunctionSymbol <$> functionSymbol
+ <|> ScanPrefixPredicate <$> prefixPredicate
+
+inductive :: RE Token [ScannedLexicalItem]
+inductive = do
+ sym (BeginEnv "inductive")
+ few notEndOfLexicalEnvToken
+ m <- label
+ few anySym
+ lexicalItem <- functionSymbolInductive
+ few anySym
+ sym (EndEnv "inductive")
+ skipUntilNextLexicalEnv
+ pure [ScanFunctionSymbol lexicalItem m]
+
+struct :: RE Token [ScannedLexicalItem]
+struct = do
+ sym (BeginEnv "struct")
+ few anySym
+ m <- label
+ few anySym
+ lexicalItem <- ScanStructNoun . toLexicalPhrase <$> (an *> structPat <* var)
+ few anySym
+ lexicalItems <- structOps <|> pure []
+ sym (EndEnv "struct")
+ skipUntilNextLexicalEnv
+ pure (lexicalItem m : lexicalItems)
+
+structOps :: RE Token [ScannedLexicalItem]
+structOps = do
+ sym (BeginEnv "enumerate")
+ lexicalItems <- many structOp
+ sym (EndEnv "enumerate")
+ few anySym
+ pure lexicalItems
+
+structOp :: RE Token ScannedLexicalItem
+structOp = do
+ sym (Command "item")
+ op <- math command
+ pure (ScanStructOp op)
+
+noun :: RE Token LexicalPhrase
+noun = toLexicalPhrase <$> (var *> is *> an *> pat <* iff)
+
+adj :: RE Token LexicalPhrase
+adj = toLexicalPhrase <$> (var *> is *> pat <* iff)
+
+verb :: RE Token LexicalPhrase
+verb = toLexicalPhrase <$> (var *> pat <* iff)
+
+fun :: RE Token LexicalPhrase
+fun = toLexicalPhrase <$> (the *> pat <* (is <|> comma))
+
+relationSymbol :: RE Token RelationSymbol
+relationSymbol = math relator' <* iff
+ where
+ relator' = do
+ varSymbol
+ rel <- symbol
+ varSymbol
+ pure rel
+
+functionSymbol :: RE Token FunctionSymbol
+functionSymbol = do
+ sym (BeginEnv "math")
+ toks <- few nonDefinitionKeyword
+ sym (Symbol "=")
+ pure (fromToken <$> toks)
+ where
+ fromToken = \case
+ Variable _ -> Nothing -- Variables become slots.
+ tok -> Just tok -- Everything else is part of the pattern.
+
+functionSymbolInductive :: RE Token FunctionSymbol
+functionSymbolInductive = do
+ sym (BeginEnv "math")
+ toks <- few nonDefinitionKeyword
+ sym (Command "subseteq")
+ pure (fromToken <$> toks)
+ where
+ fromToken = \case
+ Variable _ -> Nothing -- Variables become slots.
+ tok -> Just tok -- Everything else is part of the pattern.
+
+prefixPredicate :: RE Token PrefixPredicate
+prefixPredicate = math prfx <* iff
+ where
+ prfx = do
+ r <- command
+ args <- many (sym InvisibleBraceL *> varSymbol <* sym InvisibleBraceR)
+ pure (PrefixPredicate r (length args))
+
+
+command :: RE Token Text
+command = msym \case
+ Command cmd -> Just cmd
+ _ -> Nothing
+
+var :: RE Token Token
+var = math varSymbol
+
+varSymbol :: RE Token Token
+varSymbol = psym isVar
+
+
+nonDefinitionKeyword :: RE Token Token
+nonDefinitionKeyword = psym (`notElem` keywords)
+ where
+ keywords =
+ [ Word "if"
+ , Word "iff"
+ , Symbol "="
+ , Command "iff"
+ ]
+
+
+pat :: RE Token [Token]
+pat = many (psym isLexicalPhraseToken <|> var)
+
+structPat :: RE Token [Token]
+structPat = many (psym isLexicalPhraseToken)
+
+math :: RE Token a -> RE Token a
+math re = sym (BeginEnv "math") *> re <* sym (EndEnv "math")
+
+-- | We allow /conditional perfection/: the first /@if@/ in a definition is interpreted as /@iff@/.
+iff :: RE Token ()
+iff = void (sym (Word "if")) -- Using @void@ is faster (only requires recognition).
+ <|> void (sym (Word "iff"))
+ <|> void (string [Word "if", Word "and", Word "only", Word "if"])
+ <|> void (sym (Word "denote"))
+ <|> void (sym (Word "stand") *> sym (Word "for"))
+{-# INLINE iff #-}
+
+an :: RE Token ()
+an = void (sym (Word "a"))
+ <|> void (sym (Word "an"))
+{-# INLINE an #-}
+
+is :: RE Token ()
+is = void (sym (Word "is") <|> sym (Word "denotes"))
+{-# INLINE is #-}
+
+the :: RE Token ()
+the = void (sym (Word "the"))
+{-# INLINE the #-}
+
+comma :: RE Token ()
+comma = void (sym (Symbol ","))
+{-# INLINE comma #-}
+
+
+isVar :: Token -> Bool
+isVar = \case
+ Variable _ -> True
+ _token -> False
+
+isCommand :: Token -> Bool
+isCommand = \case
+ Command _ -> True
+ _token -> False
+
+isNotDefnToken :: Token -> Bool
+isNotDefnToken = \case
+ BeginEnv "definition" -> False
+ EndEnv "definition" -> False
+ _token -> True
+
+isLexicalPhraseToken :: Token -> Bool
+isLexicalPhraseToken = \case
+ Word w -> w `Set.notMember` keywords
+ --
+ -- Simple commands (outside of math-mode) are allowed. This is useful
+ -- for defining lexical phrases containing symbolic expressions such as
+ -- `X is \Ttwo{}`, where `\Ttwo` is a macro that expands to `T_2`.
+ -- We also allow these macros to take arguments, hence the need to
+ -- allow grouping delimiters. They can also be used to escape the end
+ -- of the command for correct spacing, as in the above example.
+ --
+ Command _cmd -> True
+ InvisibleBraceL -> True
+ InvisibleBraceR -> True
+ --
+ -- No other tokens may occur in lexical phrases. In particular, no `_dot`
+ -- token may occur, limiting the lexical phrase to a single sentence.
+ -- Commas occurring in variable lists should be placed
+ -- within the math environment. Thus `$a,b$ are coprime iff`,
+ -- not `$a$,`$b$` are coprime iff`.
+ --
+ _token -> False
+ where
+ keywords = Set.fromList ["a", "an", "is", "are", "if", "iff", "denote", "stand", "let"]
+
+
+toLexicalPhrase :: [Token] -> LexicalPhrase
+toLexicalPhrase toks = component <$> toks
+ where
+ component = \case
+ Variable _ -> Nothing
+ tok -> Just tok
+
+
+symbol :: RE Token Token
+symbol = msym $ \tok -> case tok of
+ Command _ -> Just tok
+ Symbol _ -> Just tok
+ _tok -> Nothing
+
+
+-- | Basic paradigms for pluralizations of nominals.
+guessNounPlural :: LexicalPhrase -> SgPl LexicalPhrase
+guessNounPlural item = SgPl item (pluralize item)
+ where
+ pluralize :: LexicalPhrase -> LexicalPhrase
+ pluralize = \case
+ Just (Word w) : pat'@(Just w' : _) | isPreposition w' -> Just (Word (Text.snoc w 's')) : pat'
+ [Just (Word w)] -> [Just (Word (Text.snoc w 's'))]
+ [tok, Just (Word w)] -> [tok, Just (Word (Text.snoc w 's'))]
+ [tok, tok', Just (Word w)] -> [tok, tok', Just (Word (Text.snoc w 's'))]
+ pat' -> pat'
+
+guessVerbPlural :: LexicalPhrase -> SgPl LexicalPhrase
+guessVerbPlural item = SgPl item itemPl
+ where
+ itemPl = case item of
+ Just (Word v) : rest -> case Text.unsnoc v of
+ Just (v', 's') -> Just (Word v') : rest
+ _ -> item
+ _ -> item
+
+isAdjR :: LexicalPhrase -> Bool
+isAdjR item = containsPreposition item || containsSlot item
+ where
+ containsPreposition, containsSlot :: LexicalPhrase -> Bool
+ containsPreposition = any isPreposition . catMaybes
+ containsSlot = (Nothing `elem`)
+
+isPreposition :: Token -> Bool
+isPreposition w = HS.member w (HS.map Word prepositions)
+
+-- | Right-biased set insertion, keeping the original set
+-- when inserting already present elements. @insertR@ is unfortunately
+-- a hidden function, even in @Data.Set.Internal@, so we approximate it
+-- here. In theory one could avoid the indirection of first forming the singleton
+-- set on the rhs.
+insertR' :: Hashable a => a -> HashSet a -> HashSet a
+insertR' x xs = xs `HS.union` HS.singleton x -- @union@ is left-biased.
+
+insertMapR :: Ord k => k -> a -> Map k a -> Map k a
+insertMapR k x xs = xs `Map.union` Map.singleton k x -- @union@ is left-biased.
+
+
+insertR :: Hashable k => k -> a -> HashMap k a -> HashMap k a
+insertR k x xs = xs `HM.union` HM.singleton k x -- @union@ is left-biased.
+
+-- | Takes the scanned lexical phrases and inserts them in the correct
+-- places in a lexicon.
+extendLexicon :: [ScannedLexicalItem] -> Lexicon -> Lexicon
+extendLexicon [] lexicon = lexicon
+-- Note that we only consider 'sg' in the 'Ord' instance of SgPl, so that
+-- known irregular plurals are preserved.
+extendLexicon (scan : scans) lexicon@Lexicon{..} = case scan of
+ ScanAdj item m -> if isAdjR item
+ then extendLexicon scans lexicon{lexiconAdjRs = insertR item m lexiconAdjRs}
+ else extendLexicon scans lexicon{lexiconAdjLs = insertR item m lexiconAdjLs}
+ ScanFun item m ->
+ extendLexicon scans lexicon{lexiconFuns = insertR (guessNounPlural item) m lexiconFuns}
+ ScanVerb item m ->
+ extendLexicon scans lexicon{lexiconVerbs = insertR (guessVerbPlural item) m lexiconVerbs}
+ ScanNoun item m ->
+ extendLexicon scans lexicon{lexiconNouns = insertR (guessNounPlural item) m lexiconNouns}
+ ScanStructNoun item m ->
+ extendLexicon scans lexicon{lexiconStructNouns = insertR (guessNounPlural item) m lexiconStructNouns}
+ ScanRelationSymbol item m ->
+ extendLexicon scans lexicon{lexiconRelationSymbols = insertR item m lexiconRelationSymbols}
+ ScanFunctionSymbol item m ->
+ if any (item `HM.member`) lexiconMixfix
+ then extendLexicon scans lexicon
+ else extendLexicon scans lexicon{lexiconMixfix = Seq.adjust (HM.insert item (NonAssoc, m)) 9 lexiconMixfix}
+ ScanStructOp op ->
+ extendLexicon scans lexicon{lexiconStructFun = insertR (StructSymbol op) (Marker op) lexiconStructFun}
+ ScanPrefixPredicate tok m ->
+ extendLexicon scans lexicon{lexiconPrefixPredicates = insertR tok m lexiconPrefixPredicates}