Diffstat (limited to 'source/Syntax/Adapt.hs')
 -rw-r--r--   source/Syntax/Adapt.hs   382
 1 files changed, 382 insertions, 0 deletions
diff --git a/source/Syntax/Adapt.hs b/source/Syntax/Adapt.hs
new file mode 100644
index 0000000..0896c08
--- /dev/null
+++ b/source/Syntax/Adapt.hs
@@ -0,0 +1,382 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE ApplicativeDo #-}
+{-# LANGUAGE RecordWildCards #-}
+
+module Syntax.Adapt where
+
+import Base
+import Syntax.Abstract
+import Syntax.Lexicon
+import Syntax.Token
+
+import Data.Map.Strict qualified as Map
+import Data.Maybe (catMaybes)
+import Data.Set qualified as Set
+import Data.Sequence qualified as Seq
+import Data.HashSet qualified as HS
+import Data.HashMap.Strict qualified as HM
+import Data.Text qualified as Text
+import Text.Earley.Mixfix (Associativity(..))
+import Text.Regex.Applicative
+import Text.Megaparsec.Pos
+
+
+scanChunk :: [Located Token] -> [ScannedLexicalItem]
+scanChunk ltoks =
+  let toks = unLocated <$> ltoks
+      matchOrErr re env pos = match re toks ?? error ("could not find lexical pattern in " <> env <> " at " <> sourcePosPretty pos)
+  in case ltoks of
+    Located pos (BeginEnv "definition") : _ -> matchOrErr definition "definition" pos
+    Located pos (BeginEnv "abbreviation") : _ -> matchOrErr abbreviation "abbreviation" pos
+    Located pos (BeginEnv "struct") : _ -> matchOrErr struct "struct definition" pos
+    Located pos (BeginEnv "inductive") : _ -> matchOrErr inductive "inductive definition" pos
+    _ -> []
+
+adaptChunks :: [[Located Token]] -> Lexicon -> Lexicon
+adaptChunks = extendLexicon . concatMap scanChunk
+
+data ScannedLexicalItem
+  = ScanAdj LexicalPhrase Marker
+  | ScanFun LexicalPhrase Marker
+  | ScanNoun LexicalPhrase Marker
+  | ScanStructNoun LexicalPhrase Marker
+  | ScanVerb LexicalPhrase Marker
+  | ScanRelationSymbol RelationSymbol Marker
+  | ScanFunctionSymbol FunctionSymbol Marker
+  | ScanPrefixPredicate PrefixPredicate Marker
+  | ScanStructOp Text -- we can use the command text as export name.
+  deriving (Show, Eq)
+
+skipUntilNextLexicalEnv :: RE Token [Token]
+skipUntilNextLexicalEnv = many (psym otherToken)
+  where
+    otherToken tok = tok /= BeginEnv "definition" && tok /= BeginEnv "struct" && tok /= BeginEnv "abbreviation"
+
+notEndOfLexicalEnvToken :: RE Token Token
+notEndOfLexicalEnvToken = psym innerToken
+  where
+    innerToken tok = tok /= EndEnv "definition" && tok /= EndEnv "struct" && tok /= EndEnv "abbreviation"
+
+definition :: RE Token [ScannedLexicalItem]
+definition = do
+  sym (BeginEnv "definition")
+  few notEndOfLexicalEnvToken
+  m <- label
+  few anySym
+  lexicalItem <- head
+  few anySym
+  sym (EndEnv "definition")
+  skipUntilNextLexicalEnv
+  pure [lexicalItem m]
+
+abbreviation :: RE Token [ScannedLexicalItem]
+abbreviation = do
+  sym (BeginEnv "abbreviation")
+  few anySym
+  m <- label
+  few anySym
+  lexicalItem <- head
+  few anySym
+  sym (EndEnv "abbreviation")
+  skipUntilNextLexicalEnv
+  pure [lexicalItem m]
+
+label :: RE Token Marker
+label = msym \case
+  Label m -> Just (Marker m)
+  _ -> Nothing
+
+-- | 'RE' that matches the head of a definition.
+head :: RE Token (Marker -> ScannedLexicalItem)
+-- Note that @<|>@ is left-biased for 'RE', so we can just
+-- place 'adj' before 'verb' and do not have to worry about
+-- overlapping patterns.
+head = ScanNoun <$> noun
+  <|> ScanAdj <$> adj
+  <|> ScanVerb <$> verb
+  <|> ScanFun <$> fun
+  <|> ScanRelationSymbol <$> relationSymbol
+  <|> ScanFunctionSymbol <$> functionSymbol
+  <|> ScanPrefixPredicate <$> prefixPredicate
+
+inductive :: RE Token [ScannedLexicalItem]
+inductive = do
+  sym (BeginEnv "inductive")
+  few notEndOfLexicalEnvToken
+  m <- label
+  few anySym
+  lexicalItem <- functionSymbolInductive
+  few anySym
+  sym (EndEnv "inductive")
+  skipUntilNextLexicalEnv
+  pure [ScanFunctionSymbol lexicalItem m]
+
+struct :: RE Token [ScannedLexicalItem]
+struct = do
+  sym (BeginEnv "struct")
+  few anySym
+  m <- label
+  few anySym
+  lexicalItem <- ScanStructNoun . toLexicalPhrase <$> (an *> structPat <* var)
+  few anySym
+  lexicalItems <- structOps <|> pure []
+  sym (EndEnv "struct")
+  skipUntilNextLexicalEnv
+  pure (lexicalItem m : lexicalItems)
+
+structOps :: RE Token [ScannedLexicalItem]
+structOps = do
+  sym (BeginEnv "enumerate")
+  lexicalItems <- many structOp
+  sym (EndEnv "enumerate")
+  few anySym
+  pure lexicalItems
+
+structOp :: RE Token ScannedLexicalItem
+structOp = do
+  sym (Command "item")
+  op <- math command
+  pure (ScanStructOp op)
+
+noun :: RE Token LexicalPhrase
+noun = toLexicalPhrase <$> (var *> is *> an *> pat <* iff)
+
+adj :: RE Token LexicalPhrase
+adj = toLexicalPhrase <$> (var *> is *> pat <* iff)
+
+verb :: RE Token LexicalPhrase
+verb = toLexicalPhrase <$> (var *> pat <* iff)
+
+fun :: RE Token LexicalPhrase
+fun = toLexicalPhrase <$> (the *> pat <* (is <|> comma))
+
+relationSymbol :: RE Token RelationSymbol
+relationSymbol = math relator' <* iff
+  where
+    relator' = do
+      varSymbol
+      rel <- symbol
+      varSymbol
+      pure rel
+
+functionSymbol :: RE Token FunctionSymbol
+functionSymbol = do
+  sym (BeginEnv "math")
+  toks <- few nonDefinitionKeyword
+  sym (Symbol "=")
+  pure (fromToken <$> toks)
+  where
+    fromToken = \case
+      Variable _ -> Nothing -- Variables become slots.
+      tok -> Just tok -- Everything else is part of the pattern.
+
+functionSymbolInductive :: RE Token FunctionSymbol
+functionSymbolInductive = do
+  sym (BeginEnv "math")
+  toks <- few nonDefinitionKeyword
+  sym (Command "subseteq")
+  pure (fromToken <$> toks)
+  where
+    fromToken = \case
+      Variable _ -> Nothing -- Variables become slots.
+      tok -> Just tok -- Everything else is part of the pattern.
+
+prefixPredicate :: RE Token PrefixPredicate
+prefixPredicate = math prfx <* iff
+  where
+    prfx = do
+      r <- command
+      args <- many (sym InvisibleBraceL *> varSymbol <* sym InvisibleBraceR)
+      pure (PrefixPredicate r (length args))
+
+
+command :: RE Token Text
+command = msym \case
+  Command cmd -> Just cmd
+  _ -> Nothing
+
+var :: RE Token Token
+var = math varSymbol
+
+varSymbol :: RE Token Token
+varSymbol = psym isVar
+
+
+nonDefinitionKeyword :: RE Token Token
+nonDefinitionKeyword = psym (`notElem` keywords)
+  where
+    keywords =
+      [ Word "if"
+      , Word "iff"
+      , Symbol "="
+      , Command "iff"
+      ]
+
+
+pat :: RE Token [Token]
+pat = many (psym isLexicalPhraseToken <|> var)
+
+structPat :: RE Token [Token]
+structPat = many (psym isLexicalPhraseToken)
+
+math :: RE Token a -> RE Token a
+math re = sym (BeginEnv "math") *> re <* sym (EndEnv "math")
+
+-- | We allow /conditional perfection/: the first /@if@/ in a definition is interpreted as /@iff@/.
+iff :: RE Token ()
+iff = void (sym (Word "if")) -- Using @void@ is faster (only requires recognition).
+  <|> void (sym (Word "iff"))
+  <|> void (string [Word "if", Word "and", Word "only", Word "if"])
+  <|> void (sym (Word "denote"))
+  <|> void (sym (Word "stand") *> sym (Word "for"))
+{-# INLINE iff #-}
+
+an :: RE Token ()
+an = void (sym (Word "a"))
+  <|> void (sym (Word "an"))
+{-# INLINE an #-}
+
+is :: RE Token ()
+is = void (sym (Word "is") <|> sym (Word "denotes"))
+{-# INLINE is #-}
+
+the :: RE Token ()
+the = void (sym (Word "the"))
+{-# INLINE the #-}
+
+comma :: RE Token ()
+comma = void (sym (Symbol ","))
+{-# INLINE comma #-}
+
+
+isVar :: Token -> Bool
+isVar = \case
+  Variable _ -> True
+  _token -> False
+
+isCommand :: Token -> Bool
+isCommand = \case
+  Command _ -> True
+  _token -> False
+
+isNotDefnToken :: Token -> Bool
+isNotDefnToken = \case
+  BeginEnv "definition" -> False
+  EndEnv "definition" -> False
+  _token -> True
+
+isLexicalPhraseToken :: Token -> Bool
+isLexicalPhraseToken = \case
+  Word w -> w `Set.notMember` keywords
+  --
+  -- Simple commands (outside of math-mode) are allowed. This is useful
+  -- for defining lexical phrases containing symbolic expressions such as
+  -- `X is \Ttwo{}`, where `\Ttwo` is a macro that expands to `T_2`.
+  -- We also allow these macros to take arguments, hence the need to
+  -- allow grouping delimiters. They can also be used to escape the end
+  -- of the command for correct spacing, as in the above example.
+  --
+  Command _cmd -> True
+  InvisibleBraceL -> True
+  InvisibleBraceR -> True
+  --
+  -- No other tokens may occur in lexical phrases. In particular, no `_dot`
+  -- token may occur, limiting the lexical phrase to a single sentence.
+  -- Commas occurring in variable lists should be placed
+  -- within the math environment. Thus `$a,b$ are coprime iff`,
+  -- not `$a$, $b$ are coprime iff`.
+  --
+  _token -> False
+  where
+    keywords = Set.fromList ["a", "an", "is", "are", "if", "iff", "denote", "stand", "let"]
+
+
+toLexicalPhrase :: [Token] -> LexicalPhrase
+toLexicalPhrase toks = component <$> toks
+  where
+    component = \case
+      Variable _ -> Nothing
+      tok -> Just tok
+
+
+symbol :: RE Token Token
+symbol = msym $ \tok -> case tok of
+  Command _ -> Just tok
+  Symbol _ -> Just tok
+  _tok -> Nothing
+
+
+-- | Basic paradigms for pluralizations of nominals.
+guessNounPlural :: LexicalPhrase -> SgPl LexicalPhrase
+guessNounPlural item = SgPl item (pluralize item)
+  where
+    pluralize :: LexicalPhrase -> LexicalPhrase
+    pluralize = \case
+      Just (Word w) : pat'@(Just w' : _) | isPreposition w' -> Just (Word (Text.snoc w 's')) : pat'
+      [Just (Word w)] -> [Just (Word (Text.snoc w 's'))]
+      [tok, Just (Word w)] -> [tok, Just (Word (Text.snoc w 's'))]
+      [tok, tok', Just (Word w)] -> [tok, tok', Just (Word (Text.snoc w 's'))]
+      pat' -> pat'
+
+guessVerbPlural :: LexicalPhrase -> SgPl LexicalPhrase
+guessVerbPlural item = SgPl item itemPl
+  where
+    itemPl = case item of
+      Just (Word v) : rest -> case Text.unsnoc v of
+        Just (v', 's') -> Just (Word v') : rest
+        _ -> item
+      _ -> item
+
+isAdjR :: LexicalPhrase -> Bool
+isAdjR item = containsPreposition item || containsSlot item
+  where
+    containsPreposition, containsSlot :: LexicalPhrase -> Bool
+    containsPreposition = any isPreposition . catMaybes
+    containsSlot = (Nothing `elem`)
+
+isPreposition :: Token -> Bool
+isPreposition w = HS.member w (HS.map Word prepositions)
+
+-- | Right-biased set insertion, keeping the original set
+-- when inserting already present elements. @insertR@ is unfortunately
+-- a hidden function, even in @Data.Set.Internal@, so we approximate it
+-- here. In theory one could avoid the indirection of first forming the
+-- singleton set on the rhs.
+insertR' :: Hashable a => a -> HashSet a -> HashSet a
+insertR' x xs = xs `HS.union` HS.singleton x -- @union@ is left-biased.
+
+insertMapR :: Ord k => k -> a -> Map k a -> Map k a
+insertMapR k x xs = xs `Map.union` Map.singleton k x -- @union@ is left-biased.
+
+
+insertR :: Hashable k => k -> a -> HashMap k a -> HashMap k a
+insertR k x xs = xs `HM.union` HM.singleton k x -- @union@ is left-biased.
+
+-- | Takes the scanned lexical phrases and inserts them in the correct
+-- places in a lexicon.
+extendLexicon :: [ScannedLexicalItem] -> Lexicon -> Lexicon
+extendLexicon [] lexicon = lexicon
+-- Note that we only consider 'sg' in the 'Ord' instance of SgPl, so that
+-- known irregular plurals are preserved.
+extendLexicon (scan : scans) lexicon@Lexicon{..} = case scan of
+  ScanAdj item m -> if isAdjR item
+    then extendLexicon scans lexicon{lexiconAdjRs = insertR item m lexiconAdjRs}
+    else extendLexicon scans lexicon{lexiconAdjLs = insertR item m lexiconAdjLs}
+  ScanFun item m ->
+    extendLexicon scans lexicon{lexiconFuns = insertR (guessNounPlural item) m lexiconFuns}
+  ScanVerb item m ->
+    extendLexicon scans lexicon{lexiconVerbs = insertR (guessVerbPlural item) m lexiconVerbs}
+  ScanNoun item m ->
+    extendLexicon scans lexicon{lexiconNouns = insertR (guessNounPlural item) m lexiconNouns}
+  ScanStructNoun item m ->
+    extendLexicon scans lexicon{lexiconStructNouns = insertR (guessNounPlural item) m lexiconStructNouns}
+  ScanRelationSymbol item m ->
+    extendLexicon scans lexicon{lexiconRelationSymbols = insertR item m lexiconRelationSymbols}
+  ScanFunctionSymbol item m ->
+    if any (item `HM.member`) lexiconMixfix
+      then extendLexicon scans lexicon
+      else extendLexicon scans lexicon{lexiconMixfix = Seq.adjust (HM.insert item (NonAssoc, m)) 9 lexiconMixfix}
+  ScanStructOp op ->
+    extendLexicon scans lexicon{lexiconStructFun = insertR (StructSymbol op) (Marker op) lexiconStructFun}
+  ScanPrefixPredicate tok m ->
+    extendLexicon scans lexicon{lexiconPrefixPredicates = insertR tok m lexiconPrefixPredicates}
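
Editor's note (not part of the patch): the sketch below shows how scanChunk could be exercised on a hand-assembled chunk. It assumes the imports of Syntax.Adapt are in scope, that Located pairs a source position with a payload exactly as it is pattern-matched above, and that the lexer would tokenize a definition roughly as spelled out below; the file name "example.tex" and the label "even" are made up.

-- Sketch only: a hand-built chunk for a definition reading
-- "\begin{definition}\label{even} $x$ is even if ... \end{definition}".
-- The exact token stream produced by the real lexer is an assumption here.
exampleChunk :: [Located Token]
exampleChunk = Located (initialPos "example.tex") <$>
  [ BeginEnv "definition"
  , Label "even"                                 -- becomes the Marker of the scanned item
  , BeginEnv "math", Variable "x", EndEnv "math"
  , Word "is", Word "even"
  , Word "if"                                    -- conditional perfection: read as "iff"
  , EndEnv "definition"
  ]

-- 'head' tries 'noun' first, which fails for lack of an article, and then
-- succeeds with 'adj'. The result is expected to be
--   [ScanAdj [Just (Word "even")] (Marker "even")].
exampleItems :: [ScannedLexicalItem]
exampleItems = scanChunk exampleChunk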

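Editor's note (not part of the patch): the pluralization heuristics above only ever touch one word, namely the last word of the phrase or the word directly before a preposition. A rough illustration of the intended behaviour, assuming that "of" belongs to the prepositions set imported via Base and that LexicalPhrase is a plain list of Maybe Token with Nothing marking an argument slot:

-- Sketch only: expected outputs of the pluralization heuristics.
nounExamples :: [SgPl LexicalPhrase]
nounExamples =
  [ guessNounPlural [Just (Word "subgroup"), Just (Word "of"), Nothing]
    -- plural form: [Just (Word "subgroups"), Just (Word "of"), Nothing]
    -- (the word before the preposition is pluralized, the slot is untouched)
  , guessNounPlural [Just (Word "prime"), Just (Word "number")]
    -- plural form: [Just (Word "prime"), Just (Word "numbers")]
  ]

verbExample :: SgPl LexicalPhrase
verbExample = guessVerbPlural [Just (Word "divides"), Nothing]
  -- plural form: [Just (Word "divide"), Nothing] (trailing 's' dropped)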
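Editor's note (not part of the patch): the right bias of the insertion helpers is what keeps rescanning idempotent; an entry already present in the lexicon wins over a freshly scanned one, so the original marker is preserved. A minimal sketch of that property, using Text keys and values instead of the lexicon types:

-- Sketch only: HM.union is left-biased, so insertR keeps the existing entry.
rightBiasHolds :: Bool
rightBiasHolds =
  let existing = HM.singleton ("phrase" :: Text) ("original marker" :: Text)
      updated  = insertR "phrase" "rescanned marker" existing
  in HM.lookup "phrase" updated == Just "original marker"  -- evaluates to True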