From 442d732696ad431b84f6e5c72b6ee785be4fd968 Mon Sep 17 00:00:00 2001 From: adelon <22380201+adelon@users.noreply.github.com> Date: Sat, 10 Feb 2024 02:22:14 +0100 Subject: Initial commit --- source/Syntax/Abstract.hs | 468 ++++++++++++++++++++++++++ source/Syntax/Adapt.hs | 382 +++++++++++++++++++++ source/Syntax/Chunk.hs | 46 +++ source/Syntax/Concrete.hs | 657 +++++++++++++++++++++++++++++++++++++ source/Syntax/Concrete/Keywords.hs | 222 +++++++++++++ source/Syntax/Import.hs | 43 +++ source/Syntax/Internal.hs | 612 ++++++++++++++++++++++++++++++++++ source/Syntax/LexicalPhrase.hs | 93 ++++++ source/Syntax/Lexicon.hs | 275 ++++++++++++++++ source/Syntax/Token.hs | 438 +++++++++++++++++++++++++ 10 files changed, 3236 insertions(+) create mode 100644 source/Syntax/Abstract.hs create mode 100644 source/Syntax/Adapt.hs create mode 100644 source/Syntax/Chunk.hs create mode 100644 source/Syntax/Concrete.hs create mode 100644 source/Syntax/Concrete/Keywords.hs create mode 100644 source/Syntax/Import.hs create mode 100644 source/Syntax/Internal.hs create mode 100644 source/Syntax/LexicalPhrase.hs create mode 100644 source/Syntax/Lexicon.hs create mode 100644 source/Syntax/Token.hs (limited to 'source/Syntax') diff --git a/source/Syntax/Abstract.hs b/source/Syntax/Abstract.hs new file mode 100644 index 0000000..6f30612 --- /dev/null +++ b/source/Syntax/Abstract.hs @@ -0,0 +1,468 @@ +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE StandaloneDeriving #-} + + +-- | Data types for the abstract syntax tree and helper functions +-- for constructing the lexicon. 
+-- +module Syntax.Abstract + ( module Syntax.Abstract + , module Syntax.LexicalPhrase + , module Syntax.Token + ) where + + +import Base +import Syntax.LexicalPhrase (LexicalPhrase, SgPl(..), unsafeReadPhraseSgPl, unsafeReadPhrase) +import Syntax.Token (Token(..)) + +import Text.Earley.Mixfix (Holey) +import Data.Text qualified as Text +import Text.Megaparsec.Pos (SourcePos) + +-- | Local "variable-like" symbols that can be captured by binders. +data VarSymbol + = NamedVar Text -- ^ A named variable. + | FreshVar Int -- ^ A nameless (implicit) variable. Should only come from desugaring. + deriving (Show, Eq, Ord, Generic, Hashable) + +instance IsString VarSymbol where + fromString v = NamedVar $ Text.pack v + + +data Expr + = ExprVar VarSymbol + | ExprInteger Int + | ExprOp FunctionSymbol [Expr] + | ExprStructOp StructSymbol (Maybe Expr) + | ExprFiniteSet (NonEmpty Expr) + | ExprSep VarSymbol Expr Stmt + -- ^ Of the form /@{x ∈ X | P(x)}@/. + | ExprReplace Expr (NonEmpty (VarSymbol,Expr)) (Maybe Stmt) + -- ^ E.g.: /@{ f(x, y) | x ∈ X, y ∈ Y | P(x, y) }@/. + | ExprReplacePred VarSymbol VarSymbol Expr Stmt + -- ^ E.g.: /@{ y | \\exists x\\in X. P(x, y) }@/. + deriving (Show, Eq, Ord) + + +type FunctionSymbol = Holey Token + +type RelationSymbol = Token + +newtype StructSymbol = StructSymbol { unStructSymbol :: Text } deriving newtype (Show, Eq, Ord, Hashable) + +-- | The predefined @cons@ function symbol used for desugaring finite set expressions. +pattern ConsSymbol :: FunctionSymbol +pattern ConsSymbol = [Just (Command "cons"), Just InvisibleBraceL, Nothing, Just InvisibleBraceR, Just InvisibleBraceL, Nothing, Just InvisibleBraceR] + +-- | The predefined @pair@ function symbol used for desugaring tuple notation.. 
+pattern PairSymbol :: FunctionSymbol +pattern PairSymbol = [Just (Command "pair"), Just InvisibleBraceL, Nothing, Just InvisibleBraceR, Just InvisibleBraceL, Nothing, Just InvisibleBraceR] + +-- | Function application /@f(x)@/ desugars to /@\apply{f}{x}@/. +pattern ApplySymbol :: FunctionSymbol +pattern ApplySymbol = [Just (Command "apply"), Just InvisibleBraceL, Nothing, Just InvisibleBraceR, Just InvisibleBraceL, Nothing, Just InvisibleBraceR] + +pattern DomSymbol :: FunctionSymbol +pattern DomSymbol = [Just (Command "dom"), Just InvisibleBraceL, Nothing, Just InvisibleBraceR] + +pattern CarrierSymbol :: StructSymbol +pattern CarrierSymbol = StructSymbol "carrier" + +pattern ExprConst :: Token -> Expr +pattern ExprConst c = ExprOp [Just c] [] + +pattern ExprApp :: Expr -> Expr -> Expr +pattern ExprApp e1 e2 = ExprOp ApplySymbol [e1, e2] + +pattern ExprPair :: Expr -> Expr -> Expr +pattern ExprPair e1 e2 = ExprOp PairSymbol [e1, e2] + +-- | Tuples are interpreted as nested pairs: +-- the triple /@(a, b, c)@/ is interpreted as +-- /@(a, (b, c))@/. +-- This means that the product operation should also +-- be right associative, so that /@(a, b, c)@/ can +-- form elements of /@A\times B\times C@/. 
+makeTuple :: NonEmpty Expr -> Expr +makeTuple = foldr1 ExprPair + + +data Chain + = ChainBase (NonEmpty Expr) Sign Relation (NonEmpty Expr) + | ChainCons (NonEmpty Expr) Sign Relation Chain + deriving (Show, Eq, Ord) + +data Relation + = RelationSymbol RelationSymbol -- ^ E.g.: /@x \in X@/ + | RelationExpr Expr -- ^ E.g.: /@x \mathrel{R} y@/ + deriving (Show, Eq, Ord) + +data Sign = Positive | Negative deriving (Show, Eq, Ord) + +data Formula + = FormulaChain Chain + | FormulaPredicate PrefixPredicate (NonEmpty Expr) + | Connected Connective Formula Formula + | FormulaNeg Formula + | FormulaQuantified Quantifier (NonEmpty VarSymbol) Bound Formula + | PropositionalConstant PropositionalConstant + deriving (Show, Eq, Ord) + +data PropositionalConstant = IsBottom | IsTop + deriving (Show, Eq, Ord, Generic, Hashable) + +data PrefixPredicate + = PrefixPredicate Text Int + deriving (Show, Eq, Ord, Generic, Hashable) + + +data Connective + = Conjunction + | Disjunction + | Implication + | Equivalence + | ExclusiveOr + | NegatedDisjunction + deriving (Show, Eq, Ord, Generic, Hashable) + + + +makeConnective :: Holey Token -> [Formula] -> Formula +makeConnective [Nothing, Just (Command "implies"), Nothing] [f1, f2] = Connected Implication f1 f2 +makeConnective [Nothing, Just (Command "land"), Nothing] [f1, f2] = Connected Conjunction f1 f2 +makeConnective [Nothing, Just (Command "lor"), Nothing] [f1, f2] = Connected Disjunction f1 f2 +makeConnective [Nothing, Just (Command "iff"), Nothing] [f1, f2] = Connected Equivalence f1 f2 +makeConnective [Just(Command "lnot"), Nothing] [f1] = FormulaNeg f1 +makeConnective pat _ = error ("makeConnective does not handle the following connective correctly: " <> show pat) + + + +type StructPhrase = SgPl LexicalPhrase + +-- | For example 'an integer' would be +-- > Noun (unsafeReadPhrase "integer[/s]") [] +type Noun = NounOf Term +data NounOf a + = Noun (SgPl LexicalPhrase) [a] + deriving (Show, Eq, Ord) + + + + +type NounPhrase t = 
NounPhraseOf t Term +-- NOTE: 'NounPhraseOf' is only used with arguments of type 'Term', +-- but keeping the argument parameter @a@ allows the 'Show' and 'Eq' +-- instances to remain decidable. +data NounPhraseOf t a + = NounPhrase [AdjLOf a] (NounOf a) (t VarSymbol) [AdjROf a] (Maybe Stmt) + +instance (Show a, Show (t VarSymbol)) => Show (NounPhraseOf t a) where + show (NounPhrase ls n vs rs ms) = + "NounPhrase (" + <> show ls <> ") (" + <> show n <> ") (" + <> show vs <> ") (" + <> show rs <> ") (" + <> show ms <> ")" + +instance (Eq a, Eq (t VarSymbol)) => Eq (NounPhraseOf t a) where + NounPhrase ls n vs rs ms == NounPhrase ls' n' vs' rs' ms' = + ls == ls' && n == n' && vs == vs' && rs == rs' && ms == ms' + +-- This is arbitrary and ugly, but it's useful to have a somewhat +-- usable Ord instance for all raw syntax (e.g. for 'nubOrd'). +instance (Ord a, Ord (t VarSymbol)) => Ord (NounPhraseOf t a) where + NounPhrase ls n vs rs ms `compare` NounPhrase ls' n' vs' rs' ms' = + case compare n n' of + GT -> case compare ls ls' of + GT -> case compare rs rs' of + GT -> case compare ms ms' of + GT -> compare vs vs' + ordering -> ordering + ordering -> ordering + ordering -> ordering + ordering -> ordering + +-- | @Nameless a@ is quivalent to @Const () a@ (from "Data.Functor.Const"). +-- It describes a container that is unwilling to actually contain something. +-- @Nameless@ lets us treat nouns with no names, one name, or many names uniformly. +-- Thus @NounPhraseOf Nameless a@ is a noun phrase without a name and with arguments +-- of type @a@. +data Nameless a = Nameless deriving (Show, Eq, Ord) + + +-- | Left adjectives modify nouns from the left side, +-- e.g. /@even@/, /@continuous@/, and /@σ-finite@/. +type AdjL = AdjLOf Term +data AdjLOf a + = AdjL LexicalPhrase [a] + deriving (Show, Eq, Ord) + + +-- | Right attributes consist of basic right adjectives, e.g. 
+-- /@divisible by ?@/, or /@of finite type@/ and verb phrases +-- marked with /@that@/, such as /@integer that divides n@/. +-- In some cases these right attributes may be followed +-- by an additional such-that phrase. +type AdjR = AdjROf Term +data AdjROf a + = AdjR LexicalPhrase [a] + | AttrRThat VerbPhrase + deriving (Show, Eq, Ord) + + +-- | Adjectives for parts of the AST where adjectives are not used +-- to modify nouns and the L/R distinction does not matter, such as +-- when then are used together with a copula (like /@n is even@/). +type Adj = AdjOf Term +data AdjOf a + = Adj LexicalPhrase [a] + deriving (Show, Eq, Ord) + + +type Verb = VerbOf Term +data VerbOf a + = Verb (SgPl LexicalPhrase) [a] + deriving (Show, Eq, Ord) + + +type Fun = FunOf Term +data FunOf a + = Fun (SgPl LexicalPhrase) [a] + deriving (Show, Eq, Ord) + + +type VerbPhrase = VerbPhraseOf Term +data VerbPhraseOf a + = VPVerb (VerbOf a) + | VPAdj (NonEmpty (AdjOf a)) -- ^ @x is foo@ / @x is foo and bar@ + | VPVerbNot (VerbOf a) + | VPAdjNot (NonEmpty (AdjOf a)) -- ^ @x is not foo@ / @x is neither foo nor bar@ + deriving (Show, Eq, Ord) + + +data Quantifier + = Universally + | Existentially + | Nonexistentially + deriving (Show, Eq, Ord) + +data QuantPhrase = QuantPhrase Quantifier (NounPhrase []) deriving (Show, Eq, Ord) + + +data Term + = TermExpr Expr + -- ^ A symbolic expression. + | TermFun Fun + -- ^ Definite noun phrase, e.g. /@the derivative of $f$@/. + | TermIota VarSymbol Stmt + -- ^ Definite descriptor, e.g. /@an $x$ such that ...@// + | TermQuantified Quantifier (NounPhrase Maybe) + -- ^ Indefinite quantified notion, e.g. /@every even integer that divides $k$ ...@/. + deriving (Show, Eq, Ord) + + +data Stmt + = StmtFormula Formula -- ^ E.g.: /@We have \@/. + | StmtVerbPhrase (NonEmpty Term) VerbPhrase -- ^ E.g.: /@\ and \ \@/. + | StmtNoun Term (NounPhrase Maybe) -- ^ E.g.: /@\ is a(n) \@/. 
+ | StmtStruct Term StructPhrase + | StmtNeg Stmt -- ^ E.g.: /@It is not the case that \@/. + | StmtExists (NounPhrase []) -- ^ E.g.: /@There exists a(n) \@/. + | StmtConnected Connective Stmt Stmt + | StmtQuantPhrase QuantPhrase Stmt + | SymbolicQuantified Quantifier (NonEmpty VarSymbol) Bound (Maybe Stmt) Stmt + deriving (Show, Eq, Ord) + +data Bound = Unbounded | Bounded Sign Relation Expr deriving (Show, Eq, Ord) + +pattern SymbolicForall :: NonEmpty VarSymbol -> Bound -> Maybe Stmt -> Stmt -> Stmt +pattern SymbolicForall vs bound suchThat have = SymbolicQuantified Universally vs bound suchThat have + +pattern SymbolicExists :: NonEmpty VarSymbol -> Bound -> Stmt -> Stmt +pattern SymbolicExists vs bound suchThat = SymbolicQuantified Existentially vs bound Nothing suchThat + +pattern SymbolicNotExists :: NonEmpty VarSymbol -> Bound -> Stmt -> Stmt +pattern SymbolicNotExists vs bound suchThat = StmtNeg (SymbolicExists vs bound suchThat) + +data Asm + = AsmSuppose Stmt + | AsmLetNoun (NonEmpty VarSymbol) (NounPhrase Maybe) -- ^ E.g.: /@let k be an integer@/ + | AsmLetIn (NonEmpty VarSymbol) Expr -- ^ E.g.: /@let $k\in\integers$@/ + | AsmLetThe VarSymbol Fun -- ^ E.g.: /@let $g$ be the derivative of $f$@/ + | AsmLetEq VarSymbol Expr -- ^ E.g.: /@let $m = n + k$@/ + | AsmLetStruct VarSymbol StructPhrase -- ^ E.g.: /@let $A$ be a monoid@/ + deriving (Show, Eq, Ord) + +data Axiom = Axiom [Asm] Stmt + deriving (Show, Eq, Ord) + +data Lemma = Lemma [Asm] Stmt + deriving (Show, Eq, Ord) + +-- | The head of the definition describes the part before the /@iff@/, +-- i.e. the definiendum. An optional noun-phrase corresponds to an optional +-- type annotation for the 'Term' of the head. The last part of the head +-- is the lexical phrase that is defined. +-- +-- > "A natural number $n$ divides $m$ iff ..." 
+-- > ^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^ ^^^ +-- > type annotation variable verb definiens +-- > (a noun phrase) (all args are vars) (a statement) +-- +data DefnHead + = DefnAdj (Maybe (NounPhrase Maybe)) VarSymbol (AdjOf VarSymbol) + | DefnVerb (Maybe (NounPhrase Maybe)) VarSymbol (VerbOf VarSymbol) + | DefnNoun VarSymbol (NounOf VarSymbol) + | DefnSymbolicPredicate PrefixPredicate (NonEmpty VarSymbol) + | DefnRel VarSymbol RelationSymbol VarSymbol + -- ^ E.g.: /@$x \subseteq y$ iff [...@/ + deriving (Show, Eq, Ord) + +data Defn + = Defn [Asm] DefnHead Stmt + | DefnFun [Asm] (FunOf VarSymbol) (Maybe Term) Term + -- ^ A 'DefnFun' consists of the functional noun (which must start with /@the@/) + -- and an optional specification of a symbolic equivalent. The symbolic equivalent + -- does not need to have the same variables as the full functional noun pattern. + -- + -- > "The tensor product of $U$ and $V$ over $K$, $U\tensor V$, is ..." + -- > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^ ^^^ + -- > definiendum symbolic eqv. definiens + -- > (a functional noun) (an exression) (a term) + -- + | DefnOp SymbolPattern Expr + deriving (Show, Eq, Ord) + +data Proof + = Omitted + | Qed Justification + -- ^ Ends of a proof, leaving automation to discharge the current goal using the given justification. + | ByCase [Case] + | ByContradiction Proof + | BySetInduction (Maybe Term) Proof + -- ^ ∈-induction. + | ByOrdInduction Proof + -- ^ Transfinite induction for ordinals. + | Assume Stmt Proof + | FixSymbolic (NonEmpty VarSymbol) Bound Proof + | FixSuchThat (NonEmpty VarSymbol) Stmt Proof + | Calc Calc Proof + -- ^ Simplify goals that are implications or disjunctions. + | TakeVar (NonEmpty VarSymbol) Bound Stmt Justification Proof + | TakeNoun (NounPhrase []) Justification Proof + | Have (Maybe Stmt) Stmt Justification Proof + -- ^ /@Since \, we have \ by \.@/ + | Suffices Stmt Justification Proof + -- ^ /@It suffices to show that [...]. 
[...]@/ + | Subclaim Stmt Proof Proof + -- ^ A claim is a sublemma with its own proof: + -- /@Show \. \. \.@/ + | Define VarSymbol Expr Proof + -- ^ Local definition. + -- + | DefineFunction VarSymbol VarSymbol Expr VarSymbol Expr Proof + -- ^ Local function definition, e.g. /@Let $f(x) = e$ for $x\\in d$@/. + -- The first 'VarSymbol' is the newly defined symbol, the second one is the argument. + -- The first 'Expr' is the value, the final variable and expr specify a bound (the domain of the function). + deriving (Show, Eq, Ord) + +-- | An inline justification. +data Justification + = JustificationRef (NonEmpty Marker) + | JustificationSetExt + | JustificationEmpty + | JustificationLocal -- ^ Use only local assumptions + deriving (Show, Eq, Ord) + + +-- | A case of a case split. +data Case = Case + { caseOf :: Stmt + , caseProof :: Proof + } deriving (Show, Eq, Ord) + +data Calc + = Equation Expr (NonEmpty (Expr, Justification)) + -- ^ A chain of equalities. Each claimed equality has a (potentially empty) justification. + -- For example: @a &= b \\explanation{by \\cref{a_eq_b}} &= c@ + -- would be (modulo expr constructors) + -- @Equation "a" [("b", JustificationRef "a_eq_b"), ("c", JustificationEmpty)]@. 
+ | Biconditionals Formula (NonEmpty (Formula, Justification)) + deriving (Show, Eq, Ord) + + +data Abbreviation + = AbbreviationAdj VarSymbol (AdjOf VarSymbol) Stmt + | AbbreviationVerb VarSymbol (VerbOf VarSymbol) Stmt + | AbbreviationNoun VarSymbol (NounOf VarSymbol) Stmt + | AbbreviationRel VarSymbol RelationSymbol VarSymbol Stmt + | AbbreviationFun (FunOf VarSymbol) Term + | AbbreviationEq SymbolPattern Expr + deriving (Show, Eq, Ord) + +data Datatype + = DatatypeFin (NounOf Term) (NonEmpty Text) + deriving (Show, Eq, Ord) + +data Inductive = Inductive + { inductiveSymbolPattern :: SymbolPattern + , inductiveDomain :: Expr + , inductiveIntros :: NonEmpty IntroRule + } + deriving (Show, Eq, Ord) + +data IntroRule = IntroRule + { introConditions :: [Formula] -- The inductively defined set may only appear as an argument of monotone operations on the rhs. + , introResult :: Formula -- TODO Refine. + } + deriving (Show, Eq, Ord) + + +data SymbolPattern = SymbolPattern FunctionSymbol [VarSymbol] + deriving (Show, Eq, Ord) + +data Signature + = SignatureAdj VarSymbol (AdjOf VarSymbol) + | SignatureVerb VarSymbol (VerbOf VarSymbol) + | SignatureNoun VarSymbol (NounOf VarSymbol) + | SignatureSymbolic SymbolPattern (NounPhrase Maybe) + -- ^ /@$\(\)$ is a \@/ + deriving (Show, Eq, Ord) + + +data StructDefn = StructDefn + { structPhrase :: StructPhrase + -- ^ E.g.: @partial order@ or @abelian group@.\ + , structParents :: [StructPhrase] + -- ^ Structural parents + , structLabel :: VarSymbol + , structFixes :: [StructSymbol] + -- ^ List of text for commands representing constants not inherited from its parents, + -- e.g.: @\sqsubseteq@ or @\inv@. 
+ , structAssumes :: [(Marker, Stmt)] + } + deriving (Show, Eq, Ord) + +newtype Marker = Marker Text deriving (Show, Eq, Ord, Generic) + +deriving newtype instance Hashable Marker + +instance IsString Marker where + fromString str = Marker (Text.pack str) + +data Block + = BlockAxiom SourcePos Marker Axiom + | BlockLemma SourcePos Marker Lemma + | BlockProof SourcePos Proof + | BlockDefn SourcePos Marker Defn + | BlockAbbr SourcePos Marker Abbreviation + | BlockData SourcePos Datatype + | BlockInductive SourcePos Marker Inductive + | BlockSig SourcePos [Asm] Signature + | BlockStruct SourcePos Marker StructDefn + deriving (Show, Eq, Ord) diff --git a/source/Syntax/Adapt.hs b/source/Syntax/Adapt.hs new file mode 100644 index 0000000..0896c08 --- /dev/null +++ b/source/Syntax/Adapt.hs @@ -0,0 +1,382 @@ +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE ApplicativeDo #-} +{-# LANGUAGE RecordWildCards #-} + +module Syntax.Adapt where + +import Base +import Syntax.Abstract +import Syntax.Lexicon +import Syntax.Token + +import Data.Map.Strict qualified as Map +import Data.Maybe (catMaybes) +import Data.Set qualified as Set +import Data.Sequence qualified as Seq +import Data.HashSet qualified as HS +import Data.HashMap.Strict qualified as HM +import Data.Text qualified as Text +import Text.Earley.Mixfix (Associativity(..)) +import Text.Regex.Applicative +import Text.Megaparsec.Pos + + +scanChunk :: [Located Token] -> [ScannedLexicalItem] +scanChunk ltoks = + let toks = unLocated <$> ltoks + matchOrErr re env pos = match re toks ?? 
error ("could not find lexical pattern in " <> env <> " at " <> sourcePosPretty pos) + in case ltoks of + Located pos (BeginEnv "definition") : _ -> matchOrErr definition "definition" pos + Located pos (BeginEnv "abbreviation") : _ -> matchOrErr abbreviation "abbreviation" pos + Located pos (BeginEnv "struct") :_ -> matchOrErr struct "struct definition" pos + Located pos (BeginEnv "inductive") :_ -> matchOrErr inductive "inductive definition" pos + _ -> [] + +adaptChunks :: [[Located Token]] -> Lexicon -> Lexicon +adaptChunks = extendLexicon . concatMap scanChunk + +data ScannedLexicalItem + = ScanAdj LexicalPhrase Marker + | ScanFun LexicalPhrase Marker + | ScanNoun LexicalPhrase Marker + | ScanStructNoun LexicalPhrase Marker + | ScanVerb LexicalPhrase Marker + | ScanRelationSymbol RelationSymbol Marker + | ScanFunctionSymbol FunctionSymbol Marker + | ScanPrefixPredicate PrefixPredicate Marker + | ScanStructOp Text -- we an use the command text as export name. + deriving (Show, Eq) + +skipUntilNextLexicalEnv :: RE Token [Token] +skipUntilNextLexicalEnv = many (psym otherToken) + where + otherToken tok = tok /= BeginEnv "definition" && tok /= BeginEnv "struct" && tok /= BeginEnv "abbreviation" + +notEndOfLexicalEnvToken :: RE Token Token +notEndOfLexicalEnvToken = psym innerToken + where + innerToken tok = tok /= EndEnv "definition" && tok /= EndEnv "struct" && tok /= EndEnv "abbreviation" + +definition :: RE Token [ScannedLexicalItem] +definition = do + sym (BeginEnv "definition") + few notEndOfLexicalEnvToken + m <- label + few anySym + lexicalItem <- head + few anySym + sym (EndEnv "definition") + skipUntilNextLexicalEnv + pure [lexicalItem m] + +abbreviation :: RE Token [ScannedLexicalItem] +abbreviation = do + sym (BeginEnv "abbreviation") + few anySym + m <- label + few anySym + lexicalItem <- head + few anySym + sym (EndEnv "abbreviation") + skipUntilNextLexicalEnv + pure [lexicalItem m] + +label :: RE Token Marker +label = msym \case + Label m -> Just 
(Marker m)
+  _ -> Nothing
+
+-- | 'RE' that matches the head of a definition.
+head :: RE Token (Marker -> ScannedLexicalItem)
+-- Note that @<|>@ is left-biased for 'RE', so we can just
+-- place 'adj' before 'verb' and do not have to worry about
+-- overlapping patterns.
+head = ScanNoun <$> noun
+  <|> ScanAdj <$> adj
+  <|> ScanVerb <$> verb
+  <|> ScanFun <$> fun
+  <|> ScanRelationSymbol <$> relationSymbol
+  <|> ScanFunctionSymbol <$> functionSymbol
+  <|> ScanPrefixPredicate <$> prefixPredicate
+
+inductive :: RE Token [ScannedLexicalItem]
+inductive = do
+  sym (BeginEnv "inductive")
+  few notEndOfLexicalEnvToken
+  m <- label
+  few anySym
+  lexicalItem <- functionSymbolInductive
+  few anySym
+  sym (EndEnv "inductive")
+  skipUntilNextLexicalEnv
+  pure [ScanFunctionSymbol lexicalItem m]
+
+struct :: RE Token [ScannedLexicalItem]
+struct = do
+  sym (BeginEnv "struct")
+  few anySym
+  m <- label
+  few anySym
+  lexicalItem <- ScanStructNoun . toLexicalPhrase <$> (an *> structPat <* var)
+  few anySym
+  lexicalItems <- structOps <|> pure []
+  sym (EndEnv "struct")
+  skipUntilNextLexicalEnv
+  pure (lexicalItem m : lexicalItems)
+
+structOps :: RE Token [ScannedLexicalItem]
+structOps = do
+  sym (BeginEnv "enumerate")
+  lexicalItems <- many structOp
+  sym (EndEnv "enumerate")
+  few anySym
+  pure lexicalItems
+
+structOp :: RE Token ScannedLexicalItem
+structOp = do
+  sym (Command "item")
+  op <- math command
+  pure (ScanStructOp op)
+
+noun :: RE Token LexicalPhrase
+noun = toLexicalPhrase <$> (var *> is *> an *> pat <* iff)
+
+adj :: RE Token LexicalPhrase
+adj = toLexicalPhrase <$> (var *> is *> pat <* iff)
+
+verb :: RE Token LexicalPhrase
+verb = toLexicalPhrase <$> (var *> pat <* iff)
+
+fun :: RE Token LexicalPhrase
+fun = toLexicalPhrase <$> (the *> pat <* (is <|> comma))
+
+relationSymbol :: RE Token RelationSymbol
+relationSymbol = math relator' <* iff
+  where
+    relator' = do
+      varSymbol
+      rel <- symbol
+      varSymbol
+      pure rel
+
+functionSymbol :: RE Token 
FunctionSymbol +functionSymbol = do + sym (BeginEnv "math") + toks <- few nonDefinitionKeyword + sym (Symbol "=") + pure (fromToken <$> toks) + where + fromToken = \case + Variable _ -> Nothing -- Variables become slots. + tok -> Just tok -- Everything else is part of the pattern. + +functionSymbolInductive :: RE Token FunctionSymbol +functionSymbolInductive = do + sym (BeginEnv "math") + toks <- few nonDefinitionKeyword + sym (Command "subseteq") + pure (fromToken <$> toks) + where + fromToken = \case + Variable _ -> Nothing -- Variables become slots. + tok -> Just tok -- Everything else is part of the pattern. + +prefixPredicate :: RE Token PrefixPredicate +prefixPredicate = math prfx <* iff + where + prfx = do + r <- command + args <- many (sym InvisibleBraceL *> varSymbol <* sym InvisibleBraceR) + pure (PrefixPredicate r (length args)) + + +command :: RE Token Text +command = msym \case + Command cmd -> Just cmd + _ -> Nothing + +var :: RE Token Token +var = math varSymbol + +varSymbol :: RE Token Token +varSymbol = psym isVar + + +nonDefinitionKeyword :: RE Token Token +nonDefinitionKeyword = psym (`notElem` keywords) + where + keywords = + [ Word "if" + , Word "iff" + , Symbol "=" + , Command "iff" + ] + + +pat :: RE Token [Token] +pat = many (psym isLexicalPhraseToken <|> var) + +structPat :: RE Token [Token] +structPat = many (psym isLexicalPhraseToken) + +math :: RE Token a -> RE Token a +math re = sym (BeginEnv "math") *> re <* sym (EndEnv "math") + +-- | We allow /conditional perfection/: the first /@if@/ in a definition is interpreted as /@iff@/. +iff :: RE Token () +iff = void (sym (Word "if")) -- Using @void@ is faster (only requires recognition). 
+  <|> void (sym (Word "iff"))
+  <|> void (string [Word "if", Word "and", Word "only", Word "if"])
+  <|> void (sym (Word "denote"))
+  <|> void (sym (Word "stand") *> sym (Word "for"))
+{-# INLINE iff #-}
+
+an :: RE Token ()
+an = void (sym (Word "a"))
+  <|> void (sym (Word "an"))
+{-# INLINE an #-}
+
+is :: RE Token ()
+is = void (sym (Word "is") <|> sym (Word "denotes"))
+{-# INLINE is #-}
+
+the :: RE Token ()
+the = void (sym (Word "the"))
+{-# INLINE the #-}
+
+comma :: RE Token ()
+comma = void (sym (Symbol ","))
+{-# INLINE comma #-}
+
+
+isVar :: Token -> Bool
+isVar = \case
+  Variable _ -> True
+  _token -> False
+
+isCommand :: Token -> Bool
+isCommand = \case
+  Command _ -> True
+  _token -> False
+
+isNotDefnToken :: Token -> Bool
+isNotDefnToken = \case
+  BeginEnv "definition" -> False
+  EndEnv "definition" -> False
+  _token -> True
+
+isLexicalPhraseToken :: Token -> Bool
+isLexicalPhraseToken = \case
+  Word w -> w `Set.notMember` keywords
+  --
+  -- Simple commands (outside of math-mode) are allowed. This is useful
+  -- for defining lexical phrases containing symbolic expressions such as
+  -- `X is \Ttwo{}`, where `\Ttwo` is a macro that expands to `T_2`.
+  -- We also allow these macros to take arguments, hence the need to
+  -- allow grouping delimiters. They can also be used to escape the end
+  -- of the command for correct spacing, as in the above example.
+  --
+  Command _cmd -> True
+  InvisibleBraceL -> True
+  InvisibleBraceR -> True
+  --
+  -- No other tokens may occur in lexical phrases. In particular, no `_dot`
+  -- token may occur, limiting the lexical phrase to a single sentence.
+  -- Commas occurring in variable lists should be placed
+  -- within the math environment. Thus `$a,b$ are coprime iff`,
+  -- not `$a$`, `$b$` are coprime iff`. 
+ -- + _token -> False + where + keywords = Set.fromList ["a", "an", "is", "are", "if", "iff", "denote", "stand", "let"] + + +toLexicalPhrase :: [Token] -> LexicalPhrase +toLexicalPhrase toks = component <$> toks + where + component = \case + Variable _ -> Nothing + tok -> Just tok + + +symbol :: RE Token Token +symbol = msym $ \tok -> case tok of + Command _ -> Just tok + Symbol _ -> Just tok + _tok -> Nothing + + +-- | Basic paradigms for pluralizations of nominals. +guessNounPlural :: LexicalPhrase -> SgPl LexicalPhrase +guessNounPlural item = SgPl item (pluralize item) + where + pluralize :: LexicalPhrase -> LexicalPhrase + pluralize = \case + Just (Word w) : pat'@(Just w' : _) | isPreposition w' -> Just (Word (Text.snoc w 's')) : pat' + [Just (Word w)] -> [Just (Word (Text.snoc w 's'))] + [tok, Just (Word w)] -> [tok, Just (Word (Text.snoc w 's'))] + [tok, tok', Just (Word w)] -> [tok, tok', Just (Word (Text.snoc w 's'))] + pat' -> pat' + +guessVerbPlural :: LexicalPhrase -> SgPl LexicalPhrase +guessVerbPlural item = SgPl item itemPl + where + itemPl = case item of + Just (Word v) : rest -> case Text.unsnoc v of + Just (v', 's') -> Just (Word v') : rest + _ -> item + _ -> item + +isAdjR :: LexicalPhrase -> Bool +isAdjR item = containsPreposition item || containsSlot item + where + containsPreposition, containsSlot :: LexicalPhrase -> Bool + containsPreposition = any isPreposition . catMaybes + containsSlot = (Nothing `elem`) + +isPreposition :: Token -> Bool +isPreposition w = HS.member w (HS.map Word prepositions) + +-- | Right-biased set insertion, keeping the original set +-- when inserting already present elements. @insertR@ is unfortunately +-- a hidden function, even in @Data.Set.Internal@, so we approximate it +-- here. In theory one could avoid the indirection of first forming the singleton +-- set on the rhs. +insertR' :: Hashable a => a -> HashSet a -> HashSet a +insertR' x xs = xs `HS.union` HS.singleton x -- @union@ is left-biased. 
+ +insertMapR :: Ord k => k -> a -> Map k a -> Map k a +insertMapR k x xs = xs `Map.union` Map.singleton k x -- @union@ is left-biased. + + +insertR :: Hashable k => k -> a -> HashMap k a -> HashMap k a +insertR k x xs = xs `HM.union` HM.singleton k x -- @union@ is left-biased. + +-- | Takes the scanned lexical phrases and inserts them in the correct +-- places in a lexicon. +extendLexicon :: [ScannedLexicalItem] -> Lexicon -> Lexicon +extendLexicon [] lexicon = lexicon +-- Note that we only consider 'sg' in the 'Ord' instance of SgPl, so that +-- known irregular plurals are preserved. +extendLexicon (scan : scans) lexicon@Lexicon{..} = case scan of + ScanAdj item m -> if isAdjR item + then extendLexicon scans lexicon{lexiconAdjRs = insertR item m lexiconAdjRs} + else extendLexicon scans lexicon{lexiconAdjLs = insertR item m lexiconAdjLs} + ScanFun item m -> + extendLexicon scans lexicon{lexiconFuns = insertR (guessNounPlural item) m lexiconFuns} + ScanVerb item m -> + extendLexicon scans lexicon{lexiconVerbs = insertR (guessVerbPlural item) m lexiconVerbs} + ScanNoun item m -> + extendLexicon scans lexicon{lexiconNouns = insertR (guessNounPlural item) m lexiconNouns} + ScanStructNoun item m -> + extendLexicon scans lexicon{lexiconStructNouns = insertR (guessNounPlural item) m lexiconStructNouns} + ScanRelationSymbol item m -> + extendLexicon scans lexicon{lexiconRelationSymbols = insertR item m lexiconRelationSymbols} + ScanFunctionSymbol item m -> + if any (item `HM.member`) lexiconMixfix + then extendLexicon scans lexicon + else extendLexicon scans lexicon{lexiconMixfix = Seq.adjust (HM.insert item (NonAssoc, m)) 9 lexiconMixfix} + ScanStructOp op -> + extendLexicon scans lexicon{lexiconStructFun = insertR (StructSymbol op) (Marker op) lexiconStructFun} + ScanPrefixPredicate tok m -> + extendLexicon scans lexicon{lexiconPrefixPredicates = insertR tok m lexiconPrefixPredicates} diff --git a/source/Syntax/Chunk.hs b/source/Syntax/Chunk.hs new file mode 100644 
index 0000000..604ccf6
--- /dev/null
+++ b/source/Syntax/Chunk.hs
@@ -0,0 +1,46 @@
+module Syntax.Chunk where
+
+import Base
+import Syntax.Token
+
+import Data.List qualified as List
+
+
+-- LATER This is just a naïve implementation of token chunks.
+-- It and the current tokenizer should probably be replaced by
+-- a more efficient implementation that does chunking
+-- directly while tokenizing.
+
+chunkify :: [Located Token] -> [[Located Token]]
+chunkify [] = []
+chunkify (tok : toks) = case unLocated tok of
+  BeginEnv env | isTopLevelEnv env ->
+    let (axiomToks, otherToks) = envBlock env toks
+    in (tok : axiomToks) : chunkify otherToks
+  --
+  -- Skip all tokens outside of toplevel environments.
+  _ -> chunkify toks
+
+isTopLevelEnv :: Text -> Bool
+isTopLevelEnv env = env `elem`
+  [ "abbreviation"
+  , "axiom"
+  , "claim"
+  , "corollary"
+  , "datatype"
+  , "definition"
+  , "inductive"
+  , "lemma"
+  , "proof"
+  , "proposition"
+  , "signature"
+  , "struct"
+  , "theorem"
+  ]
+
+envBlock :: Text -> [Located Token] -> ([Located Token], [Located Token])
+envBlock env toks =
+  let (pre, post) = List.span (\tok -> unLocated tok /= EndEnv env) toks
+  in case post of
+    [] -> (pre, post)
+    endEnv : post' -> (pre ++ [endEnv], post')
diff --git a/source/Syntax/Concrete.hs b/source/Syntax/Concrete.hs
new file mode 100644
index 0000000..8c6962d
--- /dev/null
+++ b/source/Syntax/Concrete.hs
@@ -0,0 +1,657 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE RecursiveDo #-}
+
+-- | Concrete syntax of the surface language. 
+module Syntax.Concrete where + +import Base +import Syntax.Abstract +import Syntax.Concrete.Keywords +import Syntax.Lexicon (Lexicon(..), lexiconAdjs, splitOnVariableSlot) +import Syntax.Token + +import Data.HashSet qualified as HS +import Data.List.NonEmpty qualified as NonEmpty +import Data.HashMap.Strict qualified as HM +import Text.Earley (Grammar, Prod, (), rule, satisfy, terminal) +import Text.Earley.Mixfix +import Text.Megaparsec.Pos (SourcePos) + + +grammar :: Lexicon -> Grammar r (Prod r Text (Located Token) Block) +grammar lexicon@Lexicon{..} = mdo + let makeOp :: ([Maybe Token], Associativity) -> ([Maybe (Prod r Text (Located Token) Token)], Associativity) + makeOp (pat, assoc) = (map (fmap token) pat, assoc) + ops = map (map makeOp) (toList (HM.toList . HM.map fst <$> lexiconMixfix)) + conns = map (map makeOp) lexiconConnectives + + integer <- rule (terminal maybeIntToken "integer") + relator <- rule $ unLocated <$> (satisfy (\ltok -> unLocated ltok `HM.member` lexiconRelationSymbols) "relator") + varSymbol <- rule (terminal maybeVarToken "variable") + varSymbols <- rule (commaList varSymbol) + cmd <- rule (terminal maybeCmdToken "TEX command") +-- +-- Formulas have three levels: +-- +-- + Expressions: atoms or operators applied to atoms. +-- + Chains: comma-lists of expressions, separated by relators. +-- + Formulas: chains or connectives applied to chains. +-- +-- For example, the formula @x, y < z \implies x, y < z + 1@ consist of the +-- connective @\implies@ applied to two chains @x, y < z@ and @x, y < z + 1@. +-- In turn, the chain @x, y < z + 1@ consist of three expressions, +-- @x@, @y@, and @z + 1@. Finally, @z + 1@ consist the operator @+@ +-- applied to two atoms, the variable @z@ and the number literal @1@. +-- +-- This split is due to the different behaviour of relators compared to +-- operators and connectives. Relators can chain (@x < y < z@) and allow +-- lists as arguments, as in the above example. 
Operators and connectives
+-- instead have precedence and fixity. The only syntactic difference between
+-- an operator and a connective is the relative precedence compared to relators.
+--
+    -- Replacement (set-builder) notation, e.g. @{ f(x) | x ∈ X | P(x) }@.
+    replaceBound <- rule $ (,) <$> varSymbol <* _in <*> expr
+    replaceBounds <- rule $ commaList replaceBound
+    replaceFun <- rule $ ExprReplace <$> expr <* _pipe <*> replaceBounds <*> optional (_pipe *> comprStmt)
+    comprStmt <- rule $ (StmtFormula <$> formula) <|> text stmt
+
+    replacePredSymbolic <- rule $ ExprReplacePred <$> varSymbol <* _pipe <*> (command "exists" *> varSymbol) <* _in <*> expr <* _dot <*> (StmtFormula <$> formula)
+    replacePredText <- rule $ ExprReplacePred <$> varSymbol <* _pipe <*> (begin "text" *> _exists *> beginMath *> varSymbol <* _in) <*> expr <* endMath <* _suchThat <*> stmt <* end "text"
+    replacePred <- rule $ replacePredSymbolic <|> replacePredText
+
+    -- One alternative per structure operation known to the lexicon.
+    let exprStructOpOf ann = HS.foldr alg empty (HM.keysSet lexiconStructFun)
+          where
+            alg s prod = prod <|> (ExprStructOp <$> structSymbol s <*> ann)
+
+    exprStructOp <- rule (exprStructOpOf (optional (bracket expr)))
+
+    let bracedArgs1 ar arg = count1 ar $ group arg
+    let prefixPredicateOf f arg symb@(PrefixPredicate c ar) = f <$> pure symb <* command c <*> bracedArgs1 ar arg
+
+
+    exprParen <- rule $ paren expr
+    exprInteger <- rule $ ExprInteger <$> integer
+    exprVar <- rule $ ExprVar <$> varSymbol
+    exprTuple <- rule $ makeTuple <$> paren (commaList2 expr)
+    exprSep <- rule $ brace $ ExprSep <$> varSymbol <* _in <*> expr <* _pipe <*> comprStmt
+    exprReplace <- rule $ brace $ (replaceFun <|> replacePred)
+    exprFinSet <- rule $ brace $ ExprFiniteSet <$> exprs
+    exprBase <- rule $ asum [exprVar, exprInteger, exprStructOp, exprParen, exprTuple, exprSep, exprReplace, exprFinSet]
+    exprApp <- rule $ ExprApp <$> exprBase <*> (paren expr <|> exprTuple)
+    expr <- mixfixExpression ops (exprBase <|> exprApp) ExprOp
+    exprs <- rule $ commaList expr
+
+    -- Chains: comma-lists of expressions separated by (possibly negated) relators.
+    relationSign <- rule $ pure Positive <|> (Negative <$ command "not")
+    relationExpr <- rule $ RelationExpr <$> (command "mathrel" *> group expr)
+    relation <- rule $ (RelationSymbol <$> relator) <|> relationExpr
+    chainBase <- rule $ ChainBase <$> exprs <*> relationSign <*> relation <*> exprs
+    chainCons <- rule $ ChainCons <$> exprs <*> relationSign <*> relation <*> chain
+    chain <- rule $ chainCons <|> chainBase
+
+    formulaPredicate <- rule $ asum $ prefixPredicateOf FormulaPredicate expr <$> HM.keys lexiconPrefixPredicates
+    formulaChain <- rule $ FormulaChain <$> chain
+    formulaBottom <- rule $ PropositionalConstant IsBottom <$ command "bot" <?> "\"\\bot\""
+    formulaTop <- rule $ PropositionalConstant IsTop <$ command "top" <?> "\"\\top\""
+    formulaExists <- rule $ FormulaQuantified Existentially <$> (command "exists" *> varSymbols) <*> maybeBounded <* _dot <*> formula
+    formulaAll <- rule $ FormulaQuantified Universally <$> (command "forall" *> varSymbols) <*> maybeBounded <* _dot <*> formula
+    formulaQuantified <- rule $ formulaExists <|> formulaAll
+    formulaBase <- rule $ asum [formulaChain, formulaPredicate, formulaBottom, formulaTop, paren formula]
+    formulaConn <- mixfixExpression conns formulaBase makeConnective
+    formula <- rule $ formulaQuantified <|> formulaConn
+
+-- These are asymmetric formulas (only variables are allowed on one side).
+-- They express judgements.
+--
+    assignment <- rule $ (,) <$> varSymbol <* (_eq <|> _defeq) <*> expr
+    typing <- rule $ (,) <$> varSymbols <* (_in <|> _colon) <*> expr
+
+    adjL <- rule $ adjLOf lexicon term
+    adjR <- rule $ adjROf lexicon term
+    adj <- rule $ adjOf lexicon term
+    adjVar <- rule $ adjOf lexicon var
+
+    var <- rule $ math varSymbol
+    vars <- rule $ math varSymbols
+
+    verb <- rule $ verbOf lexicon sg term
+    verbPl <- rule $ verbOf lexicon pl term
+    verbVar <- rule $ verbOf lexicon sg var
+
+    noun <- rule $ nounOf lexicon sg term nounName -- Noun with optional variable name.
+    nounList <- rule $ nounOf lexicon sg term nounNames -- Noun with a list of names.
+    nounVar <- rule $ fst <$> nounOf lexicon sg var (pure Nameless) -- No names in defined nouns.
+    nounPl <- rule $ nounOf lexicon pl term nounNames
+    nounPlMay <- rule $ nounOf lexicon pl term nounName
+
+    structNoun <- rule $ structNounOf lexicon sg var var
+    structNounNameless <- rule $ fst <$> structNounOf lexicon sg var (pure Nameless)
+
+
+    fun <- rule $ funOf lexicon sg term
+    funVar <- rule $ funOf lexicon sg var
+
+    -- Right attributes: at most two right adjectives and at most two
+    -- "that"-clauses, each optionally joined with "and".
+    attrRThat <- rule $ AttrRThat <$> thatVerbPhrase
+    attrRThats <- rule $ ((:[]) <$> attrRThat) <|> ((\a a' -> [a,a']) <$> attrRThat <* _and <*> attrRThat) <|> pure []
+    attrRs <- rule $ ((:[]) <$> adjR) <|> ((\a a' -> [a,a']) <$> adjR <* _and <*> adjR) <|> pure []
+    attrRight <- rule $ (<>) <$> attrRs <*> attrRThats
+
+    -- Singular verb phrases ("is …", "does not …").
+    verbPhraseVerbSg <- rule $ VPVerb <$> verb
+    verbPhraseVerbNotSg <- rule $ VPVerbNot <$> (_does *> _not *> verbPl)
+    verbPhraseAdjSg <- rule $ VPAdj . (:|[]) <$> (_is *> adj)
+    verbPhraseAdjAnd <- rule do {_is; a1 <- adj; _and; a2 <- adj; pure (VPAdj (a1 :| [a2]))}
+    verbPhraseAdjNotSg <- rule $ VPAdjNot . (:|[]) <$> (_is *> _not *> adj)
+    verbPhraseNotSg <- rule $ verbPhraseVerbNotSg <|> verbPhraseAdjNotSg
+    verbPhraseSg <- rule $ verbPhraseVerbSg <|> verbPhraseAdjSg <|> verbPhraseAdjAnd <|> verbPhraseNotSg
+
+    -- Plural verb phrases ("are …", "do not …").
+    -- LATER can cause technical ambiguities? verbPhraseVerbPl <- rule $ VPVerb <$> verbPl
+    verbPhraseVerbNotPl <- rule $ VPVerbNot <$> (_do *> _not *> verbPl)
+    verbPhraseAdjPl <- rule $ VPAdj . (:|[]) <$> (_are *> adj)
+    verbPhraseAdjNotPl <- rule $ VPAdjNot . (:|[]) <$> (_are *> _not *> adj)
+    verbPhraseNotPl <- rule $ verbPhraseVerbNotPl <|> verbPhraseAdjNotPl
+    verbPhrasePl <- rule $ verbPhraseAdjPl <|> verbPhraseNotPl -- LATER <|> verbPhraseVerbPl
+
+
+
+    thatVerbPhrase <- rule $ _that *> verbPhraseSg
+
+    nounName <- rule $ optional (math varSymbol)
+    nounNames <- rule $ math (commaList_ varSymbol) <|> pure []
+    nounPhrase <- rule $ makeNounPhrase <$> many adjL <*> noun <*> attrRight <*> optional suchStmt
+    nounPhrase' <- rule $ makeNounPhrase <$> many adjL <*> nounList <*> attrRight <*> optional suchStmt
+    nounPhrasePl <- rule $ makeNounPhrase <$> many adjL <*> nounPl <*> attrRight <*> optional suchStmt
+    nounPhrasePlMay <- rule $ makeNounPhrase <$> many adjL <*> nounPlMay <*> attrRight <*> optional suchStmt
+    nounPhraseMay <- rule $ makeNounPhrase <$> many adjL <*> noun <*> attrRight <*> optional suchStmt
+
+    -- Quantification phrases for quantification and indefinite terms.
+    quantAll <- rule $ QuantPhrase Universally <$> (_forEvery *> nounPhrase' <|> _forAll *> nounPhrasePl)
+    quantSome <- rule $ QuantPhrase Existentially <$> (_some *> (nounPhrase' <|> nounPhrasePl))
+    quantNone <- rule $ QuantPhrase Nonexistentially <$> (_no *> (nounPhrase' <|> nounPhrasePl))
+    quant <- rule $ quantAll <|> quantSome <|> quantNone -- <|> quantUniq
+
+
+    termExpr <- rule $ TermExpr <$> math expr
+    termFun <- rule $ TermFun <$> (optional _the *> fun)
+    termIota <- rule $ TermIota <$> (_the *> var) <* _suchThat <*> stmt
+    termAll <- rule $ TermQuantified Universally <$> (_every *> nounPhraseMay)
+    termSome <- rule $ TermQuantified Existentially <$> (_some *> nounPhraseMay)
+    termNo <- rule $ TermQuantified Nonexistentially <$> (_no *> nounPhraseMay)
+    termQuantified <- rule $ termAll <|> termSome <|> termNo
+    term <- rule $ termExpr <|> termFun <|> termQuantified <|> termIota
+
+-- Basic statements @stmt'@ are statements without any conjunctions or quantifiers.
+--
+    stmtVerbSg <- rule $ StmtVerbPhrase <$> ((:| []) <$> term) <*> verbPhraseSg
+    stmtVerbPl <- rule $ StmtVerbPhrase <$> andList1 term <*> verbPhrasePl
+    stmtVerb <- rule $ stmtVerbSg <|> stmtVerbPl
+    stmtNounIs <- rule $ StmtNoun <$> term <* _is <* _an <*> nounPhrase
+    stmtNounIsNot <- rule $ StmtNeg <$> (StmtNoun <$> term <* _is <* _not <* _an <*> nounPhrase)
+    stmtNoun <- rule $ stmtNounIs <|> stmtNounIsNot
+    stmtStruct <- rule $ StmtStruct <$> (term <* _is <* _an) <*> structNounNameless
+    stmtExists <- rule $ StmtExists <$> (_exists *> _an *> nounPhrase')
+    stmtExist <- rule $ StmtExists <$> (_exist *> nounPhrasePl)
+    stmtExistsNot <- rule $ StmtNeg . StmtExists <$> (_exists *> _no *> nounPhrase')
+    stmtFormula <- rule $ StmtFormula <$> math formula
+    stmtBot <- rule $ StmtFormula (PropositionalConstant IsBottom) <$ _contradiction
+    stmt' <- rule $ stmtVerb <|> stmtNoun <|> stmtStruct <|> stmtFormula <|> stmtBot
+    -- Connectives nest to the right: the right operand is a full @stmt@,
+    -- the left operand only the next-stronger level.
+    stmtOr <- rule $ stmt' <|> (StmtConnected Disjunction <$> stmt' <* _or <*> stmt)
+    stmtAnd <- rule $ stmtOr <|> (StmtConnected Conjunction <$> stmtOr <* _and <*> stmt)
+    stmtIff <- rule $ stmtAnd <|> (StmtConnected Equivalence <$> stmtAnd <* _iff <*> stmt)
+    stmtIf <- rule $ StmtConnected Implication <$> (_if *> stmt) <* optional _comma <* _then <*> stmt
+    stmtXor <- rule $ StmtConnected ExclusiveOr <$> (_either *> stmt) <* _or <*> stmt
+    stmtNor <- rule $ StmtConnected NegatedDisjunction <$> (_neither *> stmt) <* _nor <*> stmt
+    stmtNeg <- rule $ StmtNeg <$> (_itIsWrong *> stmt)
+
+    stmtQuantPhrase <- rule $ StmtQuantPhrase <$> (_for *> quant) <* optional _comma <* optional _have <*> stmt
+
+    -- Trailing "such that …" clause attached to noun phrases.
+    suchStmt <- rule $ _suchThat *> stmt <* optional _comma
+
+
+    -- Symbolic quantifications with or without generalized bounds.
+    symbolicForall <- rule $ SymbolicForall
+        <$> ((_forAll <|> _forEvery) *> beginMath *> varSymbols)
+        <*> maybeBounded <* endMath
+        <*> optional suchStmt
+        <* optional _have <*> stmt
+    -- An exists without "such that" asserts plain existence (body ⊤).
+    symbolicExists <- rule $ SymbolicExists
+        <$> ((_exists <|> _exist) *> beginMath *> varSymbols)
+        <*> maybeBounded <* endMath
+        <*> ((_suchThat *> stmt) <|> pure (StmtFormula (PropositionalConstant IsTop)))
+    symbolicNotExists <- rule $ SymbolicNotExists
+        <$> (_exists *> _no *> beginMath *> varSymbols)
+        <*> maybeBounded <* endMath
+        <* _suchThat <*> stmt
+    symbolicBound <- rule $ Bounded <$> relationSign <*> relation <*> expr
+    maybeBounded <- rule (pure Unbounded <|> symbolicBound)
+
+    symbolicQuantified <- rule $ symbolicForall <|> symbolicExists <|> symbolicNotExists
+
+    stmt <- rule $ asum [stmtNeg, stmtIf, stmtXor, stmtNor, stmtExists, stmtExist, stmtExistsNot, stmtQuantPhrase, stmtIff, symbolicQuantified] <?> "a statement"
+
+    -- Assumptions introduced with "let …" or "suppose …".
+    asmLetIn <- rule $ uncurry AsmLetIn <$> (_let *> math typing)
+    asmLetNoun <- rule $ AsmLetNoun <$> (_let *> fmap pure var <* (_be <|> _denote) <* _an) <*> nounPhrase
+    asmLetNouns <- rule $ AsmLetNoun <$> (_let *> vars <* (_be <|> _denote)) <*> nounPhrasePlMay
+    asmLetEq <- rule $ uncurry AsmLetEq <$> (_let *> math assignment)
+    asmLetThe <- rule $ AsmLetThe <$> (_let *> var <* _be <* _the) <*> fun
+    asmLetStruct <- rule $ AsmLetStruct <$> (_let *> var <* _be <* _an) <*> structNounNameless
+    asmLet <- rule $ asmLetNoun <|> asmLetNouns <|> asmLetIn <|> asmLetEq <|> asmLetThe <|> asmLetStruct
+    asmSuppose <- rule $ AsmSuppose <$> (_suppose *> stmt)
+    asm <- rule $ assumptionList (asmLet <|> asmSuppose) <* _dot
+    asms <- rule $ concat <$> many asm
+
+    axiom <- rule $ Axiom <$> asms <* optional _then <*> stmt <* _dot
+
+    lemma <- rule $ Lemma <$> asms <* optional _then <*> stmt <* _dot
+
+    defnAdj <- rule $ DefnAdj <$> optional (_an *> nounPhrase) <*> var <* _is <*> adjVar
+    defnVerb <- rule $ DefnVerb <$> optional (_an *> nounPhrase) <*> var <*> verbVar
+
defnNoun <- rule $ DefnNoun <$> var <* _is <* _an <*> nounVar
+    defnRel <- rule $ DefnRel <$> (beginMath *> varSymbol) <*> relator <*> varSymbol <* endMath
+    defnSymbolicPredicate <- rule $ math $ asum $ prefixPredicateOf DefnSymbolicPredicate varSymbol <$> HM.keys lexiconPrefixPredicates
+    defnHead <- rule $ optional _write *> asum [defnAdj, defnVerb, defnNoun, defnRel, defnSymbolicPredicate]
+
+    defnIf <- rule $ Defn <$> asms <*> defnHead <* (_iff <|> _if) <*> stmt <* _dot
+    defnFunSymb <- rule $ _comma *> termExpr <* _comma -- ^ Optional symbolic equivalent.
+    defnFun <- rule $ DefnFun <$> asms <*> (optional _the *> funVar) <*> optional defnFunSymb <* _is <*> term <* _dot
+
+    symbolicPatternEqTerm <- rule do
+        pat <- beginMath *> symbolicPattern <* _eq
+        e <- expr <* endMath <* _dot
+        pure (pat, e)
+    defnOp <- rule $ uncurry DefnOp <$> symbolicPatternEqTerm
+
+    defn <- rule $ defnIf <|> defnFun <|> defnOp
+
+    abbreviationVerb <- rule $ AbbreviationVerb <$> var <*> verbVar <* (_iff <|> _if) <*> stmt <* _dot
+    abbreviationAdj <- rule $ AbbreviationAdj <$> var <* _is <*> adjVar <* (_iff <|> _if) <*> stmt <* _dot
+    abbreviationNoun <- rule $ AbbreviationNoun <$> var <* _is <* _an <*> nounVar <* (_iff <|> _if) <*> stmt <* _dot
+    abbreviationRel <- rule $ AbbreviationRel <$> (beginMath *> varSymbol) <*> relator <*> varSymbol <* endMath <* (_iff <|> _if) <*> stmt <* _dot
+    abbreviationFun <- rule $ AbbreviationFun <$> (_the *> funVar) <* (_is <|> _denotes) <*> term <* _dot
+    abbreviationEq <- rule $ uncurry AbbreviationEq <$> symbolicPatternEqTerm
+    abbreviation <- rule $ (abbreviationVerb <|> abbreviationAdj <|> abbreviationNoun <|> abbreviationRel <|> abbreviationFun <|> abbreviationEq)
+
+    datatypeFin <- rule $ DatatypeFin <$> fmap fst (_an *> noun) <*> (_is *> _oneOf *> orList2 (math cmd) <* _dot)
+    datatype <- rule datatypeFin
+
+    -- Introduction rules for inductive definitions.
+    unconditionalIntro <- rule $ IntroRule [] <$> math formula
+    conditionalIntro <- rule $ IntroRule <$> (_if *> andList1_ (math formula)) <* _comma <* _then <*> math formula
+    inductiveIntro <- rule $ (unconditionalIntro <|> conditionalIntro) <* _dot
+    inductiveDomain <- rule $ math $ (,) <$> symbolicPattern <* _subseteq <*> expr
+    inductiveHead <- rule $ _define *> inductiveDomain <* optional _inductively <* optional _asFollows <* _dot
+    inductive <- rule $ uncurry Inductive <$> inductiveHead <*> enumerated1 inductiveIntro
+
+    signatureAdj <- rule $ SignatureAdj <$> var <* _can <* _be <*> adjOf lexicon var
+    symbolicPattern <- symbolicPatternOf ops varSymbol
+    signatureSymbolic <- rule $ SignatureSymbolic <$> math symbolicPattern <* _is <* _an <*> nounPhrase
+    signature <- rule $ (,) <$> asms <* optional _then <*> (signatureAdj <|> signatureSymbolic) <* _dot
+
+    structFix <- rule do
+        beginMath
+        rawCmd <- cmd
+        endMath
+        pure (StructSymbol rawCmd)
+    structDefn <- rule $ do
+        _an
+        ~(structPhrase, structLabel) <- structNoun
+        _extends
+        structParents <- andList1_ (_an *> structNounNameless)
+        maybeFixes <- optional (_equipped *> enumerated structFix)
+        structAssumes <- (_suchThat *> enumeratedMarked (stmt <* _dot)) <|> ([] <$ _dot)
+        pure StructDefn
+            { structPhrase = structPhrase
+            , structLabel = structLabel
+            , structParents = structParents
+            , structFixes = maybeFixes ?? []
+            , structAssumes = structAssumes
+            }
+
+    justificationSet <- rule $ JustificationSetExt <$ _bySetExt
+    justificationRef <- rule $ JustificationRef <$> (_by *> ref)
+    justificationLocal <- rule $ JustificationLocal <$ (_by *> (_assumption <|> _definition))
+    justification <- rule (justificationSet <|> justificationRef <|> justificationLocal <|> pure JustificationEmpty)
+
+    trivial <- rule $ Qed JustificationEmpty <$ _trivial <* _dot
+    omitted <- rule $ Omitted <$ _omitted <* _dot
+    qedJustified <- rule $ Qed <$> (_follows *> justification <* _dot)
+    qed <- rule $ qedJustified <|> trivial <|> omitted <|> pure (Qed JustificationEmpty)
+
+    -- Equational and biconditional calculations inside @align*@.
+    let alignedEq = symbol "&=" <?> "\"&=\""
+    explanation <- rule $ (text justification) <|> pure JustificationEmpty
+    equationItem <- rule $ (,) <$> (alignedEq *> expr) <*> explanation
+    equations <- rule $ Equation <$> expr <*> (many1 equationItem)
+
+    let alignedIff = symbol "&" *> command "iff" <?> "\"&\\iff\""
+    biconditionalItem <- rule $ (,) <$> (alignedIff *> formula) <*> explanation
+    biconditionals <- rule $ Biconditionals <$> formula <*> (many1 biconditionalItem)
+
+    calc <- rule $ Calc <$> align (equations <|> biconditionals) <*> proof
+
+    caseOf <- rule $ command "caseOf" *> token InvisibleBraceL *> stmt <* _dot <* token InvisibleBraceR
+    byCases <- rule $ ByCase <$> env_ "byCase" (many1_ (Case <$> caseOf <*> proof))
+    byContradiction <- rule $ ByContradiction <$ _suppose <* _not <* _dot <*> proof
+    bySetInduction <- rule $ BySetInduction <$> proofBy (_in *> word "-induction" *> optional (word "on" *> term)) <*> proof
+    -- NOTE(review): unlike 'bySetInduction', the continuation @proof@ here
+    -- sits inside the bracketed annotation — confirm this is intentional.
+    byOrdInduction <- rule $ ByOrdInduction <$ proofBy (word "transfinite" *> word "induction" *> proof)
+    assume <- rule $ Assume <$> (_suppose *> stmt <* _dot) <*> proof
+
+    fixSymbolic <- rule $ FixSymbolic <$> (_fix *> beginMath *> varSymbols) <*> maybeBounded <* endMath <* _dot <*> proof
+    fixSuchThat <- rule $ FixSuchThat <$> (_fix *> math varSymbols) <* _suchThat <*> stmt <* _dot <*> proof
+    fix <- rule $ fixSymbolic <|> fixSuchThat
+
+    takeVar <- rule $ TakeVar <$> (_take *> beginMath *> varSymbols) <*> maybeBounded <* endMath <* _suchThat <*> stmt <*> justification <* _dot <*> proof
+    takeNoun <- rule $ TakeNoun <$> (_take *> _an *> (nounPhrase' <|> nounPhrasePl)) <*> justification <* _dot <*> proof
+    take <- rule $ takeVar <|> takeNoun
+    suffices <- rule $ Suffices <$> (_sufficesThat *> stmt) <*> (justification <* _dot) <*> proof
+    subclaim <- rule $ Subclaim <$> (_show *> stmt <* _dot) <*> env_ "subproof" proof <*> proof
+    have <- rule $ Have <$> optional (_since *> stmt <* _comma <* _have) <* optional _haveIntro <*> stmt <*> justification <* _dot <*> proof
+
+    define <- rule $ Define <$> (_let *> beginMath *> varSymbol <* _eq) <*> expr <* endMath <* _dot <*> proof
+    defineFunction <- rule $ DefineFunction <$> (_let *> beginMath *> varSymbol) <*> paren varSymbol <* _eq <*> expr <* endMath <* _for <* beginMath <*> varSymbol <* _in <*> expr <* endMath <* _dot <*> proof
+
+    proof <- rule $ asum [byContradiction, byCases, bySetInduction, byOrdInduction, calc, subclaim, assume, fix, take, have, suffices, define, defineFunction, qed]
+
+
+    blockAxiom <- rule $ uncurry3 BlockAxiom <$> envPos "axiom" axiom
+    blockLemma <- rule $ uncurry3 BlockLemma <$> lemmaEnv lemma
+    blockProof <- rule $ uncurry BlockProof <$> envPos_ "proof" proof
+    blockDefn <- rule $ uncurry3 BlockDefn <$> envPos "definition" defn
+    blockAbbr <- rule $ uncurry3 BlockAbbr <$> envPos "abbreviation" abbreviation
+    blockData <- rule $ uncurry BlockData <$> envPos_ "datatype" datatype
+    blockInd <- rule $ uncurry3 BlockInductive <$> envPos "inductive" inductive
+    blockSig <- rule $ (\(p, (a, s)) -> BlockSig p a s) <$> envPos_ "signature" signature
+    blockStruct <- rule $ uncurry3 BlockStruct <$> envPos "struct" structDefn
+    block <- rule $ asum [blockAxiom, blockLemma, blockDefn, blockAbbr, blockData, blockInd, blockSig, blockStruct, blockProof]
+
+    -- Starting category.
+    pure block
+
+
+-- | A bracketed proof-method annotation: @[proof by <method>]@.
+proofBy :: Prod r Text (Located Token) a -> Prod r Text (Located Token) a
+proofBy method = bracket $ word "proof" *> word "by" *> method
+
+
+-- | Accept any of the lemma-like environments
+-- (theorem, lemma, corollary, claim, proposition).
+lemmaEnv :: Prod r Text (Located Token) a -> Prod r Text (Located Token) (SourcePos, Marker, a)
+lemmaEnv content = asum
+    [ envPos "theorem" content
+    , envPos "lemma" content
+    , envPos "corollary" content
+    , envPos "claim" content
+    , envPos "proposition" content
+    ]
+
+
+-- | A disjunctive list with at least two items:
+-- * 'a or b'
+-- * 'a, b, or c'
+-- * 'a, b, c, or d'
+--
+orList2 :: Prod r Text (Located Token) a -> Prod r Text (Located Token) (NonEmpty a)
+orList2 item = ((:|) <$> item <*> many (_commaOr *> item))
+    <|> ((\i j -> i:|[j]) <$> item <* _or <*> item)
+
+
+-- | Nonempty textual lists of the form "a, b, c, and d".
+-- The final comma is mandatory, 'and' is not.
+-- Also allows "a and b". Should therefore be avoided in contexts where
+-- a logical conjunction would also be possible.
+-- Currently also allows additional 'and's after each comma...
+--
+andList1 :: Prod r Text (Located Token) a -> Prod r Text (Located Token) (NonEmpty a)
+andList1 item = ((:|) <$> item <*> many (_commaAnd *> item))
+    <|> ((\i j -> i:|[j]) <$> item <* _and <*> item)
+
+-- | Like 'andList1', but drops the information about nonemptiness.
+andList1_ :: Prod r Text (Located Token) a -> Prod r Text (Located Token) [a]
+andList1_ item = toList <$> andList1 item
+
+
+-- | Nonempty comma-separated list (no "and").
+commaList :: Prod r Text (Located Token) a -> Prod r Text (Located Token) (NonEmpty a)
+commaList item = (:|) <$> item <*> many (_comma *> item)
+
+-- | Like 'commaList', but drops the information about nonemptiness.
+commaList_ :: Prod r Text (Located Token) a -> Prod r Text (Located Token) [a]
+commaList_ item = NonEmpty.toList <$> commaList item
+
+-- | Like 'commaList', but requires at least two items (and hence at least one comma).
+commaList2 :: Prod r Text (Located Token) a -> Prod r Text (Located Token) (NonEmpty a)
+commaList2 item = (:|) <$> item <* _comma <*> commaList_ item
+
+
+-- | List of assumptions joined like 'andList1'.
+assumptionList :: Prod r Text (Located Token) a -> Prod r Text (Located Token) [a]
+assumptionList item = NonEmpty.toList <$> andList1 item
+
+-- | Like 'enumerated1', but drops the information about nonemptiness.
+enumerated :: Prod r Text (Located Token) a -> Prod r Text (Located Token) [a]
+enumerated p = NonEmpty.toList <$> enumerated1 p
+
+-- | One or more @\\item@s inside an @enumerate@ environment.
+enumerated1 :: Prod r Text (Located Token) a -> Prod r Text (Located Token) (NonEmpty a)
+enumerated1 p = begin "enumerate" *> many1 (command "item" *> p) <* end "enumerate" <?> "\"\\begin{enumerate} ...\""
+
+
+enumeratedMarked :: Prod r Text (Located Token) a -> Prod r Text (Located Token) [(Marker, a)]
+enumeratedMarked p = NonEmpty.toList <$> enumeratedMarked1 p
+
+-- | Like 'enumerated1', but each item must carry a @\\label@ marker.
+enumeratedMarked1 :: Prod r Text (Located Token) a -> Prod r Text (Located Token) (NonEmpty (Marker, a))
+enumeratedMarked1 p = begin "enumerate" *> many1 ((,) <$> (command "item" *> label) <*> p) <* end "enumerate" <?> "\"\\begin{enumerate}\\item\\label{...}...\""
+
+
+
+
+-- This function could be rewritten, so that it can be used directly in the grammar,
+-- instead of with specialized variants.
+--
+-- | Parse one of the lexical phrases selected from the lexicon:
+-- literal tokens ('Just') are matched, slots ('Nothing') parse an @arg@.
+phraseOf
+    :: (pat -> [a] -> b)
+    -> Lexicon
+    -> (Lexicon -> HashSet pat)
+    -> (pat -> LexicalPhrase)
+    -> Prod r Text (Located Token) a
+    -> Prod r Text (Located Token) b
+phraseOf constr lexicon selector proj arg =
+    uncurry constr <$> asum (fmap make pats)
+  where
+    pats = HS.toList (selector lexicon)
+    make pat = (\args -> (pat, args)) <$> go (proj pat)
+    go = \case
+        Just w : ws -> token w *> go ws
+        Nothing : ws -> (:) <$> arg <*> go ws
+        [] -> pure []
+
+adjLOf :: Lexicon -> Prod r Text (Located Token) arg -> Prod r Text (Located Token) (AdjLOf arg)
+adjLOf lexicon arg = phraseOf AdjL lexicon (HM.keysSet . lexiconAdjLs) id arg <?> "a left adjective"
+
+adjROf :: Lexicon -> Prod r Text (Located Token) arg -> Prod r Text (Located Token) (AdjROf arg)
+adjROf lexicon arg = phraseOf AdjR lexicon (HM.keysSet . lexiconAdjRs) id arg <?> "a right adjective"
+
+adjOf :: Lexicon -> Prod r Text (Located Token) arg -> Prod r Text (Located Token) (AdjOf arg)
+adjOf lexicon arg = phraseOf Adj lexicon (HM.keysSet . lexiconAdjs) id arg <?> "an adjective"
+
+verbOf
+    :: Lexicon
+    -> (SgPl LexicalPhrase -> LexicalPhrase)
+    -> Prod r Text (Located Token) a
+    -> Prod r Text (Located Token) (VerbOf a)
+verbOf lexicon proj arg = phraseOf Verb lexicon (HM.keysSet . lexiconVerbs) proj arg
+
+funOf
+    :: Lexicon
+    -> (SgPl LexicalPhrase -> LexicalPhrase)
+    -> Prod r Text (Located Token) a
+    -> Prod r Text (Located Token) (FunOf a)
+funOf lexicon proj arg = phraseOf Fun lexicon (HM.keysSet . lexiconFuns) proj arg <?> "functional phrase"
+
+
+-- | A noun with a @t VarSymbol@ as name(s).
+-- The name parser is inserted at the phrase's variable slot
+-- (see 'splitOnVariableSlot').
+nounOf
+    :: Lexicon
+    -> (SgPl LexicalPhrase -> LexicalPhrase)
+    -> Prod r Text (Located Token) arg
+    -> Prod r Text (Located Token) (t VarSymbol)
+    -> Prod r Text (Located Token) (NounOf arg, t VarSymbol)
+nounOf lexicon proj arg vars =
+    (\(args1, xs, args2, pat) -> (Noun pat (args1 <> args2), xs)) <$> asum (fmap make pats) <?> "a noun"
+  where
+    pats = HM.keys (lexiconNouns lexicon)
+    make pat =
+        let (pat1, pat2) = splitOnVariableSlot (proj pat)
+        in (\args1 xs args2 -> (args1, xs, args2, pat)) <$> go pat1 <*> vars <*> go pat2
+    go = \case
+        Just w : ws -> token w *> go ws
+        Nothing : ws -> (:) <$> arg <*> go ws
+        [] -> pure []
+
+-- | Like 'nounOf', but for structure nouns; arguments are discarded.
+structNounOf
+    :: Lexicon
+    -> (SgPl LexicalPhrase -> LexicalPhrase)
+    -> Prod r Text (Located Token) arg
+    -> Prod r Text (Located Token) name
+    -> Prod r Text (Located Token) (StructPhrase, name)
+structNounOf lexicon proj arg name =
+    (\(_args1, xs, _args2, pat) -> (pat, xs)) <$> asum (fmap make pats) <?> "a structure noun"
+  where
+    pats = HM.keys (lexiconStructNouns lexicon)
+    make pat =
+        let (pat1, pat2) = splitOnVariableSlot (proj pat)
+        in (\args1 xs args2 -> (args1, xs, args2, pat)) <$> go pat1 <*> name <*> go pat2
+    go = \case
+        Just w : ws -> token w *> go ws
+        Nothing : ws -> (:) <$> arg <*> go ws
+        [] -> pure []
+
+
+-- | Parse any of the known mixfix operator patterns, collecting the
+-- variables occurring in its slots into a 'SymbolPattern'.
+symbolicPatternOf
+    :: forall r. [[(Holey (Prod r Text (Located Token) Token), Associativity)]]
+    -> Prod r Text (Located Token) VarSymbol
+    -> Grammar r (Prod r Text (Located Token) SymbolPattern)
+symbolicPatternOf ops varSymbol = rule $ asum
+    [ go op
+    | ops' <- ops
+    , (op, _assoc) <- ops'
+    ] <?> "a symbolic pattern"
+  where
+    go :: Holey (Prod r Text (Located Token) Token) -> Prod r Text (Located Token) SymbolPattern
+    go [] = pure $ SymbolPattern [] []
+    go (head : tail) = case head of
+        Just symb -> (\s (SymbolPattern op vs) -> SymbolPattern (Just s : op) vs) <$> symb <*> go tail
+        Nothing -> (\v (SymbolPattern op vs) -> SymbolPattern (Nothing : op) (v : vs)) <$> varSymbol <*> go tail
+
+
+makeNounPhrase
+    :: [AdjL]
+    -> (Noun, t VarSymbol)
+    -> [AdjR]
+    -> Maybe Stmt
+    -> NounPhrase t
+makeNounPhrase ls (n, vs) rs ms = NounPhrase ls n vs rs ms
+
+
+
+
+begin, end :: Text -> Prod r Text (Located Token) SourcePos
+begin kind = tokenPos (BeginEnv kind) <?> ("\"\\begin{" <> kind <> "}\"")
+end kind = tokenPos (EndEnv kind) <?> ("\"\\end{" <> kind <> "}\"")
+
+-- | Surround a production rule @body@ with an environment of a certain @kind@ requiring a marker specified in a @\\label@.
+-- Ignores the optional title after the beginning of the environment.
+envPos :: Text -> Prod r Text (Located Token) a -> Prod r Text (Located Token) (SourcePos, Marker, a)
+envPos kind body = do
+    p <- begin kind <?> ("start of a \"" <> kind <> "\" environment")
+    optional title
+    m <- label
+    a <- body <* end kind
+    pure (p, m, a)
+  where
+    title :: Prod r Text (Located Token) [Token]
+    title = bracket (many (unLocated <$> satisfy (\ltok -> unLocated ltok /= BracketR)))
+
+-- 'env_' is like 'env', but without allowing titles.
+--
+-- | Like 'envPos', but the @\\label@ marker is optional and discarded.
+envPos_ :: Text -> Prod r Text (Located Token) a -> Prod r Text (Located Token) (SourcePos, a)
+envPos_ kind body = (,) <$> begin kind <*> (optional label *> body) <* end kind
+
+-- | Like 'envPos_', but also discards the source position.
+env_ :: Text -> Prod r Text (Located Token) a -> Prod r Text (Located Token) a
+env_ kind body = begin kind *> optional label *> body <* end kind
+
+-- | A label specifying a marker for referencing via /@\\label{...}@/. Returns the marker text.
+label :: Prod r Text (Located Token) Marker
+label = label_ <?> "\"\\label{...}\""
+  where
+    label_ = terminal \ltok -> case unLocated ltok of
+        Label m -> Just (Marker m)
+        _tok -> Nothing
+
+-- | A reference via /@\\ref{...}@/. Returns the markers as text.
+ref :: Prod r Text (Located Token) (NonEmpty Marker)
+ref = terminal \ltok -> case unLocated ltok of
+    Ref ms -> Just (Marker <$> ms)
+    _tok -> Nothing
+
+-- | Wrap a production in math mode.
+math :: Prod r Text (Located Token) a -> Prod r Text (Located Token) a
+math body = beginMath *> body <* endMath
+
+-- | Wrap a production in a @\\text{...}@ environment.
+text :: Prod r Text (Located Token) a -> Prod r Text (Located Token) a
+text body = begin "text" *> body <* end "text" <?> "\"\\text{...}\""
+
+beginMath, endMath :: Prod r Text (Located Token) SourcePos
+beginMath = begin "math" <?> "start of a formula, e.g. \"$\""
+endMath = end "math" <?> "end of a formula, e.g. \"$\""
+
+paren :: Prod r Text (Located Token) a -> Prod r Text (Located Token) a
+paren body = token ParenL *> body <* token ParenR <?> "\"(...)\""
+
+bracket :: Prod r Text (Located Token) a -> Prod r Text (Located Token) a
+bracket body = token BracketL *> body <* token BracketR <?> "\"[...]\""
+
+brace :: Prod r Text (Located Token) a -> Prod r Text (Located Token) a
+brace body = token VisibleBraceL *> body <* token VisibleBraceR <?> "\"\\{...\\}\""
+
+group :: Prod r Text (Located Token) a -> Prod r Text (Located Token) a
+group body = token InvisibleBraceL *> body <* token InvisibleBraceR <?> "\"{...}\""
+
+align :: Prod r Text (Located Token) a -> Prod r Text (Located Token) a
+align body = begin "align*" *> body <* end "align*"
+
+
+-- Terminal recognizers: project a located token to the payload of the
+-- matching constructor, or 'Nothing' for any other token.
+
+maybeVarToken :: Located Token -> Maybe VarSymbol
+maybeVarToken ltok = case unLocated ltok of
+    Variable x -> Just (NamedVar x)
+    _tok -> Nothing
+
+maybeWordToken :: Located Token -> Maybe Text
+maybeWordToken ltok = case unLocated ltok of
+    Word n -> Just n
+    _tok -> Nothing
+
+maybeIntToken :: Located Token -> Maybe Int
+maybeIntToken ltok = case unLocated ltok of
+    Integer n -> Just n
+    _tok -> Nothing
+
+maybeCmdToken :: Located Token -> Maybe Text
+maybeCmdToken ltok = case unLocated ltok of
+    Command n -> Just n
+    _tok -> Nothing
+
+-- | Match exactly the TEX command of the given structure symbol.
+structSymbol :: StructSymbol -> Prod r Text (Located Token) StructSymbol
+structSymbol s@(StructSymbol c) = terminal \ltok -> case unLocated ltok of
+    Command c' | c == c' -> Just s
+    _ -> Nothing
+
+-- | Tokens that are allowed to appear in labels of environments.
+maybeTagToken :: Located Token -> Maybe Text
+maybeTagToken ltok = case unLocated ltok of
+    Symbol "'" -> Just "'"
+    -- NOTE(review): a hyphen maps to empty text — presumably so that
+    -- hyphenated words join up in markers; confirm intended.
+    Symbol "-" -> Just ""
+    _ -> maybeWordToken ltok
+
+
+-- | Match exactly the given token; the expected-item annotation is the
+-- token's textual rendering.
+token :: Token -> Prod r Text (Located Token) Token
+token tok = terminal maybeToken <?> tokToText tok
+  where
+    maybeToken ltok = case unLocated ltok of
+        tok' | tok == tok' -> Just tok
+        _ -> Nothing
+
+-- | Like 'token', but returns the source position of the match.
+tokenPos :: Token -> Prod r Text (Located Token) SourcePos
+tokenPos tok = terminal maybeToken <?> tokToText tok
+  where
+    maybeToken ltok = case unLocated ltok of
+        tok' | tok == tok' -> Just (startPos ltok)
+        _ -> Nothing
diff --git a/source/Syntax/Concrete/Keywords.hs b/source/Syntax/Concrete/Keywords.hs
new file mode 100644
index 0000000..e0f577e
--- /dev/null
+++ b/source/Syntax/Concrete/Keywords.hs
@@ -0,0 +1,222 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+{-|
+This module defines lots of keywords and various filler
+phrases. The prefix underscore indicates that we do not
+care about the parse result (analogous to discarding
+like @...; _ <- action; ...@ in do-notation). Moreover,
+this convention allows the use of short names that would
+otherwise be Haskell keywords or clash with other definitions.
+Care should be taken with introducing too many variants of
+a keyword, lest the grammar becomes needlessly ambiguous!
+
+The names are chosen using the following criteria:
+
+ * As short as possible (e.g.: @_since@ over @_because@).
+
+ * Sound like a keyword (e.g.: @_show@).
+
+This module also defines symbols that have special uses
+(such as @_colon@ for its use in type signatures).
+-}
+module Syntax.Concrete.Keywords where
+
+
+import Base
+import Syntax.Token
+
+import Text.Earley (Prod, (<?>), terminal)
+import Text.Megaparsec (SourcePos)
+
+infixr 0 ?
+-- | Variant of '<?>' for annotating literal tokens.
+(?) :: Prod r Text t a -> Text -> Prod r Text t a
+p ? e = p <?> ("\"" <> e <> "\"")
+
+-- | Match a single word token, returning its source position.
+word :: Text -> Prod r Text (Located Token) SourcePos
+word w = terminal maybeToken
+  where
+    maybeToken ltok = case unLocated ltok of
+        Word w' | w == w' -> Just (startPos ltok)
+        _ -> Nothing
+
+-- | Match a single symbol token, returning its source position.
+symbol :: Text -> Prod r Text (Located Token) SourcePos
+symbol s = terminal maybeToken
+  where
+    maybeToken ltok = case unLocated ltok of
+        Symbol s' | s == s' -> Just (startPos ltok)
+        _ -> Nothing
+
+-- | Match a single TEX command token, returning its source position.
+command :: Text -> Prod r Text (Located Token) SourcePos
+command cmd = terminal maybeToken
+  where
+    maybeToken ltok = case unLocated ltok of
+        Command cmd' | cmd == cmd' -> Just (startPos ltok)
+        _ -> Nothing
+
+-- | English arity words, "unary" (1) through "denary" (10).
+_arity :: Prod r Text (Located Token) Int
+_arity = asum
+    [ 1 <$ word "unary"
+    , 2 <$ word "binary"
+    , 3 <$ word "ternary"
+    , 4 <$ word "quaternary"
+    , 5 <$ word "quinary"
+    , 6 <$ word "senary"
+    , 7 <$ word "septenary"
+    , 8 <$ word "octonary"
+    , 9 <$ word "nonary"
+    , 10 <$ word "denary"
+    ] <?> "\"unary\", \"binary\", ..."
+
+-- * Keywords
+
+_an :: Prod r Text (Located Token) SourcePos
+_an = word "a" <|> word "an" <?> "indefinite article"
+_and :: Prod r Text (Located Token) SourcePos
+_and = word "and" ? "and"
+_are :: Prod r Text (Located Token) SourcePos
+_are = word "are" ? "are"
+_asFollows :: Prod r Text (Located Token) SourcePos
+_asFollows = word "as" <* word "follows" ? "as follows"
+_assumption :: Prod r Text (Located Token) SourcePos
+_assumption = word "assumption" ? "assumption"
+_be :: Prod r Text (Located Token) SourcePos
+_be = word "be" ? "be"
+_by :: Prod r Text (Located Token) SourcePos
+_by = word "by" ? "by"
+_bySetExt :: Prod r Text (Located Token) SourcePos
+_bySetExt = word "by" <* ((word "set" ? "set") <* word "extensionality") ? "by set extensionality"
+_can :: Prod r Text (Located Token) SourcePos
+_can = word "can" ? "can"
+_consistsOf :: Prod r Text (Located Token) SourcePos
+_consistsOf = word "consists" <* word "of" ? "consists of"
+_contradiction :: Prod r Text (Located Token) SourcePos
+_contradiction = optional (word "a") *> word "contradiction" ? "a contradiction"
+_define :: Prod r Text (Located Token) SourcePos
+_define = word "define" ? "define"
+_definition :: Prod r Text (Located Token) SourcePos
+_definition = word "definition" ? "definition"
+_denote :: Prod r Text (Located Token) SourcePos
+_denote = word "denote" <|> (word "stand" <* word "for") ? "denote"
+_denotes :: Prod r Text (Located Token) SourcePos
+_denotes = word "denotes" ? "denotes"
+_do :: Prod r Text (Located Token) SourcePos
+_do = word "do" ? "do"
+_does :: Prod r Text (Located Token) SourcePos
+_does = word "does" ? "does"
+_either :: Prod r Text (Located Token) SourcePos
+_either = word "either" ? "either"
+_equipped :: Prod r Text (Located Token) SourcePos
+_equipped = (word "equipped" <|> word "together") <* word "with" ? "equipped with"
+_every :: Prod r Text (Located Token) SourcePos
+_every = (word "every") ? "every"
+_exist :: Prod r Text (Located Token) SourcePos
+_exist = word "there" <* word "exist" ? "there exist"
+_exists :: Prod r Text (Located Token) SourcePos
+_exists = word "there" <* word "exists" ? "there exists"
+_extends :: Prod r Text (Located Token) SourcePos
+_extends = (_is) <|> (word "consists" <* word "of") ? "consists of"
+_fix :: Prod r Text (Located Token) SourcePos
+_fix = word "fix" ? "fix"
+_follows :: Prod r Text (Located Token) SourcePos
+_follows = word "follows" ? "follows"
+_for :: Prod r Text (Located Token) SourcePos
+_for = word "for" ? "for"
+_forAll :: Prod r Text (Located Token) SourcePos
+_forAll = (word "for" <* word "all") <|> word "all" ? "all"
+_forEvery :: Prod r Text (Located Token) SourcePos
+_forEvery = (word "for" <* word "every") <|> (word "every") ? "for every"
+_have :: Prod r Text (Located Token) SourcePos
+_have = word "we" <* word "have" <* optional (word "that") ? "we have"
+_if :: Prod r Text (Located Token) SourcePos
+_if = word "if" ? "if"
+_iff :: Prod r Text (Located Token) SourcePos
+_iff = word "iff" <|> (word "if" <* word "and" <* word "only" <* word "if") ? "iff"
+_inductively :: Prod r Text (Located Token) SourcePos
+_inductively = word "inductively" ? "inductively"
+_is :: Prod r Text (Located Token) SourcePos
+_is = word "is" ? "is"
+_itIsWrong :: Prod r Text (Located Token) SourcePos
+_itIsWrong = word "it" <* word "is" <* (word "not" <* word "the" <* word "case" <|> word "wrong") <* word "that" ? "it is wrong that"
+_let :: Prod r Text (Located Token) SourcePos
+_let = word "let" ? "let"
+_neither :: Prod r Text (Located Token) SourcePos
+_neither = word "neither" ? "neither"
+_no :: Prod r Text (Located Token) SourcePos
+_no = word "no" ? "no"
+_nor :: Prod r Text (Located Token) SourcePos
+_nor = word "nor" ? "nor"
+_not :: Prod r Text (Located Token) SourcePos
+_not = word "not" ? "not"
+_omitted :: Prod r Text (Located Token) SourcePos
+_omitted = word "omitted" ? "omitted"
+_on :: Prod r Text (Located Token) SourcePos
+_on = word "on" ? "on"
+_oneOf :: Prod r Text (Located Token) SourcePos
+_oneOf = word "one" <* word "of" ? "one of"
+_or :: Prod r Text (Located Token) SourcePos
+_or = word "or" ? "or"
+_particularly :: Prod r Text (Located Token) SourcePos
+_particularly = (word "particularly" <|> (word "in" *> word "particular")) <* _comma ? "particularly"
+_relation :: Prod r Text (Located Token) SourcePos
+_relation = word "relation" ? "relation"
+_satisfying :: Prod r Text (Located Token) SourcePos
+_satisfying = _suchThat <|> word "satisfying" ? "satisfying"
+_setOf :: Prod r Text (Located Token) SourcePos
+_setOf = word "set" <* word "of" ? "set of"
+_show :: Prod r Text (Located Token) SourcePos
+_show = optional (word "first" <|> word "finally" <|> word "next" <|> word "now") *> optional (word "we") *> word "show" <* optional (word "that")
+_since :: Prod r Text (Located Token) SourcePos
+_since = word "since" <|> word "because" ?
"since" +_some :: Prod r Text (Located Token) SourcePos +_some = word "some" ? "some" +_suchThat :: Prod r Text (Located Token) SourcePos +_suchThat = ((word "such" <* word "that") <|> (word "s" <* _dot <* word "t" <* _dot)) ? "such that" +_sufficesThat :: Prod r Text (Located Token) SourcePos +_sufficesThat = word "it" <* word "suffices" <* word "to" <* word "show" <* word "that" ? "it suffices to show" +_suppose :: Prod r Text (Located Token) SourcePos +_suppose = (word "suppose" <|> word "assume") <* optional (word "that") ? "assume" +_take :: Prod r Text (Located Token) SourcePos +_take = word "take" <|> word "consider" ? "take" +_that :: Prod r Text (Located Token) SourcePos +_that = word "that" ? "that" +_the :: Prod r Text (Located Token) SourcePos +_the = word "the" ? "the" +_then :: Prod r Text (Located Token) SourcePos +_then = word "then" ? "then" +_throughout :: Prod r Text (Located Token) SourcePos +_throughout = word "throughout" <* optional (word "this" <* word "section") <* optional _comma <|> (word "in" <* word "the" <* word "sequel") ? "throughout" +_thus :: Prod r Text (Located Token) SourcePos +_thus = word "thus" <|> word "then" <|> word "hence" <|> word "now" <|> word "finally" <|> word "therefore" ? "thus" +_trivial :: Prod r Text (Located Token) SourcePos +_trivial = word "straightforward" <|> word "trivial" ? "trivial" +_unique :: Prod r Text (Located Token) SourcePos +_unique = word "unique" ? "unique" +_write :: Prod r Text (Located Token) SourcePos +_write = (optional (word "we") *> word "say" <* optional (word "that")) <|> (optional (word "we") *> word "write") ? "write" + +-- | Introducing plain claims in proofs. +_haveIntro :: Prod r Text (Located Token) SourcePos +_haveIntro = _thus <|> _particularly <|> _have + +-- * Symbols + +_colon :: Prod r Text (Located Token) SourcePos +_colon = symbol ":" ? ":" +_pipe :: Prod r Text (Located Token) SourcePos +_pipe = symbol "|" <|> command "mid" ? 
"\\mid"
_comma :: Prod r Text (Located Token) SourcePos
_comma = symbol "," ? ","
_commaAnd :: Prod r Text (Located Token) SourcePos
_commaAnd = symbol "," <* optional (word "and") ? ", and"
_commaOr :: Prod r Text (Located Token) SourcePos
_commaOr = symbol "," <* optional (word "or") ? ", or"
_defeq :: Prod r Text (Located Token) SourcePos
_defeq = symbol ":=" ? ":=" -- Should use `\coloneq` from unicode-math as display.
_dot :: Prod r Text (Located Token) SourcePos
_dot = symbol "." ? "."
_eq :: Prod r Text (Located Token) SourcePos
_eq = symbol "=" ? "="
_in :: Prod r Text (Located Token) SourcePos
_in = symbol "∈" <|> command "in" ? "\\in"
_subseteq :: Prod r Text (Located Token) SourcePos
_subseteq = command "subseteq" ? "\\subseteq" -- fixed: label said ":", copy-pasted from _colon
diff --git a/source/Syntax/Import.hs b/source/Syntax/Import.hs
new file mode 100644
index 0000000..33082f1
--- /dev/null
+++ b/source/Syntax/Import.hs
@@ -0,0 +1,43 @@
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE ApplicativeDo #-}


module Syntax.Import where


import Base

import Text.Regex.Applicative.Text
import Data.CharSet qualified as CharSet
import Data.Char (isAlphaNum)

-- | Extract the theory paths named by @\import{...}@ lines at the
-- start of a document (everything before the first @\begin{...}@).
gatherImports :: Text -> [FilePath]
gatherImports s = case findFirstPrefix imps s of
  Nothing -> []
  Just (paths, _) -> paths


imps :: RE Char [FilePath]
imps = do
  mpath <- optional imp
  paths <- many (few anySym *> string "\n" *> imp)
  few anySym
  begin
  pure case mpath of
    Nothing -> paths
    Just path -> path : paths


imp :: RE Char FilePath
imp = do
  -- Requiring a newline makes it possible to comment imports out.
+ string "\\import{" + path <- few (psym isTheoryNameChar) + sym '}' + pure path + +isTheoryNameChar :: Char -> Bool +isTheoryNameChar c = isAlphaNum c || c `CharSet.member` CharSet.fromList ".-_/" + +begin :: RE Char Text +begin = string "\\begin{" diff --git a/source/Syntax/Internal.hs b/source/Syntax/Internal.hs new file mode 100644 index 0000000..44603ad --- /dev/null +++ b/source/Syntax/Internal.hs @@ -0,0 +1,612 @@ +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE DeriveTraversable #-} +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE StandaloneDeriving #-} +{-# LANGUAGE TemplateHaskell #-} + +-- | Data types for the internal (semantic) syntax tree. +module Syntax.Internal + ( module Syntax.Internal + , module Syntax.Abstract + , module Syntax.LexicalPhrase + , module Syntax.Token + ) where + + +import Base +import Syntax.Lexicon (pattern PairSymbol, pattern ConsSymbol) +import Syntax.LexicalPhrase (LexicalPhrase, SgPl(..), unsafeReadPhrase, unsafeReadPhraseSgPl) +import Syntax.Token (Token(..)) + +import Syntax.Abstract + ( Chain(..) + , Connective(..) + , VarSymbol(..) + , FunctionSymbol + , RelationSymbol + , StructSymbol (..) + , Relation + , VarSymbol(..) + , PropositionalConstant(..) + , StructPhrase + , Justification(..) + , Marker(..) + , pattern CarrierSymbol, pattern ConsSymbol + ) + +import Bound +import Bound.Scope +import Data.Deriving (deriveShow1, deriveEq1, deriveOrd1) +import Data.Hashable.Lifted +import Data.HashMap.Strict qualified as HM +import Data.Maybe +import Data.Set qualified as Set +import Data.List qualified as List +import Data.List.NonEmpty qualified as NonEmpty +import Text.Megaparsec.Pos (SourcePos) + +-- | 'Symbol's can be used as function and relation symbols. 
+data Symbol + = SymbolMixfix FunctionSymbol + | SymbolFun (SgPl LexicalPhrase) + | SymbolInteger Int + | SymbolPredicate Predicate + deriving (Show, Eq, Ord, Generic, Hashable) + + +data Predicate + = PredicateAdj LexicalPhrase + | PredicateVerb (SgPl LexicalPhrase) + | PredicateNoun (SgPl LexicalPhrase) -- ^ /@\<...\> is a \<...\>@/. + | PredicateRelation RelationSymbol + | PredicateSymbol Text + | PredicateNounStruct (SgPl LexicalPhrase) -- ^ /@\<...\> is a \<...\>@/. + deriving (Show, Eq, Ord, Generic, Hashable) + + +data Quantifier + = Universally + | Existentially + deriving (Show, Eq, Ord, Generic, Hashable) + +type Formula = Term +type Term = Expr +type Expr = ExprOf VarSymbol + + +-- | Internal higher-order expressions. +data ExprOf a + = TermVar a + -- ^ Fresh constants disjoint from all user-named identifiers. + -- These can be used to eliminate higher-order constructs. + -- + | TermSymbol Symbol [ExprOf a] + -- ^ Application of a symbol (including function and predicate symbols). + | TermSymbolStruct StructSymbol (Maybe (ExprOf a)) + -- + | Apply (ExprOf a) (NonEmpty (ExprOf a)) + -- ^ Higher-order application. + -- + | TermSep VarSymbol (ExprOf a) (Scope () ExprOf a) + -- ^ Set comprehension using seperation, e.g.: /@{ x ∈ X | P(x) }@/. + -- + | Iota VarSymbol (Scope () ExprOf a) + -- ^ Definite descriptor. + -- + | ReplacePred VarSymbol VarSymbol (ExprOf a) (Scope ReplacementVar ExprOf a) + -- ^ Replacement for single-valued predicates. The concrete syntax for these + -- syntactically requires a bounded existential quantifier in the condition: + -- + -- /@$\\{ y | \\exists x\\in A. P(x,y) \\}$@/ + -- + -- In definitions the single-valuedness of @P@ becomes a proof obligation. + -- In other cases we could instead add it as constraint + -- + -- /@$b\\in \\{ y | \\exists x\\in A. P(x,y) \\}$@/ + -- /@iff@/ + -- /@$\\exists x\\in A. 
P(x,y)$ and $P$ is single valued@/ + -- + -- + | ReplaceFun (NonEmpty (VarSymbol, ExprOf a)) (Scope VarSymbol ExprOf a) (Scope VarSymbol ExprOf a) + -- ^ Set comprehension using functional replacement, + -- e.g.: /@{ f(x, y) | x ∈ X; y ∈ Y; P(x, y) }@/. + -- The list of pairs gives the domains, the integers in the scope point to list indices. + -- The first scope is the lhs, the optional scope can be used for additional constraints + -- on the variables (i.e. implicit separation over the product of the domains). + -- An out-of-bound index is an error, since otherwise replacement becomes unsound. + -- + | Connected Connective (ExprOf a) (ExprOf a) + | Lambda (Scope VarSymbol ExprOf a) + | Quantified Quantifier (Scope VarSymbol ExprOf a) + | PropositionalConstant PropositionalConstant + | Not (ExprOf a) + deriving (Functor, Foldable, Traversable) + +data ReplacementVar = ReplacementDomVar | ReplacementRangeVar deriving (Show, Eq, Ord, Generic, Hashable) + +makeBound ''ExprOf + +deriveShow1 ''ExprOf +deriveEq1 ''ExprOf +deriveOrd1 ''ExprOf + +deriving instance Show a => Show (ExprOf a) +deriving instance Eq a => Eq (ExprOf a) +deriving instance Ord a => Ord (ExprOf a) + +deriving instance Generic (ExprOf a) +deriving instance Generic1 ExprOf + +deriving instance Hashable1 ExprOf + +deriving instance Hashable a => Hashable (ExprOf a) + + +abstractVarSymbol :: VarSymbol -> ExprOf VarSymbol -> Scope VarSymbol ExprOf VarSymbol +abstractVarSymbol x = abstract (\y -> if x == y then Just x else Nothing) + +abstractVarSymbols :: Foldable t => t VarSymbol -> ExprOf VarSymbol -> Scope VarSymbol ExprOf VarSymbol +abstractVarSymbols xs = abstract (\y -> if y `elem` xs then Just y else Nothing) + +-- | Use the given set of in scope structures to cast them to their carriers +-- when occurring on the rhs of the element relation. +-- Use the given 'Map' to annotate (unannotated) structure operations +-- with the most recent inscope appropriate label. 
+annotateWith :: Set VarSymbol -> HashMap StructSymbol VarSymbol -> Formula -> Formula +annotateWith = go + where + go :: (Ord a) => Set a -> HashMap StructSymbol a -> ExprOf a -> ExprOf a + go labels ops = \case + TermSymbolStruct symb Nothing -> + TermSymbolStruct symb (TermVar <$> HM.lookup symb ops) + e@TermSymbolStruct{} -> + e + a `IsElementOf` TermVar x | x `Set.member` labels -> + go labels ops a `IsElementOf` TermSymbolStruct CarrierSymbol (Just (TermVar x)) + Not a -> + Not (go labels ops a) + Connected conn a b -> + Connected conn (go labels ops a) (go labels ops b) + Quantified quant body -> + Quantified quant (toScope (go (Set.map F labels) (F <$> ops) (fromScope body))) + e@TermVar{} -> e + TermSymbol symb args -> + TermSymbol symb (go labels ops <$> args) + Apply e1 args -> + Apply (go labels ops e1) (go labels ops <$> args) + TermSep vs e scope -> + TermSep vs (go labels ops e) (toScope (go (Set.map F labels) (F <$> ops) (fromScope scope))) + Iota x body -> + Iota x (toScope (go (Set.map F labels) (F <$> ops) (fromScope body))) + ReplacePred y x xB scope -> + ReplacePred y x (go labels ops xB) (toScope (go (Set.map F labels) (F <$> ops) (fromScope scope))) + ReplaceFun bounds ap cond -> + ReplaceFun + (fmap (\(x, e) -> (x, go labels ops e)) bounds) + (toScope (go (Set.map F labels) (F <$> ops) (fromScope ap))) + (toScope (go (Set.map F labels) (F <$> ops) (fromScope cond))) + Lambda body -> + Lambda (toScope (go (Set.map F labels) (F <$> ops) (fromScope body))) + e@PropositionalConstant{} -> e + +containsHigherOrderConstructs :: ExprOf a -> Bool +containsHigherOrderConstructs = \case + TermSep {} -> True + Iota{} -> True + ReplacePred{}-> True + ReplaceFun{}-> True + Lambda{} -> True + Apply{} -> False -- FIXME: this is a lie in general; we need to add sortchecking to determine this. 
+ TermVar{} -> False + PropositionalConstant{} -> False + TermSymbol _ es -> any containsHigherOrderConstructs es + Not e -> containsHigherOrderConstructs e + Connected _ e1 e2 -> containsHigherOrderConstructs e1 || containsHigherOrderConstructs e2 + Quantified _ scope -> containsHigherOrderConstructs (fromScope scope) + TermSymbolStruct _ _ -> False + +pattern TermOp :: FunctionSymbol -> [ExprOf a] -> ExprOf a +pattern TermOp op es = TermSymbol (SymbolMixfix op) es + +pattern TermConst :: Token -> ExprOf a +pattern TermConst c = TermOp [Just c] [] + +pattern TermPair :: ExprOf a -> ExprOf a -> ExprOf a +pattern TermPair e1 e2 = TermOp PairSymbol [e1, e2] + +pattern Atomic :: Predicate -> [ExprOf a] -> ExprOf a +pattern Atomic symbol args = TermSymbol (SymbolPredicate symbol) args + + +pattern FormulaAdj :: ExprOf a -> LexicalPhrase -> [ExprOf a] -> ExprOf a +pattern FormulaAdj e adj es = Atomic (PredicateAdj adj) (e:es) + +pattern FormulaVerb :: ExprOf a -> SgPl LexicalPhrase -> [ExprOf a] -> ExprOf a +pattern FormulaVerb e verb es = Atomic (PredicateVerb verb) (e:es) + +pattern FormulaNoun :: ExprOf a -> SgPl LexicalPhrase -> [ExprOf a] -> ExprOf a +pattern FormulaNoun e noun es = Atomic (PredicateNoun noun) (e:es) + +relationNoun :: Expr -> Formula +relationNoun arg = FormulaNoun arg (unsafeReadPhraseSgPl "relation[/s]") [] + +rightUniqueAdj :: Expr -> Formula +rightUniqueAdj arg = FormulaAdj arg (unsafeReadPhrase "right-unique") [] + +-- | Untyped quantification. 
+pattern Forall, Exists :: Scope VarSymbol ExprOf a -> ExprOf a +pattern Forall scope = Quantified Universally scope +pattern Exists scope = Quantified Existentially scope + +makeForall, makeExists :: Foldable t => t VarSymbol -> Formula -> Formula +makeForall xs e = Quantified Universally (abstractVarSymbols xs e) +makeExists xs e = Quantified Existentially (abstractVarSymbols xs e) + +instantiateSome :: NonEmpty VarSymbol -> Scope VarSymbol ExprOf VarSymbol -> Scope VarSymbol ExprOf VarSymbol +instantiateSome xs scope = toScope (instantiateEither inst scope) + where + inst (Left x) | x `elem` xs = TermVar (F x) + inst (Left b) = TermVar (B b) + inst (Right fv) = TermVar (F fv) + +-- | Bind all free variables not occuring in the given set universally +forallClosure :: Set VarSymbol -> Formula -> Formula +forallClosure xs phi = if isClosed phi + then phi + else Quantified Universally (abstract isNamedVar phi) + where + isNamedVar :: VarSymbol -> Maybe VarSymbol + isNamedVar x = if x `Set.member` xs then Nothing else Just x + +freeVars :: ExprOf VarSymbol -> Set VarSymbol +freeVars = Set.fromList . toList + +pattern And :: ExprOf a -> ExprOf a -> ExprOf a +pattern And e1 e2 = Connected Conjunction e1 e2 + +pattern Or :: ExprOf a -> ExprOf a -> ExprOf a +pattern Or e1 e2 = Connected Disjunction e1 e2 + +pattern Implies :: ExprOf a -> ExprOf a -> ExprOf a +pattern Implies e1 e2 = Connected Implication e1 e2 + +pattern Iff :: ExprOf a -> ExprOf a -> ExprOf a +pattern Iff e1 e2 = Connected Equivalence e1 e2 + +pattern Xor :: ExprOf a -> ExprOf a -> ExprOf a +pattern Xor e1 e2 = Connected ExclusiveOr e1 e2 + + +pattern Bottom :: ExprOf a +pattern Bottom = PropositionalConstant IsBottom + +pattern Top :: ExprOf a +pattern Top = PropositionalConstant IsTop + + +pattern Relation :: RelationSymbol -> [ExprOf a] -> ExprOf a +pattern Relation rel es = Atomic (PredicateRelation rel) es + +-- | Membership. 
+pattern IsElementOf :: ExprOf a -> ExprOf a -> ExprOf a +pattern IsElementOf e1 e2 = Atomic (PredicateRelation (Command "in")) (e1 : [e2]) + +-- | Membership. +pattern IsNotElementOf :: ExprOf a -> ExprOf a -> ExprOf a +pattern IsNotElementOf e1 e2 = Not (IsElementOf e1 e2) + +-- | Subset relation (non-strict). +pattern IsSubsetOf :: ExprOf a -> ExprOf a -> ExprOf a +pattern IsSubsetOf e1 e2 = Atomic (PredicateRelation (Command "subseteq")) (e1 : [e2]) + +-- | Ordinal predicate. +pattern IsOrd :: ExprOf a -> ExprOf a +pattern IsOrd e1 = Atomic (PredicateNoun (SgPl [Just "ordinal"] [Just "ordinals"])) [e1] + +-- | Equality. +pattern Equals :: ExprOf a -> ExprOf a -> ExprOf a +pattern Equals e1 e2 = Atomic (PredicateRelation (Symbol "=")) (e1 : [e2]) + +-- | Disequality. +pattern NotEquals :: ExprOf a -> ExprOf a -> ExprOf a +pattern NotEquals e1 e2 = Atomic (PredicateRelation (Command "neq")) (e1 : [e2]) + +pattern EmptySet :: ExprOf a +pattern EmptySet = TermSymbol (SymbolMixfix [Just (Command "emptyset")]) [] + +makeConjunction :: [ExprOf a] -> ExprOf a +makeConjunction = \case + [] -> Top + es -> List.foldl1' And es + +makeDisjunction :: [ExprOf a] -> ExprOf a +makeDisjunction = \case + [] -> Bottom + es -> List.foldl1' Or es + +finiteSet :: NonEmpty (ExprOf a) -> ExprOf a +finiteSet = foldr cons EmptySet + where + cons x y = TermSymbol (SymbolMixfix ConsSymbol) [x, y] + +isPositive :: ExprOf a -> Bool +isPositive = \case + Not _ -> False + _ -> True + +dual :: ExprOf a -> ExprOf a +dual = \case + Not f -> f + f -> Not f + + + +-- | Local assumptions. 
+data Asm + = Asm Formula + | AsmStruct VarSymbol StructPhrase + + +deriving instance Show Asm +deriving instance Eq Asm +deriving instance Ord Asm + +data StructAsm + = StructAsm VarSymbol StructPhrase + + + +data Axiom = Axiom [Asm] Formula + +deriving instance Show Axiom +deriving instance Eq Axiom +deriving instance Ord Axiom + + +data Lemma = Lemma [Asm] Formula + +deriving instance Show Lemma +deriving instance Eq Lemma +deriving instance Ord Lemma + + +data Defn + = DefnPredicate [Asm] Predicate (NonEmpty VarSymbol) Formula + | DefnFun [Asm] (SgPl LexicalPhrase) [VarSymbol] Term + | DefnOp FunctionSymbol [VarSymbol] Term + +deriving instance Show Defn +deriving instance Eq Defn +deriving instance Ord Defn + +data Inductive = Inductive + { inductiveSymbol :: FunctionSymbol + , inductiveParams :: [VarSymbol] + , inductiveDomain :: Expr + , inductiveIntros :: NonEmpty IntroRule + } + deriving (Show, Eq, Ord) + +data IntroRule = IntroRule + { introConditions :: [Formula] -- The inductively defined set may only appear as an argument of monotone operations on the rhs. + , introResult :: Formula -- TODO Refine. + } + deriving (Show, Eq, Ord) + + +data Proof + = Omitted + -- ^ Ends a proof without further verification. + -- This results in a “gap” in the formalization. + | Qed Justification + -- ^ Ends of a proof, leaving automation to discharge the current goal using the given justification. + | ByContradiction Proof + -- ^ Take the dual of the current goal as an assumption and + -- set the goal to absurdity. + | BySetInduction (Maybe Term) Proof + -- ^ ∈-induction. + | ByOrdInduction Proof + -- ^ Transfinite induction for ordinals. + | Assume Formula Proof + -- ^ Simplify goals that are implications or disjunctions. + | Fix (NonEmpty VarSymbol) Formula Proof + -- ^ Simplify universal goals (with an optional bound or such that statement) + | Take (NonEmpty VarSymbol) Formula Justification Proof + -- ^ Use existential assumptions. 
+ | Suffices Formula Justification Proof + | ByCase [Case] + -- ^ Proof by case. Disjunction of the case hypotheses 'Case' + -- must hold for this step to succeed. Each case starts a subproof, + -- keeping the same goal but adding the case hypothesis as an assumption. + -- Often this will be a classical split between /@P@/ and /@not P@/, in + -- which case the proof that /@P or not P@/ holds is easy. + -- + | Have Formula Justification Proof + -- ^ An affirmation, e.g.: /@We have \ by \@/. + -- + | Calc Calc Proof + | Subclaim Formula Proof Proof + -- ^ A claim is a sublemma with its own proof: + -- + -- /@Show \. \. \.@/ + -- + -- A successful first proof adds the claimed formula as an assumption + -- for the remaining proof. + -- + | Define VarSymbol Term Proof + | DefineFunction VarSymbol VarSymbol Term Term Proof + +deriving instance Show Proof +deriving instance Eq Proof +deriving instance Ord Proof + + + +-- | A case of a case split. +data Case = Case + { caseOf :: Formula + , caseProof :: Proof + } + +deriving instance Show Case +deriving instance Eq Case +deriving instance Ord Case + +-- | See 'Syntax.Abstract.Calc'. 
data Calc
  = Equation Term (NonEmpty (Term, Justification))
  | Biconditionals Term (NonEmpty (Term, Justification))

deriving instance Show Calc
deriving instance Eq Calc
deriving instance Ord Calc

-- | The single formula a calculation establishes overall:
-- the first term related to the last.
calcResult :: Calc -> ExprOf VarSymbol
calcResult = \case
  Equation e eqns -> e `Equals` fst (NonEmpty.last eqns)
  Biconditionals phi phis -> phi `Iff` fst (NonEmpty.last phis)

-- | The individual proof steps of a calculation, one per link
-- of the chain, each paired with its justification.
calculation :: Calc -> [(ExprOf VarSymbol, Justification)]
calculation = \case
  Equation e1 eqns@((e2, jst) :| _) -> (e1 `Equals` e2, jst) : collectEquations (toList eqns)
  Biconditionals p1 ps@((p2, jst) :| _) -> (p1 `Iff` p2, jst) : collectBiconditionals (toList ps)


collectEquations :: [(ExprOf a, b)] -> [(ExprOf a, b)]
collectEquations = \case
  (e1, _) : eqns'@((e2, jst) : _) -> (e1 `Equals` e2, jst) : collectEquations eqns'
  _ -> []

collectBiconditionals :: [(ExprOf a, b)] -> [(ExprOf a, b)]
collectBiconditionals = \case
  -- fixed: the recursive call previously went to 'collectEquations',
  -- so every link after the first of a biconditional chain was
  -- emitted as an 'Equals' proposition instead of an 'Iff'.
  (p1, _) : ps@((p2, jst) : _) -> (p1 `Iff` p2, jst) : collectBiconditionals ps
  _ -> []


newtype Datatype
  = DatatypeFin (NonEmpty Text)
  deriving (Show, Eq, Ord)


data Signature
  = SignaturePredicate Predicate (NonEmpty VarSymbol)
  | SignatureFormula Formula -- TODO: Reconsider, this is pretty lossy.

deriving instance Show Signature
deriving instance Eq Signature
deriving instance Ord Signature

data StructDefn = StructDefn
  { structPhrase :: StructPhrase
  -- ^ The noun phrase naming the structure, e.g.: @partial order@ or @abelian group@.
  , structParents :: Set StructPhrase
  , structDefnLabel :: VarSymbol
  , structDefnFixes :: Set StructSymbol
  -- ^ List of commands representing operations,
  -- e.g.: @\\contained@ or @\\inv@. These are used as default operation names
  -- in instantiations such as @Let $G$ be a group@.
  -- The commands should be set up to handle an optional struct label
  -- which would typically be rendered as a sub- or superscript, e.g.:
  -- @\\contained[A]@ could render as ”⊑ᴬ“.
+ -- -- + , structDefnAssumes :: [(Marker, Formula)] + -- ^ The assumption or axioms of the structure. + -- To be instantiate with the @structFixes@ of a given structure. + } + +deriving instance Show StructDefn +deriving instance Eq StructDefn +deriving instance Ord StructDefn + + +data Abbreviation + = Abbreviation Symbol (Scope Int ExprOf Void) + deriving (Show, Eq, Ord) + +data Block + = BlockAxiom SourcePos Marker Axiom + | BlockLemma SourcePos Marker Lemma + | BlockProof SourcePos Proof + | BlockDefn SourcePos Marker Defn + | BlockAbbr SourcePos Marker Abbreviation + | BlockStruct SourcePos Marker StructDefn + | BlockInductive SourcePos Marker Inductive + | BlockSig SourcePos [Asm] Signature + deriving (Show, Eq, Ord) + + +data Task = Task + { taskDirectness :: Directness + , taskHypotheses :: [(Marker, Formula)] -- ^ No guarantees on order. + , taskConjectureLabel :: Marker + , taskConjecture :: Formula + } deriving (Show, Eq, Generic, Hashable) + + +-- | Indicates whether a given proof is direct or indirect. +-- An indirect proof (i.e. a proof by contradiction) may +-- cause an ATP to emit a warning about contradictory axioms. +-- When we know that the proof is indirect, we want to ignore +-- this warning. For relevance filtering we also want to know +-- what our actual goal is, so we keep the original conjecture. +data Directness + = Indirect Formula -- ^ The former conjecture. + | Direct + deriving (Show, Eq, Generic, Hashable) + +isIndirect :: Task -> Bool +isIndirect task = case taskDirectness task of + Indirect _ -> True + Direct -> False + + +-- | Boolean contraction of a task. +contractionTask :: Task -> Task +contractionTask task = task + { taskHypotheses = mapMaybe contract (taskHypotheses task) + , taskConjecture = contraction (taskConjecture task) + } + +contract :: (Marker, Formula) -> Maybe (Marker, Formula) +contract (m, phi) = case contraction phi of + Top -> Nothing + phi' -> Just (m, phi') + + + +-- | Full boolean contraction. 
+contraction :: ExprOf a -> ExprOf a +contraction = \case + Connected conn f1 f2 -> atomicContraction (Connected conn (contraction f1) (contraction f2)) + Quantified quant scope -> atomicContraction (Quantified quant (hoistScope contraction scope)) + Not f -> Not (contraction f) + f -> f + + +-- | Atomic boolean contraction. +atomicContraction :: ExprOf a -> ExprOf a +atomicContraction = \case + Top `Iff` f -> f + Bottom `Iff` f -> Not f + f `Iff` Top -> f + f `Iff` Bottom -> Not f + + Top `Implies` f -> f + Bottom `Implies` _ -> Top + _ `Implies` Top -> Top + f `Implies` Bottom -> Not f + + Top `And` f -> f + Bottom `And` _ -> Bottom + f `And` Top -> f + _ `And` Bottom -> Bottom + + phi@(Quantified _quant scope) -> case unscope scope of + Top -> Top + Bottom -> Bottom + _ -> phi + + Not Top -> Bottom + Not Bottom -> Top + + f -> f diff --git a/source/Syntax/LexicalPhrase.hs b/source/Syntax/LexicalPhrase.hs new file mode 100644 index 0000000..1743255 --- /dev/null +++ b/source/Syntax/LexicalPhrase.hs @@ -0,0 +1,93 @@ +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE NoImplicitPrelude #-} + +module Syntax.LexicalPhrase where + + +import Base +import Syntax.Token (Token(..)) + +import Data.Char (isAlpha) +import Data.Text qualified as Text +import Text.Earley.Mixfix (Holey) +import Text.Earley (Grammar, Prod, (), fullParses, parser, rule, token, satisfy) + + + +-- | 'LexicalPhrase's should be nonempty lists with at least one proper word token. +-- Hyphens and quotes in words are treated as letters. +-- Thus /@manifold-with-boundary@/ is a singleton lexical phrase (one word). +-- +type LexicalPhrase = Holey Token + +-- MAYBE Add this instance by making LexicalPhrase a proper Type? +-- Until then we can use the default instance for lists of prettyprintable things. 
+-- +-- instance Pretty LexicalPhrase where +-- pretty components = hsep (prettyComponent <$> components) +-- where +-- prettyComponent = \case +-- Nothing -> "_" +-- Just tok -> pretty tok + + + +-- | Split data by grammatical number (singular/plural). +-- The 'Eq' and 'Ord' instances only consider the singular +-- form so that we can prefer known irregular plurals over +-- guessed irregular plurals when inserting items into +-- the 'Lexicon'. +data SgPl a + = SgPl {sg :: a, pl :: a} + deriving (Show, Functor, Generic, Hashable) + +instance Eq a => Eq (SgPl a) where (==) = (==) `on` sg +instance Ord a => Ord (SgPl a) where compare = compare `on` sg + + +unsafeReadPhrase :: String -> LexicalPhrase +unsafeReadPhrase spec = case fst (fullParses (parser lexicalPhraseSpec) spec) of + pat : _ -> pat + _ -> error "unsafeReadPhrase failed" + +unsafeReadPhraseSgPl :: String -> SgPl LexicalPhrase +unsafeReadPhraseSgPl spec = case fst (fullParses (parser lexicalPhraseSpecSgPl) spec) of + pat : _ -> pat + _ -> error "unsafeReadPhraseSgPl failed" + + +lexicalPhraseSpec :: Grammar r (Prod r String Char LexicalPhrase) +lexicalPhraseSpec = do + hole <- rule $ Nothing <$ token '?' "hole" + word <- rule $ Just <$> many (satisfy (\c -> isAlpha c || c == '-')) + space <- rule $ Just . (:[]) <$> token ' ' + segment <- rule $ hole <|> word + rule $ (\s ss -> makePhrase (s:ss)) <$> segment <*> many (space *> segment) + where + makePhrase :: [Maybe String] -> LexicalPhrase + makePhrase pat = fmap makeWord pat + + +lexicalPhraseSpecSgPl :: Grammar r (Prod r String Char (SgPl LexicalPhrase)) +lexicalPhraseSpecSgPl = do + space <- rule $ Just . (:[]) <$> token ' ' + hole <- rule $ (Nothing, Nothing) <$ token '?' "hole" + + word <- rule (many (satisfy isAlpha) "word") + wordSgPl <- rule $ (,) <$> (token '[' *> word) <* token '/' <*> word <* token ']' + complexWord <- rule $ (\(a,b) -> (Just a, Just b)) . 
fuse <$> + many ((<>) <$> (dup <$> word) <*> wordSgPl) "word" + segment <- rule (hole <|> (dup . Just <$> word) <|> complexWord ) + rule $ (\s ss -> makePhrase (s:ss)) <$> segment <*> many (space *> segment) + where + dup x = (x,x) + fuse = \case + (a, b) : (c, d) : rest -> fuse ((a <> c, b <> d) : rest) + [(a, b)] -> (a, b) + _ -> error "Syntax.Abstract.fuse" + + makePhrase :: [(Maybe String, Maybe String)] -> SgPl LexicalPhrase + makePhrase = (\(patSg, patPl) -> SgPl (fmap makeWord patSg) (fmap makeWord patPl)) . unzip + +makeWord :: Maybe String -> Maybe Token +makeWord = fmap (Word . Text.pack) diff --git a/source/Syntax/Lexicon.hs b/source/Syntax/Lexicon.hs new file mode 100644 index 0000000..805b875 --- /dev/null +++ b/source/Syntax/Lexicon.hs @@ -0,0 +1,275 @@ +{-# LANGUAGE NoImplicitPrelude #-} + +-- | The 'Lexicon' describes the part of the grammar that extensible/dynamic. +-- +-- The items of the 'Lexicon' are organized by their meaning and their +-- syntactic behaviour. They are typically represented as some kind of +-- pattern data which is then used to generate various production rules +-- for the concrete grammar. This representation makes inspection and +-- extension easier. 
+-- + +module Syntax.Lexicon + ( module Syntax.Lexicon + , pattern ConsSymbol + , pattern PairSymbol + , pattern CarrierSymbol + , pattern ApplySymbol + , pattern DomSymbol + ) where + + +import Base +import Syntax.Abstract + +import Data.List qualified as List +import Data.Sequence qualified as Seq +import Data.HashSet qualified as Set +import Data.HashMap.Strict qualified as HM +import Data.Text qualified as Text +import Text.Earley.Mixfix (Holey, Associativity(..)) + + +data Lexicon = Lexicon + { lexiconMixfix :: Seq (HashMap (Holey Token) (Associativity, Marker)) + , lexiconConnectives :: [[(Holey Token, Associativity)]] + , lexiconPrefixPredicates :: LexicalItems PrefixPredicate + , lexiconStructFun :: LexicalItems StructSymbol + , lexiconRelationSymbols :: LexicalItems RelationSymbol + , lexiconVerbs :: LexicalItems (SgPl LexicalPhrase) + , lexiconAdjLs :: LexicalItems LexicalPhrase + , lexiconAdjRs :: LexicalItems LexicalPhrase + , lexiconNouns :: LexicalItems (SgPl LexicalPhrase) + , lexiconStructNouns :: LexicalItems (SgPl LexicalPhrase) + , lexiconFuns :: LexicalItems (SgPl LexicalPhrase) + } deriving (Show, Eq) + +-- Projection returning the union of both left and right attributes. 
+-- +lexiconAdjs :: Lexicon -> HashMap LexicalPhrase Marker +lexiconAdjs lexicon = lexiconAdjLs lexicon <> lexiconAdjRs lexicon + +lookupOp :: FunctionSymbol -> Seq (HashMap FunctionSymbol (assoc, Marker)) -> Either String Marker +lookupOp f ops = case snd <$> firstJust (HM.lookup f) ops of + Just m -> Right m + Nothing -> Left (show f) + +lookupLexicalItem :: (Hashable a, Show a) => a -> LexicalItems a -> Either String Marker +lookupLexicalItem a items = case HM.lookup a items of + Just m -> Right m + Nothing -> Left (show a) + +type LexicalItems a = HashMap a Marker + +builtins :: Lexicon +builtins = Lexicon + { lexiconMixfix = builtinMixfix + , lexiconPrefixPredicates = builtinPrefixPredicates + , lexiconStructFun = builtinStructOps + , lexiconConnectives = builtinConnectives + , lexiconRelationSymbols = builtinRelationSymbols + , lexiconAdjLs = mempty + , lexiconAdjRs = builtinAdjRs + , lexiconVerbs = builtinVerbs + , lexiconNouns = builtinNouns + , lexiconStructNouns = builtinStructNouns + , lexiconFuns = mempty + } + +-- INVARIANT: 10 precedence levels for now. 
+builtinMixfix :: Seq (HashMap FunctionSymbol (Associativity, Marker)) +builtinMixfix = Seq.fromList $ (HM.fromList <$>) + [ [] + , [binOp (Symbol "+") LeftAssoc "add", binOp (Command "union") LeftAssoc "union", binOp (Command "monus") LeftAssoc "monus"] + , [binOp (Command "relcomp") LeftAssoc "relcomp"] + , [binOp (Command "circ") LeftAssoc "circ"] + , [binOp (Command "mul") LeftAssoc "mul", binOp (Command "inter") LeftAssoc "inter"] + , [binOp (Command "setminus") LeftAssoc "setminus"] + , [binOp (Command "times") RightAssoc "times"] + , [] + , prefixOps + , builtinIdentifiers + ] + where + builtinIdentifiers :: [(FunctionSymbol, (Associativity, Marker))] + builtinIdentifiers = identifier <$> + [ "emptyset" + , "naturals" + , "rationals" + , "reals" + , "unit" + , "zero" + ] + +prefixOps :: [(FunctionSymbol, (Associativity, Marker))] +prefixOps = + [ ([Just (Command "unions"), Just InvisibleBraceL, Nothing, Just InvisibleBraceR], (NonAssoc, "unions")) + , ([Just (Command "cumul"), Just InvisibleBraceL, Nothing, Just InvisibleBraceR], (NonAssoc, "cumul")) + , ([Just (Command "fst"), Just InvisibleBraceL, Nothing, Just InvisibleBraceR], (NonAssoc, "fst")) + , ([Just (Command "snd"), Just InvisibleBraceL, Nothing, Just InvisibleBraceR], (NonAssoc, "snd")) + , ([Just (Command "pow"), Just InvisibleBraceL, Nothing, Just InvisibleBraceR], (NonAssoc, "pow")) + , (ConsSymbol, (NonAssoc, "cons")) + , (PairSymbol, (NonAssoc, "pair")) + -- NOTE Is now defined and hence no longer necessary , (ApplySymbol, (NonAssoc, "apply")) + ] + + +builtinStructOps :: LexicalItems StructSymbol +builtinStructOps = HM.fromList + [ (CarrierSymbol, "carrier") + ] + +identifier :: Text -> (Holey Token, (Associativity, Marker)) +identifier cmd = ([Just (Command cmd)], (NonAssoc, Marker cmd)) + + +builtinRelationSymbols :: LexicalItems RelationSymbol +builtinRelationSymbols = HM.fromList + [ (Symbol "=", "eq") + , (Command "neq", "neq") + , (Command "in", "elem") + , (Command "notin", 
"notelem") -- Alternative to @\not\in@. + ] + +builtinPrefixPredicates :: LexicalItems PrefixPredicate +builtinPrefixPredicates = HM.fromList + [ (PrefixPredicate "Cong" 4, "cong") + , (PrefixPredicate "Betw" 3, "betw") + ] + + +builtinConnectives :: [[(Holey Token, Associativity)]] +builtinConnectives = + [ [binOp' (Command "iff") NonAssoc] + , [binOp' (Command "implies") RightAssoc] + , [binOp' (Command "lor") LeftAssoc] + , [binOp' (Command "land") LeftAssoc] + , [([Just (Command "lnot"), Nothing], NonAssoc)] + ] + + +binOp :: Token -> Associativity -> Marker -> (Holey Token, (Associativity, Marker)) +binOp tok assoc m = ([Nothing, Just tok, Nothing], (assoc, m)) + +binOp' :: Token -> Associativity -> (Holey Token, Associativity) +binOp' tok assoc = ([Nothing, Just tok, Nothing], assoc) + +builtinAdjRs :: LexicalItems LexicalPhrase +builtinAdjRs = HM.fromList + [ (unsafeReadPhrase "equal to ?", "eq") + ] + +builtinVerbs :: LexicalItems (SgPl LexicalPhrase) +builtinVerbs = HM.fromList + [ (unsafeReadPhraseSgPl "equal[s/] ?", "eq") + ] + + +-- Some of these do/should correspond to mathlib structures, +-- e.g.: lattice, complete lattice, ring, etc. +-- +builtinNouns :: LexicalItems (SgPl LexicalPhrase) +builtinNouns = HM.mapKeys unsafeReadPhraseSgPl (HM.fromList + -- Nullary + [ ("set[/s]", "set") + , ("point[/s]", "point") + , ("element[/s] of ?", "elem") + ]) + +_Onesorted :: SgPl LexicalPhrase +_Onesorted = unsafeReadPhraseSgPl "onesorted structure[/s]" + +builtinStructNouns :: LexicalItems (SgPl LexicalPhrase) +builtinStructNouns = HM.singleton _Onesorted "onesorted_structure" + + +-- | Naïve splitting of lexical phrases to insert a variable slot for names in noun phrases, +-- as in /@there exists a linear form $h$ on $E$@/, where the underlying pattern is +-- /@linear form on ?@/. 
In this case we would get: +-- +-- > splitOnVariableSlot (sg (unsafeReadPhraseSgPl "linear form[/s] on ?")) +-- > == +-- > (unsafeReadPhrase "linear form", unsafeReadPhrase "on ?") +-- +splitOnVariableSlot :: LexicalPhrase -> (LexicalPhrase, LexicalPhrase) +splitOnVariableSlot phrase = case prepositionIndices <> nonhyphenatedSlotIndices of + [] -> (phrase, []) -- Place variable slot at the end. + is -> List.splitAt (minimum is) phrase + where + prepositionIndices, slotIndices, nonhyphenatedSlotIndices :: [Int] -- Ascending. + prepositionIndices = List.findIndices isPreposition phrase + slotIndices = List.findIndices isNothing phrase + nonhyphenatedSlotIndices = [i | i <- slotIndices, noHyphen (nth (i + 1) phrase)] + + isPreposition :: Maybe Token -> Bool + isPreposition = \case + Just (Word w) -> w `Set.member` prepositions + _ -> False + + noHyphen :: Maybe (Maybe Token) -> Bool + noHyphen = \case + Just (Just (Word w)) -> Text.head w /= '-' + -- If we arrive here, either the pattern is over (`Nothing`) or the next + -- part of the pattern is not a word that starts with a hyphen. + _ -> True + + +-- Preposition are a closed class, but this list is not yet exhaustive. +-- It can and should be extended when needed. The following list is a +-- selection of the prepositions found at +-- https://en.wikipedia.org/wiki/List_of_English_prepositions. 
+-- +prepositions :: HashSet Text +prepositions = Set.fromList + [ "about" + , "above" + , "across" + , "after" + , "against" + , "along", "alongside" + , "amid", "amidst" + , "among" + , "around" + , "as" + , "at" + , "atop" + , "before" + , "behind" + , "below" + , "beneath" + , "beside", "besides" + , "between" + , "beyond" + , "but" + , "by" + , "except" + , "for" + , "from" + , "in", "inside", "into" + , "like" + , "modulo", "mod" + , "near" + , "next" + , "of" + , "off" + , "on" + , "onto" + , "opposite" + , "out" + , "over" + , "past" + , "per" + , "sans" + , "till" + , "to" + , "under" + , "underneath" + , "unlike" + , "unto" + , "up", "upon" + , "versus" + , "via" + , "with" + , "within" + , "without" + ] diff --git a/source/Syntax/Token.hs b/source/Syntax/Token.hs new file mode 100644 index 0000000..7c8606c --- /dev/null +++ b/source/Syntax/Token.hs @@ -0,0 +1,438 @@ +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE NoImplicitPrelude #-} + +-- | +-- This module defines the lexer and its associated data types. +-- The lexer takes `Text` as input and produces a stream of tokens +-- annotated with positional information. This information is bundled +-- together with the original raw input for producing error messages. +-- +-- The lexer perfoms some normalizations to make describing the grammar easier. +-- Words outside of math environments are case-folded. Some commands are analysed +-- as variable tokens and are equivalent to their respective unicode variants +-- (α, β, γ, ..., 𝔸, 𝔹, ℂ, ...). Similarly, @\\begin{...}@ and @\\end{...}@ commands +-- are each parsed as single tokens. +-- +module Syntax.Token + ( Token(..) + , tokToString + , tokToText + , TokStream(..) + , Located(..) 
+ , runLexer + ) where + + +import Base hiding (many) + +import Control.Monad.Combinators +import Control.Monad.State.Strict +import Data.Char (isAsciiLower) +import Data.List.NonEmpty qualified as NonEmpty +import Data.Text qualified as Text +import Prettyprinter (Pretty(..)) +import Text.Megaparsec hiding (Token, Label, label) +import Text.Megaparsec.Char qualified as Char +import Text.Megaparsec.Char.Lexer qualified as Lexer + + +runLexer :: String -> Text -> Either (ParseErrorBundle Text Void) [Located Token] +runLexer file raw = runParser (evalStateT toks initLexerState) file raw + + +type Lexer = StateT LexerState (Parsec Void Text) + + +data LexerState = LexerState + { textNesting :: !Int + -- ^ Represents nesting of braces inside of the @\text{...}@ + -- command. When we encounter @\text@ the token mode switches + -- to text tokens. In order to switch back to math mode correctly + -- we need to count the braces. + , mode :: !Mode + } deriving (Show, Eq) + +initLexerState :: LexerState +initLexerState = LexerState 0 TextMode + +incrNesting, decrNesting :: LexerState -> LexerState +incrNesting (LexerState n m) = LexerState (succ n) m +decrNesting (LexerState n m) = LexerState (pred n) m + +data Mode = TextMode | MathMode deriving (Show, Eq) + +isTextMode, isMathMode :: Lexer Bool +isTextMode = do + m <- gets mode + pure (m == TextMode) +isMathMode = do + m <- gets mode + pure (m == MathMode) + +setTextMode, setMathMode :: Lexer () +setTextMode = do + st <- get + put st{mode = TextMode} +setMathMode = do + st <- get + put st{mode = MathMode} + +-- | +-- A token stream as input stream for a parser. Contains the raw input +-- before tokenization as 'Text' for showing error messages. 
+-- +data TokStream = TokStream + { rawInput :: !Text + , unTokStream :: ![Located Token] + } deriving (Show, Eq) + +instance Semigroup TokStream where + TokStream raw1 toks1 <> TokStream raw2 toks2 = TokStream (raw1 <> raw2) (toks1 <> toks2) + +instance Monoid TokStream where + mempty = TokStream mempty mempty + +-- | A LaTeX token. +-- Invisible delimiters 'InvisibleBraceL' and 'InvisibleBraceR' are +-- unescaped braces used for grouping in TEX (@{@), +-- visibles braces are escaped braces (@\\{@). +data Token + = Word !Text + | Variable !Text + | Symbol !Text + | Integer !Int + | Command !Text + | Label Text -- ^ A /@\\label{...}@/ command (case-sensitive). + | Ref (NonEmpty Text) -- ^ A /@\\ref{...}@/ command (case-sensitive). + | BeginEnv !Text + | EndEnv !Text + | ParenL | ParenR + | BracketL | BracketR + | VisibleBraceL | VisibleBraceR + | InvisibleBraceL | InvisibleBraceR + deriving (Show, Eq, Ord, Generic, Hashable) + +instance IsString Token where + fromString w = Word (Text.pack w) + +tokToText :: Token -> Text +tokToText = \case + Word w -> w + Variable v -> v + Symbol s -> s + Integer n -> Text.pack (show n) + Command cmd -> Text.cons '\\' cmd + Label m -> "\\label{" <> m <> "}" + Ref ms -> "\\ref{" <> Text.intercalate ", " (toList ms) <> "}" + BeginEnv "math" -> "$" + EndEnv "math" -> "$" + BeginEnv env -> "\\begin{" <> env <> "}" + EndEnv env -> "\\end{" <> env <> "}" + ParenL -> "(" + ParenR -> ")" + BracketL -> "[" + BracketR -> "]" + VisibleBraceL -> "\\{" + VisibleBraceR -> "\\}" + InvisibleBraceL -> "{" + InvisibleBraceR -> "}" + +tokToString :: Token -> String +tokToString = Text.unpack . 
tokToText + +instance Pretty Token where + pretty = \case + Word w -> pretty w + Variable v -> pretty v + Symbol s -> pretty s + Integer n -> pretty n + Command cmd -> "\\" <> pretty cmd + Label m -> "\\label{" <> pretty m <> "}" + Ref m -> "\\ref{" <> pretty m <> "}" + BeginEnv env -> "\\begin{" <> pretty env <> "}" + EndEnv env -> "\\end{" <> pretty env <> "}" + ParenL -> "(" + ParenR -> ")" + BracketL -> "[" + BracketR -> "]" + VisibleBraceL -> "\\{" + VisibleBraceR -> "\\}" + InvisibleBraceL -> "{" + InvisibleBraceR -> "}" + + +data Located a = Located + { startPos :: !SourcePos + , unLocated :: !a + } deriving (Show) + +instance Eq a => Eq (Located a) where (==) = (==) `on` unLocated +instance Ord a => Ord (Located a) where compare = compare `on` unLocated + + +-- | Parses tokens, switching tokenizing mode when encountering math environments. +toks :: Lexer [Located Token] +toks = whitespace *> goNormal id <* eof + where + goNormal f = do + r <- optional tok + case r of + Nothing -> pure (f []) + Just t@(Located _ (BeginEnv "math")) -> goMath (f . (t:)) + Just t@(Located _ (BeginEnv "align*")) -> goMath (f . (t:)) + Just t -> goNormal (f . (t:)) + goText f = do + r <- optional textToken + case r of + Nothing -> pure (f []) + Just t@(Located _ (BeginEnv "math")) -> goMathInText (f . (t:)) + Just t@(Located _ (EndEnv "text")) -> goMath (f . (t:)) + Just t@(Located _ (EndEnv "explanation")) -> goMath (f . (t:)) + Just t -> goText (f . (t:)) + goMath f = do + r <- optional mathToken + case r of + Nothing -> pure (f []) + Just t@(Located _ (EndEnv "math")) -> goNormal (f . (t:)) + Just t@(Located _ (EndEnv "align*")) -> goNormal (f . (t:)) + Just t@(Located _ (BeginEnv "text")) -> goText (f . (t:)) + Just t@(Located _ (BeginEnv "explanation")) -> goText (f . (t:)) + Just t -> goMath (f . (t:)) + goMathInText f = do + r <- optional mathToken + case r of + Nothing -> pure (f []) + Just t@(Located _ (EndEnv "math")) -> goText (f . 
(t:)) + Just t@(Located _ (BeginEnv "text")) -> goText (f . (t:)) + Just t -> goMathInText (f . (t:)) +{-# INLINE toks #-} + +-- | Parses a single normal mode token. +tok :: Lexer (Located Token) +tok = + word <|> var <|> symbol <|> mathBegin <|> alignBegin <|> begin <|> end <|> opening <|> closing <|> label <|> ref <|> command + +-- | Parses a single math mode token. +mathToken :: Lexer (Located Token) +mathToken = + var <|> symbol <|> number <|> begin <|> alignEnd <|> end <|> opening <|> closing <|> beginText <|> beginExplanation <|> mathEnd <|> command + +beginText :: Lexer (Located Token) +beginText = lexeme do + Char.string "\\text{" + setTextMode + pure (BeginEnv "text") + +-- | Same as text modulo spacing, so we treat it synonymously +beginExplanation :: Lexer (Located Token) +beginExplanation = lexeme do + Char.string "\\explanation{" + setTextMode + pure (BeginEnv "text") + +-- | Normal mode embedded into math mode via @\text{...}@. +textToken :: Lexer (Located Token) +textToken = word <|> symbol <|> begin <|> end <|> textEnd <|> mathBegin <|> alignBegin <|> opening' <|> closing' <|> ref <|> command + where + textEnd = lexeme do + 0 <- gets textNesting -- Otherwise fail. + Char.char '}' + setMathMode + pure (EndEnv "text") + + opening' = lexeme (brace <|> group <|> paren <|> bracket) + where + brace = VisibleBraceL <$ lexeme (Char.string "\\{") + group = InvisibleBraceL <$ lexeme (Char.char '{') <* modify' incrNesting + paren = ParenL <$ lexeme (Char.char '(') + bracket = BracketL <$ lexeme (Char.char '[') + + closing' = lexeme (brace <|> group <|> paren <|> bracket) + where + brace = VisibleBraceR <$ lexeme (Char.string "\\}") + group = InvisibleBraceR <$ lexeme (Char.char '}') <* modify' decrNesting + paren = ParenR <$ lexeme (Char.char ')') + bracket = BracketR <$ lexeme (Char.char ']') + + +-- | Parses a single begin math token. 
+mathBegin :: Lexer (Located Token) +mathBegin = guardM isTextMode *> lexeme do + Char.string "\\(" <|> Char.string "\\[" <|> Char.string "$" + setMathMode + pure (BeginEnv "math") + +alignBegin :: Lexer (Located Token) +alignBegin = guardM isTextMode *> lexeme do + Char.string "\\begin{align*}" + setMathMode + pure (BeginEnv "align*") + + +-- | Parses a single end math token. +mathEnd :: Lexer (Located Token) +mathEnd = guardM isMathMode *> lexeme do + Char.string "\\)" <|> Char.string "\\]" <|> Char.string "$" + setTextMode + pure (EndEnv "math") + +alignEnd :: Lexer (Located Token) +alignEnd = guardM isMathMode *> lexeme do + Char.string "\\end{align*}" + setTextMode + pure (EndEnv "align*") + + +-- | Parses a word. Words are returned casefolded, since we want to ignore their case later on. +word :: Lexer (Located Token) +word = guardM isTextMode *> lexeme do + w <- some (Char.letterChar <|> Char.char '\'' <|> Char.char '-') + let t = Word (Text.toCaseFold (Text.pack w)) + pure t + +number :: Lexer (Located Token) +number = lexeme $ Integer <$> Lexer.decimal + + +var :: Lexer (Located Token) +var = guardM isMathMode *> lexeme (fmap Variable var') + where + var' = do + alphabeticPart <- letter <|> bb <|> greek + variationPart <- subscriptNumber <|> ticked <|> pure "" + pure (alphabeticPart <> variationPart) + + subscriptNumber :: Lexer Text + subscriptNumber = do + Char.char '_' + n <- some Char.digitChar + pure (Text.pack n) + + -- Temporary hack to fit the TPTP format. 
+ ticked :: Lexer Text + ticked = do + ticks <- some $ Char.char '\'' + let ticks' = "prime" <$ ticks :: [Text] + pure (Text.concat ticks') + + letter :: Lexer Text + letter = fmap Text.singleton Char.letterChar + + greek :: Lexer Text + greek = try do + Char.char '\\' + l <- symbolParser greeks + notFollowedBy Char.letterChar + pure l + + greeks :: [Text] + greeks = + [ "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta" + , "iota", "kappa", "lambda", "mu", "nu", "xi", "pi", "rho", "sigma" + , "tau", "upsilon", "phi", "chi", "psi", "omega" + , "Gamma", "Delta", "Theta", "Lambda", "Xi", "Pi", "Sigma", "Upsilon" + , "Phi", "Psi", "Omega" + ] + + bb :: Lexer Text + bb = do + Char.string "\\mathbb{" + l <- symbolParser bbs + Char.char '}' + pure $ "bb" <> l + + bbs :: [Text] + bbs = Text.singleton <$> ['A'..'Z'] + + + symbolParser :: [Text] -> Lexer Text + symbolParser symbols = asum (fmap Char.string symbols) + + +symbol :: Lexer (Located Token) +symbol = lexeme do + symb <- some (satisfy (`elem` symbols)) + pure (Symbol (Text.pack symb)) + where + symbols :: [Char] + symbols = ".,:;!?@=≠+-/|^><≤≥*&≈⊂⊃⊆⊇∈“”‘’" + +-- | Parses a TEX-style command. +command :: Lexer (Located Token) +command = lexeme do + Char.char '\\' + cmd <- some Char.letterChar + pure (Command (Text.pack cmd)) + +-- | Parses the beginning of an environment. +-- Commits only after having seen "\begin{". +begin :: Lexer (Located Token) +begin = lexeme do + Char.string "\\begin{" + env <- some (Char.letterChar <|> Char.char '*') + Char.char '}' + pure (BeginEnv (Text.pack env)) + +-- | Parses a label command and extracts its marker. +label :: Lexer (Located Token) +label = lexeme do + Char.string "\\label{" + m <- marker + Char.char '}' + pure (Label m) + +-- | Parses a label command and extracts its marker. 
+ref :: Lexer (Located Token) +ref = lexeme do + -- @\\cref@ is from @cleveref@ and @\\hyperref@ is from @hyperref@ + cmd <- Char.string "\\ref{" <|> Char.string "\\cref{" <|> Char.string "\\hyperref[" + ms <- NonEmpty.fromList <$> marker `sepBy1` Char.char ',' + case cmd of + "\\hyperref[" -> Char.string "]{" *> some (satisfy (/= '}')) *> Char.char '}' *> pure (Ref ms) + _ -> Char.char '}' *> pure (Ref ms) + +marker :: Lexer Text +marker = takeWhile1P Nothing (\x -> isAsciiLower x || x == '_') + +-- | Parses the end of an environment. +-- Commits only after having seen "\end{". +end :: Lexer (Located Token) +end = lexeme do + Char.string "\\end{" + env <- some (Char.letterChar <|> Char.char '*') + Char.char '}' + pure (EndEnv (Text.pack env)) + +-- | Parses an opening delimiter. +opening :: Lexer (Located Token) +opening = lexeme (paren <|> brace <|> group <|> bracket) + where + brace = VisibleBraceL <$ lexeme (Char.string "\\{") + group = InvisibleBraceL <$ lexeme (Char.char '{') + paren = ParenL <$ lexeme (Char.char '(') + bracket = BracketL <$ lexeme (Char.char '[') + +-- | Parses a closing delimiter. +closing :: Lexer (Located Token) +closing = lexeme (paren <|> brace <|> group <|> bracket) + where + brace = VisibleBraceR <$ lexeme (Char.string "\\}") + group = InvisibleBraceR <$ lexeme (Char.char '}') + paren = ParenR <$ lexeme (Char.char ')') + bracket = BracketR <$ lexeme (Char.char ']') + +-- | Turns a Lexer into one that tracks the source position of the token +-- and consumes trailing whitespace. +lexeme :: Lexer a -> Lexer (Located a) +lexeme p = do + start <- getSourcePos + t <- p + whitespace + pure (Located start t) + +space :: Lexer () +space = void (Char.char ' ' <|> Char.char '\n') + <|> void (Char.string "\\ " <|> Char.string "\\\\" <|> Char.string "\\!" 
<|> Char.string "\\," <|> Char.string "\\:" <|> Char.string "\\;" <|> Char.string "\\;") + +whitespace :: Lexer () +whitespace = Lexer.space (void (some space)) (Lexer.skipLineComment "%") empty -- cgit v1.2.3