From 442d732696ad431b84f6e5c72b6ee785be4fd968 Mon Sep 17 00:00:00 2001 From: adelon <22380201+adelon@users.noreply.github.com> Date: Sat, 10 Feb 2024 02:22:14 +0100 Subject: Initial commit --- source/Syntax/LexicalPhrase.hs | 93 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 source/Syntax/LexicalPhrase.hs (limited to 'source/Syntax/LexicalPhrase.hs') diff --git a/source/Syntax/LexicalPhrase.hs b/source/Syntax/LexicalPhrase.hs new file mode 100644 index 0000000..1743255 --- /dev/null +++ b/source/Syntax/LexicalPhrase.hs @@ -0,0 +1,93 @@ +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE NoImplicitPrelude #-} + +module Syntax.LexicalPhrase where + + +import Base +import Syntax.Token (Token(..)) + +import Data.Char (isAlpha) +import Data.Text qualified as Text +import Text.Earley.Mixfix (Holey) +import Text.Earley (Grammar, Prod, (), fullParses, parser, rule, token, satisfy) + + + +-- | 'LexicalPhrase's should be nonempty lists with at least one proper word token. +-- Hyphens and quotes in words are treated as letters. +-- Thus /@manifold-with-boundary@/ is a singleton lexical phrase (one word). +-- +type LexicalPhrase = Holey Token + +-- MAYBE Add this instance by making LexicalPhrase a proper Type? +-- Until then we can use the default instance for lists of prettyprintable things. +-- +-- instance Pretty LexicalPhrase where +-- pretty components = hsep (prettyComponent <$> components) +-- where +-- prettyComponent = \case +-- Nothing -> "_" +-- Just tok -> pretty tok + + + +-- | Split data by grammatical number (singular/plural). +-- The 'Eq' and 'Ord' instances only consider the singular +-- form so that we can prefer known irregular plurals over +-- guessed irregular plurals when inserting items into +-- the 'Lexicon'. +data SgPl a + = SgPl {sg :: a, pl :: a} + deriving (Show, Functor, Generic, Hashable) + +instance Eq a => Eq (SgPl a) where (==) = (==) `on` sg +instance Ord a => Ord (SgPl a) where compare = compare `on` sg + + +unsafeReadPhrase :: String -> LexicalPhrase +unsafeReadPhrase spec = case fst (fullParses (parser lexicalPhraseSpec) spec) of + pat : _ -> pat + _ -> error "unsafeReadPhrase failed" + +unsafeReadPhraseSgPl :: String -> SgPl LexicalPhrase +unsafeReadPhraseSgPl spec = case fst (fullParses (parser lexicalPhraseSpecSgPl) spec) of + pat : _ -> pat + _ -> error "unsafeReadPhraseSgPl failed" + + +lexicalPhraseSpec :: Grammar r (Prod r String Char LexicalPhrase) +lexicalPhraseSpec = do + hole <- rule $ Nothing <$ token '?' "hole" + word <- rule $ Just <$> many (satisfy (\c -> isAlpha c || c == '-')) + space <- rule $ Just . (:[]) <$> token ' ' + segment <- rule $ hole <|> word + rule $ (\s ss -> makePhrase (s:ss)) <$> segment <*> many (space *> segment) + where + makePhrase :: [Maybe String] -> LexicalPhrase + makePhrase pat = fmap makeWord pat + + +lexicalPhraseSpecSgPl :: Grammar r (Prod r String Char (SgPl LexicalPhrase)) +lexicalPhraseSpecSgPl = do + space <- rule $ Just . (:[]) <$> token ' ' + hole <- rule $ (Nothing, Nothing) <$ token '?' "hole" + + word <- rule (many (satisfy isAlpha) "word") + wordSgPl <- rule $ (,) <$> (token '[' *> word) <* token '/' <*> word <* token ']' + complexWord <- rule $ (\(a,b) -> (Just a, Just b)) . fuse <$> + many ((<>) <$> (dup <$> word) <*> wordSgPl) "word" + segment <- rule (hole <|> (dup . Just <$> word) <|> complexWord ) + rule $ (\s ss -> makePhrase (s:ss)) <$> segment <*> many (space *> segment) + where + dup x = (x,x) + fuse = \case + (a, b) : (c, d) : rest -> fuse ((a <> c, b <> d) : rest) + [(a, b)] -> (a, b) + _ -> error "Syntax.Abstract.fuse" + + makePhrase :: [(Maybe String, Maybe String)] -> SgPl LexicalPhrase + makePhrase = (\(patSg, patPl) -> SgPl (fmap makeWord patSg) (fmap makeWord patPl)) . unzip + +makeWord :: Maybe String -> Maybe Token +makeWord = fmap (Word . Text.pack) -- cgit v1.2.3