summaryrefslogtreecommitdiff
path: root/source/Syntax/LexicalPhrase.hs
blob: 174325536422eaf5a8533d7733330cc1ed8b6664 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE NoImplicitPrelude #-}

module Syntax.LexicalPhrase where


import Base
import Syntax.Token (Token(..))

import Data.Char (isAlpha)
import Data.Text qualified as Text
import Text.Earley.Mixfix (Holey)
import Text.Earley (Grammar, Prod, (<?>), fullParses, parser, rule, token, satisfy)



-- | 'LexicalPhrase's should be nonempty lists with at least one proper word token.
-- Hyphens and quotes in words are treated as letters.
-- Thus /@manifold-with-boundary@/ is a singleton lexical phrase (one word).
--
type LexicalPhrase = Holey Token

-- MAYBE Add this instance by making LexicalPhrase a proper Type?
-- Until then we can use the default instance for lists of prettyprintable things.
--
-- instance Pretty LexicalPhrase where
--    pretty components = hsep (prettyComponent <$> components)
--       where
--          prettyComponent = \case
--             Nothing -> "_"
--             Just tok -> pretty tok



-- | Split data by grammatical number (singular/plural).
-- The 'Eq' and 'Ord' instances only consider the singular
-- form so that we can prefer known irregular plurals over
-- guessed irregular plurals when inserting items into
-- the 'Lexicon'.
data SgPl a
    = SgPl {sg :: a, pl :: a}
    deriving (Show, Functor, Generic, Hashable)

instance Eq a => Eq (SgPl a) where (==) = (==) `on` sg
instance Ord a => Ord (SgPl a) where compare = compare `on` sg


unsafeReadPhrase :: String -> LexicalPhrase
unsafeReadPhrase spec = case fst (fullParses (parser lexicalPhraseSpec) spec) of
    pat : _ -> pat
    _ -> error "unsafeReadPhrase failed"

unsafeReadPhraseSgPl :: String -> SgPl LexicalPhrase
unsafeReadPhraseSgPl spec = case fst (fullParses (parser lexicalPhraseSpecSgPl) spec) of
    pat : _ -> pat
    _ -> error "unsafeReadPhraseSgPl failed"


lexicalPhraseSpec :: Grammar r (Prod r String Char LexicalPhrase)
lexicalPhraseSpec = do
    hole     <- rule $ Nothing <$ token '?' <?> "hole"
    word     <- rule $ Just <$> many (satisfy (\c -> isAlpha c || c == '-'))
    space    <- rule $ Just . (:[]) <$> token ' '
    segment  <- rule $ hole <|> word
    rule $ (\s ss -> makePhrase (s:ss)) <$> segment <*> many (space *> segment)
    where
        makePhrase :: [Maybe String] -> LexicalPhrase
        makePhrase pat = fmap makeWord pat


lexicalPhraseSpecSgPl :: Grammar r (Prod r String Char (SgPl LexicalPhrase))
lexicalPhraseSpecSgPl = do
    space <- rule $ Just . (:[]) <$> token ' '
    hole  <- rule $ (Nothing, Nothing) <$ token '?'<?> "hole"

    word <- rule (many (satisfy isAlpha) <?> "word")
    wordSgPl <- rule $ (,) <$> (token '[' *> word) <* token '/' <*> word <* token ']'
    complexWord <- rule $ (\(a,b) -> (Just a, Just b)) . fuse <$>
        many ((<>) <$> (dup <$> word) <*> wordSgPl) <?> "word"
    segment  <- rule (hole <|> (dup . Just <$> word) <|> complexWord )
    rule $ (\s ss -> makePhrase (s:ss)) <$> segment <*> many (space *> segment)
    where
        dup x = (x,x)
        fuse = \case
            (a, b) : (c, d) : rest -> fuse ((a <> c, b <> d) : rest)
            [(a, b)] -> (a, b)
            _ -> error "Syntax.Abstract.fuse"

        makePhrase :: [(Maybe String, Maybe String)] -> SgPl LexicalPhrase
        makePhrase = (\(patSg, patPl) -> SgPl (fmap makeWord patSg) (fmap makeWord patPl)) . unzip

makeWord :: Maybe String -> Maybe Token
makeWord = fmap (Word . Text.pack)