src/runtime/haskell/PGF/Macros.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370

{-# LANGUAGE MagicHash, BangPatterns, FlexibleContexts #-}
module PGF.Macros where
import Prelude hiding ((<>)) -- GHC 8.4.1 clash with Text.PrettyPrint

import PGF.CId
import PGF.Data
import Control.Monad
import qualified Data.Map    as Map
--import qualified Data.Set    as Set
--import qualified Data.IntMap as IntMap
--import qualified Data.IntSet as IntSet
import qualified Data.Array  as Array
--import Data.Maybe
import Data.List
import Data.Array.IArray
import Text.PrettyPrint
import GHC.Prim
import GHC.Base(getTag)
import Data.Char

-- operations for manipulating PGF grammars and objects

mapConcretes :: (Concr -> Concr) -> PGF -> PGF
mapConcretes f pgf = pgf { concretes = Map.map f (concretes pgf) }

lookType :: Abstr -> CId -> Type
lookType abs f = 
  case lookMap (error $ "lookType " ++ show f) f (funs abs) of
    (ty,_,_,_) -> ty

isData :: Abstr -> CId -> Bool
isData abs f =
  case Map.lookup f (funs abs) of
    Just (_,_,Nothing,_) -> True       -- the encoding of data constrs
    _                    -> False

lookValCat :: Abstr -> CId -> CId
lookValCat abs = valCat . lookType abs

lookStartCat :: PGF -> CId
lookStartCat pgf = mkCId $
  case msum $ Data.List.map (Map.lookup (mkCId "startcat")) [gflags pgf, aflags (abstract pgf)] of
    Just (LStr s) -> s
    _             -> "S"

lookGlobalFlag :: PGF -> CId -> Maybe Literal
lookGlobalFlag pgf f = Map.lookup f (gflags pgf)

lookAbsFlag :: PGF -> CId -> Maybe Literal
lookAbsFlag pgf f = Map.lookup f (aflags (abstract pgf))

lookConcr :: PGF -> Language -> Concr
lookConcr pgf cnc = 
    lookMap (error $ "Missing concrete syntax: " ++ showCId cnc) cnc $ concretes pgf

-- use if name fails, use abstract + name; so e.g. "Eng" becomes "DemoEng" 
lookConcrComplete :: PGF -> CId -> Concr
lookConcrComplete pgf cnc = 
  case Map.lookup cnc (concretes pgf) of
    Just c -> c
    _ -> lookConcr pgf (mkCId (showCId (absname pgf) ++ showCId cnc))

lookConcrFlag :: PGF -> CId -> CId -> Maybe Literal
lookConcrFlag pgf lang f = Map.lookup f $ cflags $ lookConcr pgf lang

functionsToCat :: PGF -> CId -> [(CId,Type)]
functionsToCat pgf cat =
  [(f,ty) | (_,f) <- fs, Just (ty,_,_,_) <- [Map.lookup f $ funs $ abstract pgf]]
 where 
   (_,fs,_) = lookMap ([],[],0) cat $ cats $ abstract pgf

-- | List of functions that lack linearizations in the given language.
missingLins :: PGF -> Language -> [CId]
missingLins pgf lang = [c | c <- fs, not (hasl c)] where
  fs = Map.keys $ funs $ abstract pgf
  hasl = hasLin pgf lang

hasLin :: PGF -> Language -> CId -> Bool
hasLin pgf lang f = Map.member f $ lproductions $ lookConcr pgf lang

restrictPGF :: (CId -> Bool) -> PGF -> PGF
restrictPGF cond pgf = pgf {
  abstract = abstr {
    funs = Map.filterWithKey (\c _ -> cond c) (funs abstr),
    cats = Map.map (\(hyps,fs,p) -> (hyps,filter (cond . snd) fs,p)) (cats abstr)
    }
  }  ---- restrict concrs also, might be needed
 where
  abstr = abstract pgf

depth :: Expr -> Int
depth (EAbs _ _ t) = depth t
depth (EApp e1 e2) = max (depth e1) (depth e2) + 1
depth _            = 1

cftype :: [CId] -> CId -> Type
cftype args val = DTyp [(Explicit,wildCId,cftype [] arg) | arg <- args] val []

typeOfHypo :: Hypo -> Type
typeOfHypo (_,_,ty) = ty

catSkeleton :: Type -> ([CId],CId)
catSkeleton ty = case ty of
  DTyp hyps val _ -> ([valCat (typeOfHypo h) | h <- hyps],val)

typeSkeleton :: Type -> ([(Int,CId)],CId)
typeSkeleton ty = case ty of
  DTyp hyps val _ -> ([(contextLength ty, valCat ty) | h <- hyps, let ty = typeOfHypo h],val)

valCat :: Type -> CId
valCat ty = case ty of
  DTyp _ val _ -> val

contextLength :: Type -> Int
contextLength ty = case ty of
  DTyp hyps _ _ -> length hyps

-- | Show the printname of function or category
showPrintName :: PGF -> Language -> CId -> String
showPrintName pgf lang id = lookMap (showCId id) id $ printnames $ lookMap (error "no lang") lang $ concretes pgf

-- lookup with default value
lookMap :: Ord i => a -> i -> Map.Map i a -> a 
lookMap d c m = Map.findWithDefault d c m

--- from Operations
combinations :: [[a]] -> [[a]]
combinations t = case t of 
  []    -> [[]]
  aa:uu -> [a:u | a <- aa, u <- combinations uu]

cidString = mkCId "String"
cidInt    = mkCId "Int"
cidFloat  = mkCId "Float"
cidVar    = mkCId "__gfVar"


-- Utilities for doing linearization

-- | BracketedString represents a sentence that is linearized
-- as usual but we also want to retain the ''brackets'' that
-- mark the beginning and the end of each constituent.
data BracketedString
  = Leaf Token                                                                -- ^ this is the leaf i.e. a single token
  | Bracket CId {-# UNPACK #-} !FId {-# UNPACK #-} !FId {-# UNPACK #-} !LIndex CId [Expr] [BracketedString]
                                                                               -- ^ this is a bracket. The 'CId' is the category of
                                                                               -- the phrase. The 'FId' is an unique identifier for
                                                                               -- every phrase in the sentence. For context-free grammars
                                                                               -- i.e. without discontinuous constituents this identifier
                                                                               -- is also unique for every bracket. When there are discontinuous 
                                                                               -- phrases then the identifiers are unique for every phrase but
                                                                               -- not for every bracket since the bracket represents a constituent.
                                                                               -- The different constituents could still be distinguished by using
                                                                               -- the constituent index i.e. 'LIndex'. If the grammar is reduplicating
                                                                               -- then the constituent indices will be the same for all brackets
                                                                               -- that represents the same constituent.

data BracketedTokn
  = Bracket_ CId {-# UNPACK #-} !FId {-# UNPACK #-} !FId {-# UNPACK #-} !LIndex CId [Expr] [BracketedTokn]    -- Invariant: the list is not empty
  | LeafKS Token
  | LeafNE
  | LeafBIND
  | LeafSOFT_BIND
  | LeafCAPIT
  | LeafKP [BracketedTokn] [([BracketedTokn],[String])]
  deriving Eq

type LinTable = ([CId],Array.Array LIndex [BracketedTokn])

-- | Renders the bracketed string as string where 
-- the brackets are shown as @(S ...)@ where
-- @S@ is the category.
showBracketedString :: BracketedString -> String
showBracketedString = render . ppBracketedString

ppBracketedString (Leaf t) = text t
ppBracketedString (Bracket cat fid fid' index _ _ bss) = parens (ppCId cat <> colon <> int fid <+> hsep (map ppBracketedString bss))

-- | The length of the bracketed string in number of tokens.
lengthBracketedString :: BracketedString -> Int
lengthBracketedString (Leaf _)                  = 1
lengthBracketedString (Bracket _ _ _ _ _ _ bss) = sum (map lengthBracketedString bss)

untokn :: Maybe String -> [BracketedTokn] -> (Maybe String,[BracketedString])
untokn nw bss =
  let (nw',bss') = mapAccumR untokn nw bss
  in case sequence bss' of
       Just bss -> (nw,concat bss)
       Nothing  -> (nw,[])
  where
    untokn nw (Bracket_ cat fid fid' index fun es bss) =
      let (nw',bss') = mapAccumR untokn nw bss
      in case sequence bss' of
           Just bss -> (nw',Just [Bracket cat fid fid' index fun es (concat bss)])
           Nothing  -> (Nothing, Nothing)
    untokn nw (LeafKS t)
      | null t              = (nw,Just [])
      | otherwise           = (Just t,Just [Leaf t])
    untokn nw LeafNE        = (Nothing, Nothing)
    untokn nw (LeafKP d vs) = let (nw',bss') = mapAccumR untokn nw (sel d vs nw)
                              in case sequence bss' of
                                   Just bss -> (nw',Just (concat bss))
                                   Nothing  -> (Nothing, Nothing)
                              where
                                sel d vs Nothing  = d
                                sel d vs (Just w) =
                                  case [v | (v,cs) <- vs, any (\c -> isPrefixOf c w) cs] of
                                    v:_ -> v
                                    _   -> d

type CncType = (CId, FId)    -- concrete type is the abstract type (the category) + the forest id

mkLinTable :: Concr -> (CncType -> Bool) -> [CId] -> FunId -> [(CncType,FId,CId,[Expr],LinTable)] -> LinTable
mkLinTable cnc filter xs funid args = (xs,listArray (bounds lins) [computeSeq filter (elems (sequences cnc ! seqid)) args | seqid <- elems lins])
  where
    (CncFun _ lins) = cncfuns cnc ! funid

computeSeq :: (CncType -> Bool) -> [Symbol] -> [(CncType,FId,CId,[Expr],LinTable)] -> [BracketedTokn]
computeSeq filter seq args = concatMap compute seq
  where
    compute (SymCat d r)      = getArg d r
    compute (SymLit d r)      = getArg d r
    compute (SymVar d r)      = getVar d r
    compute (SymKS t)         = [LeafKS t]
    compute SymNE             = [LeafNE]
    compute SymBIND           = [LeafKS "&+"]
    compute SymSOFT_BIND      = []
    compute SymSOFT_SPACE     = []
    compute SymCAPIT          = [LeafKS "&|"]
    compute SymALL_CAPIT      = [LeafKS "&|"]
    compute (SymKP syms alts) = [LeafKP (concatMap compute syms) [(concatMap compute syms,cs) | (syms,cs) <- alts]]

    getArg d r
      | not (null arg_lin) &&
        filter ct   = [Bracket_ cat fid fid' r fun es arg_lin]
      | otherwise   = arg_lin
      where
        arg_lin                              = lin ! r
        (ct@(cat,fid),fid',fun,es,(_xs,lin)) = args !! d

    getVar d r = [LeafKS (showCId (xs !! r))]
      where
        (_ct,_,_fun,_es,(xs,_lin)) = args !! d

flattenBracketedString :: BracketedString -> [String]
flattenBracketedString (Leaf w)                  = [w]
flattenBracketedString (Bracket _ _ _ _ _ _ bss) = concatMap flattenBracketedString bss


-- The following is a version of Data.List.sortBy which together
-- with the sorting also eliminates duplicate values 
sortNubBy cmp = mergeAll . sequences
  where
    sequences (a:b:xs) =
      case cmp a b of
        GT -> descending b [a]  xs
        EQ -> sequences (b:xs)
        LT -> ascending  b (a:) xs
    sequences xs = [xs]

    descending a as []     = [a:as]
    descending a as (b:bs) =
      case cmp a b of
        GT -> descending b (a:as) bs
        EQ -> descending a as bs
        LT -> (a:as) : sequences (b:bs)

    ascending a as []     = let !x = as [a]
                            in [x]
    ascending a as (b:bs) =
      case cmp a b of
        GT -> let !x = as [a]
              in x : sequences (b:bs)
        EQ -> ascending a as bs
        LT -> ascending b (\ys -> as (a:ys)) bs

    mergeAll [x] = x
    mergeAll xs  = mergeAll (mergePairs xs)

    mergePairs (a:b:xs) = let !x = merge a b
                          in x : mergePairs xs
    mergePairs xs       = xs

    merge as@(a:as') bs@(b:bs') =
      case cmp a b of
        GT -> b:merge as  bs'
        EQ -> a:merge as' bs'
        LT -> a:merge as' bs
    merge [] bs         = bs
    merge as []         = as


-- The following function does case-insensitive comparison of sequences.
-- This is used to allow case-insensitive parsing, while
-- the linearizer still has access to the original cases.
compareCaseInsensitve s1 s2 =
    case compareSeq (elems s1) (elems s2) of
      (EQ,c) -> c
      (c, _) -> c
    where
      compareSeq []     []     = dup EQ
      compareSeq []     _      = dup LT
      compareSeq _      []     = dup GT
      compareSeq (x:xs) (y:ys) =
        case compareSym x y of
          (EQ,EQ) -> compareSeq xs ys
          (EQ,c2) -> case compareSeq xs ys of
                       (c1,_) -> (c1,c2)
          x       -> x

      compareSym s1 s2 =
        case s1 of
          SymCat d1 r1
            -> case s2 of
                 SymCat d2 r2
                   -> case compare d1 d2 of
                        EQ -> dup (r1 `compare` r2)
                        x  -> dup x
                 _ -> dup LT
          SymLit d1 r1
            -> case s2 of
                 SymCat {} -> dup GT
                 SymLit d2 r2
                   -> case compare d1 d2 of
                        EQ -> dup (r1 `compare` r2)
                        x  -> dup x
                 _ -> dup LT
          SymVar d1 r1
            -> if tagToEnum# (getTag s2 ># 2#)
                 then dup LT
                 else case s2 of
                        SymVar d2 r2
                          -> case compare d1 d2 of
                               EQ -> dup (r1 `compare` r2)
                               x  -> dup x
                        _ -> dup GT
          SymKS t1
            -> if tagToEnum# (getTag s2 ># 3#)
                 then dup LT
                 else case s2 of
                        SymKS t2 -> t1 `compareToken` t2
                        _          -> dup GT
          SymKP a1 b1
            -> if tagToEnum# (getTag s2 ># 4#)
                 then dup LT
                 else case s2 of
                        SymKP a2 b2
                          -> case compare a1 a2 of
                               EQ -> dup (b1 `compare` b2)
                               x  -> dup x
                        _ -> dup GT
          _ -> let t1 = getTag s1
                   t2 = getTag s2
               in if tagToEnum# (t1 <# t2) 
                    then dup LT
                    else if tagToEnum# (t1 ==# t2)
                           then dup EQ
                           else dup GT
  
      compareToken []     []     = dup EQ
      compareToken []     _      = dup LT
      compareToken _      []     = dup GT
      compareToken (x:xs) (y:ys)
        | x == y    = compareToken xs ys
        | otherwise = case compare (toLower x) (toLower y) of
                        EQ -> case compareToken xs ys of
                                (c,_) -> (c,compare x y)
                        c  -> dup c

      dup x = (x,x)