src/PGF/Data.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201

module PGF.Data where

import PGF.CId
import GF.Text.UTF8
import GF.Data.Assoc

import qualified Data.Map as Map
import Data.List
import Data.Array

-- internal datatypes for PGF

-- | An abstract data type representing multilingual grammar
-- in Portable Grammar Format.
data PGF = PGF {
  absname   :: CId ,
  cncnames  :: [CId] ,
  gflags    :: Map.Map CId String,   -- value of a global flag
  abstract  :: Abstr ,
  concretes :: Map.Map CId Concr
  }

data Abstr = Abstr {
  aflags  :: Map.Map CId String,      -- value of a flag
  funs    :: Map.Map CId (Type,Expr), -- type and def of a fun
  cats    :: Map.Map CId [Hypo],      -- context of a cat
  catfuns :: Map.Map CId [CId]        -- funs to a cat (redundant, for fast lookup)
  }

data Concr = Concr {
  cflags  :: Map.Map CId String,    -- value of a flag
  lins    :: Map.Map CId Term,      -- lin of a fun
  opers   :: Map.Map CId Term,      -- oper generated by subex elim
  lincats :: Map.Map CId Term,      -- lin type of a cat
  lindefs :: Map.Map CId Term,      -- lin default of a cat
  printnames :: Map.Map CId Term,   -- printname of a cat or a fun
  paramlincats :: Map.Map CId Term, -- lin type of cat, with printable param names
  parser  :: Maybe ParserInfo       -- parser
  }

data Type =
   DTyp [Hypo] CId [Expr]
  deriving (Eq,Ord,Show)

data Literal = 
   LStr String                      -- ^ string constant
 | LInt Integer                     -- ^ integer constant
 | LFlt Double                      -- ^ floating point constant
 deriving (Eq,Ord,Show)

-- | The tree is an evaluated expression in the abstract syntax
-- of the grammar. The type is especially restricted to not
-- allow unapplied lambda abstractions. The tree is used directly 
-- from the linearizer and is produced directly from the parser.
data Tree = 
   Abs [CId] Tree                   -- ^ lambda abstraction. The list of variables is non-empty
 | Var CId                          -- ^ variable
 | Fun CId [Tree]                   -- ^ function application
 | Lit Literal                      -- ^ literal
 | Meta Int                         -- ^ meta variable
  deriving (Show, Eq, Ord)

-- | An expression represents a potentially unevaluated expression
-- in the abstract syntax of the grammar. It can be evaluated with
-- the 'expr2tree' function and then linearized or it can be used
-- directly in the dependent types.
data Expr =
   EAbs CId Expr                    -- ^ lambda abstraction
 | EApp Expr Expr                   -- ^ application
 | ELit Literal                     -- ^ literal
 | EMeta  Int                       -- ^ meta variable
 | EVar   CId                       -- ^ variable or function reference
 | EEq [Equation]                   -- ^ lambda function defined as a set of equations with pattern matching
  deriving (Eq,Ord,Show)

data Term =
   R [Term]
 | P Term Term
 | S [Term]
 | K Tokn
 | V Int
 | C Int
 | F CId
 | FV [Term]
 | W String Term
 | TM String
  deriving (Eq,Ord,Show)

data Tokn =
   KS String
 | KP [String] [Alternative]
  deriving (Eq,Ord,Show)

data Alternative =
   Alt [String] [String]
  deriving (Eq,Ord,Show)

data Hypo =
   Hyp CId Type
  deriving (Eq,Ord,Show)

-- | The equation is used to define lambda function as a sequence
-- of equations with pattern matching. The list of 'Expr' represents
-- the patterns and the second 'Expr' is the function body for this
-- equation.
data Equation =
   Equ [Expr] Expr
  deriving (Eq,Ord,Show)


type FToken    = String
type FCat      = Int
type FIndex    = Int
data FSymbol
  = FSymCat {-# UNPACK #-} !FIndex {-# UNPACK #-} !Int
  | FSymTok FToken
type Profile   = [Int]
type FPointPos = Int
type FGrammar  = ([FRule], Map.Map CId [FCat])
data FRule     = FRule CId [Profile] [FCat] FCat (Array FIndex (Array FPointPos FSymbol))

type RuleId = Int

data ParserInfo
    = ParserInfo { allRules           :: Array RuleId FRule
                 , topdownRules       :: Assoc FCat [RuleId]
	  	   -- ^ used in 'GF.Parsing.MCFG.Active' (Earley):
	           -- , emptyRules    :: [RuleId]
	         , epsilonRules       :: [RuleId]
		   -- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury):
	         , leftcornerCats     :: Assoc FCat   [RuleId]
	         , leftcornerTokens   :: Assoc FToken [RuleId]
		   -- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury):
	         , grammarCats        :: [FCat]
	         , grammarToks        :: [FToken]
	         , startupCats        :: Map.Map CId [FCat]
	         }


fcatString, fcatInt, fcatFloat, fcatVar :: Int
fcatString = (-1)
fcatInt    = (-2)
fcatFloat  = (-3)
fcatVar    = (-4)


-- print statistics

statGFCC :: PGF -> String
statGFCC pgf = unlines [
  "Abstract\t" ++ prCId (absname pgf), 
  "Concretes\t" ++ unwords (map prCId (cncnames pgf)), 
  "Categories\t" ++ unwords (map prCId (Map.keys (cats (abstract pgf)))) 
  ]

-- merge two GFCCs; fails is differens absnames; priority to second arg

unionPGF :: PGF -> PGF -> PGF
unionPGF one two = case absname one of
  n | n == wildCId     -> two    -- extending empty grammar
    | n == absname two -> one {  -- extending grammar with same abstract
      concretes = Map.union (concretes two) (concretes one),
      cncnames  = union (cncnames two) (cncnames one)
    }
  _ -> one   -- abstracts don't match ---- print error msg

emptyPGF :: PGF
emptyPGF = PGF {
  absname   = wildCId,
  cncnames  = [] ,
  gflags    = Map.empty,
  abstract  = error "empty grammar, no abstract",
  concretes = Map.empty
  }

-- encode idenfifiers and strings in UTF8

utf8GFCC :: PGF -> PGF
utf8GFCC pgf = pgf {
  concretes = Map.map u8concr (concretes pgf)
  }
 where 
   u8concr cnc = cnc {
     lins = Map.map u8term (lins cnc),
     opers = Map.map u8term (opers cnc)
     }
   u8term = convertStringsInTerm encodeUTF8

---- TODO: convert identifiers and flags

convertStringsInTerm conv t = case t of
  K (KS s) -> K (KS (conv s))
  W s r    -> W (conv s) (convs r)
  R ts     -> R $ map convs ts
  S ts     -> S $ map convs ts
  FV ts    -> FV $ map convs ts
  P u v    -> P (convs u) (convs v)
  _        -> t
 where
  convs = convertStringsInTerm conv