summaryrefslogtreecommitdiff
path: root/src/PGF/Data.hs
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2008-06-25 16:54:35 +0000
committeraarne <aarne@cs.chalmers.se>2008-06-25 16:54:35 +0000
commite9e80fc389365e24d4300d7d5390c7d833a96c50 (patch)
treef0b58473adaa670bd8fc52ada419d8cad470ee03 /src/PGF/Data.hs
parentb96b36f43de3e2f8b58d5f539daa6f6d47f25870 (diff)
changed names of resource-1.3; added a note on homepage on release
Diffstat (limited to 'src/PGF/Data.hs')
-rw-r--r--src/PGF/Data.hs201
1 files changed, 201 insertions, 0 deletions
diff --git a/src/PGF/Data.hs b/src/PGF/Data.hs
new file mode 100644
index 000000000..3f9aaa6ab
--- /dev/null
+++ b/src/PGF/Data.hs
@@ -0,0 +1,201 @@
+module PGF.Data where
+
+import PGF.CId
+import GF.Text.UTF8
+import GF.Data.Assoc
+
+import qualified Data.Map as Map
+import Data.List
+import Data.Array
+
+-- internal datatypes for PGF
+
+-- | An abstract data type representing multilingual grammar
+-- in Portable Grammar Format.
+data PGF = PGF {
+ absname :: CId ,
+ cncnames :: [CId] ,
+ gflags :: Map.Map CId String, -- value of a global flag
+ abstract :: Abstr ,
+ concretes :: Map.Map CId Concr
+ }
+
+data Abstr = Abstr {
+ aflags :: Map.Map CId String, -- value of a flag
+ funs :: Map.Map CId (Type,Expr), -- type and def of a fun
+ cats :: Map.Map CId [Hypo], -- context of a cat
+ catfuns :: Map.Map CId [CId] -- funs to a cat (redundant, for fast lookup)
+ }
+
+data Concr = Concr {
+ cflags :: Map.Map CId String, -- value of a flag
+ lins :: Map.Map CId Term, -- lin of a fun
+ opers :: Map.Map CId Term, -- oper generated by subex elim
+ lincats :: Map.Map CId Term, -- lin type of a cat
+ lindefs :: Map.Map CId Term, -- lin default of a cat
+ printnames :: Map.Map CId Term, -- printname of a cat or a fun
+ paramlincats :: Map.Map CId Term, -- lin type of cat, with printable param names
+ parser :: Maybe ParserInfo -- parser
+ }
+
+data Type =
+ DTyp [Hypo] CId [Expr]
+ deriving (Eq,Ord,Show)
+
+data Literal =
+ LStr String -- ^ string constant
+ | LInt Integer -- ^ integer constant
+ | LFlt Double -- ^ floating point constant
+ deriving (Eq,Ord,Show)
+
+-- | The tree is an evaluated expression in the abstract syntax
+-- of the grammar. The type is especially restricted to not
+-- allow unapplied lambda abstractions. The tree is used directly
+-- from the linearizer and is produced directly from the parser.
+data Tree =
+ Abs [CId] Tree -- ^ lambda abstraction. The list of variables is non-empty
+ | Var CId -- ^ variable
+ | Fun CId [Tree] -- ^ function application
+ | Lit Literal -- ^ literal
+ | Meta Int -- ^ meta variable
+ deriving (Show, Eq, Ord)
+
+-- | An expression represents a potentially unevaluated expression
+-- in the abstract syntax of the grammar. It can be evaluated with
+-- the 'expr2tree' function and then linearized or it can be used
+-- directly in the dependent types.
+data Expr =
+ EAbs CId Expr -- ^ lambda abstraction
+ | EApp Expr Expr -- ^ application
+ | ELit Literal -- ^ literal
+ | EMeta Int -- ^ meta variable
+ | EVar CId -- ^ variable or function reference
+ | EEq [Equation] -- ^ lambda function defined as a set of equations with pattern matching
+ deriving (Eq,Ord,Show)
+
+data Term =
+ R [Term]
+ | P Term Term
+ | S [Term]
+ | K Tokn
+ | V Int
+ | C Int
+ | F CId
+ | FV [Term]
+ | W String Term
+ | TM String
+ deriving (Eq,Ord,Show)
+
+data Tokn =
+ KS String
+ | KP [String] [Alternative]
+ deriving (Eq,Ord,Show)
+
+data Alternative =
+ Alt [String] [String]
+ deriving (Eq,Ord,Show)
+
+data Hypo =
+ Hyp CId Type
+ deriving (Eq,Ord,Show)
+
+-- | The equation is used to define lambda function as a sequence
+-- of equations with pattern matching. The list of 'Expr' represents
+-- the patterns and the second 'Expr' is the function body for this
+-- equation.
+data Equation =
+ Equ [Expr] Expr
+ deriving (Eq,Ord,Show)
+
+
+type FToken = String
+type FCat = Int
+type FIndex = Int
+data FSymbol
+ = FSymCat {-# UNPACK #-} !FIndex {-# UNPACK #-} !Int
+ | FSymTok FToken
+type Profile = [Int]
+type FPointPos = Int
+type FGrammar = ([FRule], Map.Map CId [FCat])
+data FRule = FRule CId [Profile] [FCat] FCat (Array FIndex (Array FPointPos FSymbol))
+
+type RuleId = Int
+
+data ParserInfo
+ = ParserInfo { allRules :: Array RuleId FRule
+ , topdownRules :: Assoc FCat [RuleId]
+ -- ^ used in 'GF.Parsing.MCFG.Active' (Earley):
+ -- , emptyRules :: [RuleId]
+ , epsilonRules :: [RuleId]
+ -- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury):
+ , leftcornerCats :: Assoc FCat [RuleId]
+ , leftcornerTokens :: Assoc FToken [RuleId]
+ -- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury):
+ , grammarCats :: [FCat]
+ , grammarToks :: [FToken]
+ , startupCats :: Map.Map CId [FCat]
+ }
+
+
+fcatString, fcatInt, fcatFloat, fcatVar :: Int
+fcatString = (-1)
+fcatInt = (-2)
+fcatFloat = (-3)
+fcatVar = (-4)
+
+
+-- print statistics
+
+statGFCC :: PGF -> String
+statGFCC pgf = unlines [
+ "Abstract\t" ++ prCId (absname pgf),
+ "Concretes\t" ++ unwords (map prCId (cncnames pgf)),
+ "Categories\t" ++ unwords (map prCId (Map.keys (cats (abstract pgf))))
+ ]
+
+-- merge two GFCCs; fails is differens absnames; priority to second arg
+
+unionPGF :: PGF -> PGF -> PGF
+unionPGF one two = case absname one of
+ n | n == wildCId -> two -- extending empty grammar
+ | n == absname two -> one { -- extending grammar with same abstract
+ concretes = Map.union (concretes two) (concretes one),
+ cncnames = union (cncnames two) (cncnames one)
+ }
+ _ -> one -- abstracts don't match ---- print error msg
+
+emptyPGF :: PGF
+emptyPGF = PGF {
+ absname = wildCId,
+ cncnames = [] ,
+ gflags = Map.empty,
+ abstract = error "empty grammar, no abstract",
+ concretes = Map.empty
+ }
+
+-- encode idenfifiers and strings in UTF8
+
+utf8GFCC :: PGF -> PGF
+utf8GFCC pgf = pgf {
+ concretes = Map.map u8concr (concretes pgf)
+ }
+ where
+ u8concr cnc = cnc {
+ lins = Map.map u8term (lins cnc),
+ opers = Map.map u8term (opers cnc)
+ }
+ u8term = convertStringsInTerm encodeUTF8
+
+---- TODO: convert identifiers and flags
+
+convertStringsInTerm conv t = case t of
+ K (KS s) -> K (KS (conv s))
+ W s r -> W (conv s) (convs r)
+ R ts -> R $ map convs ts
+ S ts -> S $ map convs ts
+ FV ts -> FV $ map convs ts
+ P u v -> P (convs u) (convs v)
+ _ -> t
+ where
+ convs = convertStringsInTerm conv
+