diff options
| author | aarne <aarne@cs.chalmers.se> | 2008-06-25 16:54:35 +0000 |
|---|---|---|
| committer | aarne <aarne@cs.chalmers.se> | 2008-06-25 16:54:35 +0000 |
| commit | e9e80fc389365e24d4300d7d5390c7d833a96c50 (patch) | |
| tree | f0b58473adaa670bd8fc52ada419d8cad470ee03 /src/PGF/Data.hs | |
| parent | b96b36f43de3e2f8b58d5f539daa6f6d47f25870 (diff) | |
changed names of resource-1.3; added a note on homepage on release
Diffstat (limited to 'src/PGF/Data.hs')
| -rw-r--r-- | src/PGF/Data.hs | 201 |
1 files changed, 201 insertions, 0 deletions
diff --git a/src/PGF/Data.hs b/src/PGF/Data.hs new file mode 100644 index 000000000..3f9aaa6ab --- /dev/null +++ b/src/PGF/Data.hs @@ -0,0 +1,201 @@ +module PGF.Data where + +import PGF.CId +import GF.Text.UTF8 +import GF.Data.Assoc + +import qualified Data.Map as Map +import Data.List +import Data.Array + +-- internal datatypes for PGF + +-- | An abstract data type representing multilingual grammar +-- in Portable Grammar Format. +data PGF = PGF { + absname :: CId , + cncnames :: [CId] , + gflags :: Map.Map CId String, -- value of a global flag + abstract :: Abstr , + concretes :: Map.Map CId Concr + } + +data Abstr = Abstr { + aflags :: Map.Map CId String, -- value of a flag + funs :: Map.Map CId (Type,Expr), -- type and def of a fun + cats :: Map.Map CId [Hypo], -- context of a cat + catfuns :: Map.Map CId [CId] -- funs to a cat (redundant, for fast lookup) + } + +data Concr = Concr { + cflags :: Map.Map CId String, -- value of a flag + lins :: Map.Map CId Term, -- lin of a fun + opers :: Map.Map CId Term, -- oper generated by subex elim + lincats :: Map.Map CId Term, -- lin type of a cat + lindefs :: Map.Map CId Term, -- lin default of a cat + printnames :: Map.Map CId Term, -- printname of a cat or a fun + paramlincats :: Map.Map CId Term, -- lin type of cat, with printable param names + parser :: Maybe ParserInfo -- parser + } + +data Type = + DTyp [Hypo] CId [Expr] + deriving (Eq,Ord,Show) + +data Literal = + LStr String -- ^ string constant + | LInt Integer -- ^ integer constant + | LFlt Double -- ^ floating point constant + deriving (Eq,Ord,Show) + +-- | The tree is an evaluated expression in the abstract syntax +-- of the grammar. The type is especially restricted to not +-- allow unapplied lambda abstractions. The tree is used directly +-- from the linearizer and is produced directly from the parser. +data Tree = + Abs [CId] Tree -- ^ lambda abstraction. The list of variables is non-empty + | Var CId -- ^ variable + | Fun CId [Tree] -- ^ function application + | Lit Literal -- ^ literal + | Meta Int -- ^ meta variable + deriving (Show, Eq, Ord) + +-- | An expression represents a potentially unevaluated expression +-- in the abstract syntax of the grammar. It can be evaluated with +-- the 'expr2tree' function and then linearized or it can be used +-- directly in the dependent types. +data Expr = + EAbs CId Expr -- ^ lambda abstraction + | EApp Expr Expr -- ^ application + | ELit Literal -- ^ literal + | EMeta Int -- ^ meta variable + | EVar CId -- ^ variable or function reference + | EEq [Equation] -- ^ lambda function defined as a set of equations with pattern matching + deriving (Eq,Ord,Show) + +data Term = + R [Term] + | P Term Term + | S [Term] + | K Tokn + | V Int + | C Int + | F CId + | FV [Term] + | W String Term + | TM String + deriving (Eq,Ord,Show) + +data Tokn = + KS String + | KP [String] [Alternative] + deriving (Eq,Ord,Show) + +data Alternative = + Alt [String] [String] + deriving (Eq,Ord,Show) + +data Hypo = + Hyp CId Type + deriving (Eq,Ord,Show) + +-- | The equation is used to define lambda function as a sequence +-- of equations with pattern matching. The list of 'Expr' represents +-- the patterns and the second 'Expr' is the function body for this +-- equation. +data Equation = + Equ [Expr] Expr + deriving (Eq,Ord,Show) + + +type FToken = String +type FCat = Int +type FIndex = Int +data FSymbol + = FSymCat {-# UNPACK #-} !FIndex {-# UNPACK #-} !Int + | FSymTok FToken +type Profile = [Int] +type FPointPos = Int +type FGrammar = ([FRule], Map.Map CId [FCat]) +data FRule = FRule CId [Profile] [FCat] FCat (Array FIndex (Array FPointPos FSymbol)) + +type RuleId = Int + +data ParserInfo + = ParserInfo { allRules :: Array RuleId FRule + , topdownRules :: Assoc FCat [RuleId] + -- ^ used in 'GF.Parsing.MCFG.Active' (Earley): + -- , emptyRules :: [RuleId] + , epsilonRules :: [RuleId] + -- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury): + , leftcornerCats :: Assoc FCat [RuleId] + , leftcornerTokens :: Assoc FToken [RuleId] + -- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury): + , grammarCats :: [FCat] + , grammarToks :: [FToken] + , startupCats :: Map.Map CId [FCat] + } + + +fcatString, fcatInt, fcatFloat, fcatVar :: Int +fcatString = (-1) +fcatInt = (-2) +fcatFloat = (-3) +fcatVar = (-4) + + +-- print statistics + +statGFCC :: PGF -> String +statGFCC pgf = unlines [ + "Abstract\t" ++ prCId (absname pgf), + "Concretes\t" ++ unwords (map prCId (cncnames pgf)), + "Categories\t" ++ unwords (map prCId (Map.keys (cats (abstract pgf)))) + ] + +-- merge two GFCCs; fails is differens absnames; priority to second arg + +unionPGF :: PGF -> PGF -> PGF +unionPGF one two = case absname one of + n | n == wildCId -> two -- extending empty grammar + | n == absname two -> one { -- extending grammar with same abstract + concretes = Map.union (concretes two) (concretes one), + cncnames = union (cncnames two) (cncnames one) + } + _ -> one -- abstracts don't match ---- print error msg + +emptyPGF :: PGF +emptyPGF = PGF { + absname = wildCId, + cncnames = [] , + gflags = Map.empty, + abstract = error "empty grammar, no abstract", + concretes = Map.empty + } + +-- encode idenfifiers and strings in UTF8 + +utf8GFCC :: PGF -> PGF +utf8GFCC pgf = pgf { + concretes = Map.map u8concr (concretes pgf) + } + where + u8concr cnc = cnc { + lins = Map.map u8term (lins cnc), + opers = Map.map u8term (opers cnc) + } + u8term = convertStringsInTerm encodeUTF8 + +---- TODO: convert identifiers and flags + +convertStringsInTerm conv t = case t of + K (KS s) -> K (KS (conv s)) + W s r -> W (conv s) (convs r) + R ts -> R $ map convs ts + S ts -> S $ map convs ts + FV ts -> FV $ map convs ts + P u v -> P (convs u) (convs v) + _ -> t + where + convs = convertStringsInTerm conv + |
