diff options
| author | aarne <aarne@cs.chalmers.se> | 2008-05-21 09:26:44 +0000 |
|---|---|---|
| committer | aarne <aarne@cs.chalmers.se> | 2008-05-21 09:26:44 +0000 |
| commit | 055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch) | |
| tree | 0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/GFCC | |
| parent | 915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff) | |
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
Diffstat (limited to 'src-3.0/GF/GFCC')
28 files changed, 4550 insertions, 0 deletions
diff --git a/src-3.0/GF/GFCC/API.hs b/src-3.0/GF/GFCC/API.hs new file mode 100644 index 000000000..c266a5553 --- /dev/null +++ b/src-3.0/GF/GFCC/API.hs @@ -0,0 +1,140 @@ +---------------------------------------------------------------------- +-- | +-- Module : GFCCAPI +-- Maintainer : Aarne Ranta +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: +-- > CVS $Author: +-- > CVS $Revision: +-- +-- Reduced Application Programmer's Interface to GF, meant for +-- embedded GF systems. AR 19/9/2007 +----------------------------------------------------------------------------- + +module GF.GFCC.API where + +import GF.GFCC.Linearize +import GF.GFCC.Generate +import GF.GFCC.Macros +import GF.GFCC.DataGFCC +import GF.GFCC.CId +import GF.GFCC.Raw.ConvertGFCC +import GF.GFCC.Raw.ParGFCCRaw +import GF.Command.PPrTree + +import GF.Data.ErrM + +import GF.Parsing.FCFG + +--import GF.Data.Operations +--import GF.Infra.UseIO +import qualified Data.Map as Map +import System.Random (newStdGen) +import System.Directory (doesFileExist) + + +-- This API is meant to be used when embedding GF grammars in Haskell +-- programs. The embedded system is supposed to use the +-- .gfcc grammar format, which is first produced by the gf program. + +--------------------------------------------------- +-- Interface +--------------------------------------------------- + +data MultiGrammar = MultiGrammar {gfcc :: GFCC} +type Language = String +type Category = String +type Tree = Exp + +file2grammar :: FilePath -> IO MultiGrammar + +linearize :: MultiGrammar -> Language -> Tree -> String +parse :: MultiGrammar -> Language -> Category -> String -> [Tree] + +linearizeAll :: MultiGrammar -> Tree -> [String] +linearizeAllLang :: MultiGrammar -> Tree -> [(Language,String)] + +parseAll :: MultiGrammar -> Category -> String -> [[Tree]] +parseAllLang :: MultiGrammar -> Category -> String -> [(Language,[Tree])] + +generateAll :: MultiGrammar -> Category -> [Tree] +generateRandom :: MultiGrammar -> Category -> IO [Tree] +generateAllDepth :: MultiGrammar -> Category -> Maybe Int -> [Tree] + +readTree :: MultiGrammar -> String -> Tree +showTree :: Tree -> String + +languages :: MultiGrammar -> [Language] +categories :: MultiGrammar -> [Category] + +startCat :: MultiGrammar -> Category + +--------------------------------------------------- +-- Implementation +--------------------------------------------------- + +file2grammar f = do + gfcc <- file2gfcc f + return (MultiGrammar gfcc) + +file2gfcc f = do + s <- readFileIf f + g <- parseGrammar s + return $ toGFCC g + +linearize mgr lang = GF.GFCC.Linearize.linearize (gfcc mgr) (CId lang) + +parse mgr lang cat s = + case lookParser (gfcc mgr) (CId lang) of + Nothing -> error "no parser" + Just pinfo -> case parseFCF "bottomup" pinfo (CId cat) (words s) of + Ok x -> x + Bad s -> error s + +linearizeAll mgr = map snd . linearizeAllLang mgr +linearizeAllLang mgr t = + [(lang,linearThis mgr lang t) | lang <- languages mgr] + +parseAll mgr cat = map snd . parseAllLang mgr cat + +parseAllLang mgr cat s = + [(lang,ts) | lang <- languages mgr, let ts = parse mgr lang cat s, not (null ts)] + +generateRandom mgr cat = do + gen <- newStdGen + return $ genRandom gen (gfcc mgr) (CId cat) + +generateAll mgr cat = generate (gfcc mgr) (CId cat) Nothing +generateAllDepth mgr cat = generate (gfcc mgr) (CId cat) + +readTree _ = pTree + +showTree = prExp + +prIdent :: CId -> String +prIdent (CId s) = s + +abstractName mgr = prIdent (absname (gfcc mgr)) + +languages mgr = [l | CId l <- cncnames (gfcc mgr)] + +categories mgr = [c | CId c <- Map.keys (cats (abstract (gfcc mgr)))] + +startCat mgr = lookStartCat (gfcc mgr) + +emptyMultiGrammar = MultiGrammar emptyGFCC + +------------ for internal use only + +linearThis = GF.GFCC.API.linearize + +err f g ex = case ex of + Ok x -> g x + Bad s -> f s + +readFileIf f = do + b <- doesFileExist f + if b then readFile f + else putStrLn ("file " ++ f ++ " not found") >> return "" diff --git a/src-3.0/GF/GFCC/CId.hs b/src-3.0/GF/GFCC/CId.hs new file mode 100644 index 000000000..e4efa98ba --- /dev/null +++ b/src-3.0/GF/GFCC/CId.hs @@ -0,0 +1,14 @@ +module GF.GFCC.CId ( + module GF.GFCC.Raw.AbsGFCCRaw, + prCId, + cId + ) where + +import GF.GFCC.Raw.AbsGFCCRaw (CId(CId)) + +prCId :: CId -> String +prCId (CId s) = s + +cId :: String -> CId +cId = CId + diff --git a/src-3.0/GF/GFCC/CheckGFCC.hs b/src-3.0/GF/GFCC/CheckGFCC.hs new file mode 100644 index 000000000..d59dba1a9 --- /dev/null +++ b/src-3.0/GF/GFCC/CheckGFCC.hs @@ -0,0 +1,186 @@ +module GF.GFCC.CheckGFCC (checkGFCC, checkGFCCio, checkGFCCmaybe) where + +import GF.GFCC.CId +import GF.GFCC.Macros +import GF.GFCC.DataGFCC +import GF.Data.ErrM + +import qualified Data.Map as Map +import Control.Monad +import Debug.Trace + +checkGFCCio :: GFCC -> IO GFCC +checkGFCCio gfcc = case checkGFCC gfcc of + Ok (gc,b) -> do + putStrLn $ if b then "OK" else "Corrupted GFCC" + return gc + Bad s -> do + putStrLn s + error "building GFCC failed" + +---- needed in old Custom +checkGFCCmaybe :: GFCC -> Maybe GFCC +checkGFCCmaybe gfcc = case checkGFCC gfcc of + Ok (gc,b) -> return gc + Bad s -> Nothing + +checkGFCC :: GFCC -> Err (GFCC,Bool) +checkGFCC gfcc = do + (cs,bs) <- mapM (checkConcrete gfcc) + (Map.assocs (concretes gfcc)) >>= return . unzip + return (gfcc {concretes = Map.fromAscList cs}, and bs) + + +-- errors are non-fatal; replace with 'fail' to change this +msg s = trace s (return ()) + +andMapM :: Monad m => (a -> m Bool) -> [a] -> m Bool +andMapM f xs = mapM f xs >>= return . and + +labelBoolErr :: String -> Err (x,Bool) -> Err (x,Bool) +labelBoolErr ms iob = do + (x,b) <- iob + if b then return (x,b) else (msg ms >> return (x,b)) + + +checkConcrete :: GFCC -> (CId,Concr) -> Err ((CId,Concr),Bool) +checkConcrete gfcc (lang,cnc) = + labelBoolErr ("happened in language " ++ printCId lang) $ do + (rs,bs) <- mapM checkl (Map.assocs (lins cnc)) >>= return . unzip + return ((lang,cnc{lins = Map.fromAscList rs}),and bs) + where + checkl = checkLin gfcc lang + +checkLin :: GFCC -> CId -> (CId,Term) -> Err ((CId,Term),Bool) +checkLin gfcc lang (f,t) = + labelBoolErr ("happened in function " ++ printCId f) $ do + (t',b) <- checkTerm (lintype gfcc lang f) t --- $ inline gfcc lang t + return ((f,t'),b) + +inferTerm :: [CType] -> Term -> Err (Term,CType) +inferTerm args trm = case trm of + K _ -> returnt str + C i -> returnt $ ints i + V i -> do + testErr (i < length args) ("too large index " ++ show i) + returnt $ args !! i + S ts -> do + (ts',tys) <- mapM infer ts >>= return . unzip + let tys' = filter (/=str) tys + testErr (null tys') + ("expected Str in " ++ show trm ++ " not " ++ unwords (map show tys')) + return (S ts',str) + R ts -> do + (ts',tys) <- mapM infer ts >>= return . unzip + return $ (R ts',tuple tys) + P t u -> do + (t',tt) <- infer t + (u',tu) <- infer u + case tt of + R tys -> case tu of + R vs -> infer $ foldl P t' [P u' (C i) | i <- [0 .. length vs - 1]] + --- R [v] -> infer $ P t v + --- R (v:vs) -> infer $ P (head tys) (R vs) + + C i -> do + testErr (i < length tys) + ("required more than " ++ show i ++ " fields in " ++ show (R tys)) + return (P t' u', tys !! i) -- record: index must be known + _ -> do + let typ = head tys + testErr (all (==typ) tys) ("different types in table " ++ show trm) + return (P t' u', typ) -- table: types must be same + _ -> Bad $ "projection from " ++ show t ++ " : " ++ show tt + FV [] -> returnt tm0 ---- + FV (t:ts) -> do + (t',ty) <- infer t + (ts',tys) <- mapM infer ts >>= return . unzip + testErr (all (eqType ty) tys) ("different types in variants " ++ show trm) + return (FV (t':ts'),ty) + W s r -> infer r + _ -> Bad ("no type inference for " ++ show trm) + where + returnt ty = return (trm,ty) + infer = inferTerm args + +checkTerm :: LinType -> Term -> Err (Term,Bool) +checkTerm (args,val) trm = case inferTerm args trm of + Ok (t,ty) -> if eqType ty val + then return (t,True) + else do + msg ("term: " ++ show trm ++ + "\nexpected type: " ++ show val ++ + "\ninferred type: " ++ show ty) + return (t,False) + Bad s -> do + msg s + return (trm,False) + +eqType :: CType -> CType -> Bool +eqType inf exp = case (inf,exp) of + (C k, C n) -> k <= n -- only run-time corr. + (R rs,R ts) -> length rs == length ts && and [eqType r t | (r,t) <- zip rs ts] + (TM _, _) -> True ---- for variants [] ; not safe + _ -> inf == exp + +-- should be in a generic module, but not in the run-time DataGFCC + +type CType = Term +type LinType = ([CType],CType) + +tuple :: [CType] -> CType +tuple = R + +ints :: Int -> CType +ints = C + +str :: CType +str = S [] + +lintype :: GFCC -> CId -> CId -> LinType +lintype gfcc lang fun = case typeSkeleton (lookType gfcc fun) of + (cs,c) -> (map vlinc cs, linc c) ---- HOAS + where + linc = lookLincat gfcc lang + vlinc (0,c) = linc c + vlinc (i,c) = case linc c of + R ts -> R (ts ++ replicate i str) + +inline :: GFCC -> CId -> Term -> Term +inline gfcc lang t = case t of + F c -> inl $ look c + _ -> composSafeOp inl t + where + inl = inline gfcc lang + look = lookLin gfcc lang + +composOp :: Monad m => (Term -> m Term) -> Term -> m Term +composOp f trm = case trm of + R ts -> liftM R $ mapM f ts + S ts -> liftM S $ mapM f ts + FV ts -> liftM FV $ mapM f ts + P t u -> liftM2 P (f t) (f u) + W s t -> liftM (W s) $ f t + _ -> return trm + +composSafeOp :: (Term -> Term) -> Term -> Term +composSafeOp f = maybe undefined id . composOp (return . f) + +-- from GF.Data.Oper + +maybeErr :: String -> Maybe a -> Err a +maybeErr s = maybe (Bad s) Ok + +testErr :: Bool -> String -> Err () +testErr cond msg = if cond then return () else Bad msg + +errVal :: a -> Err a -> a +errVal a = err (const a) id + +errIn :: String -> Err a -> Err a +errIn msg = err (\s -> Bad (s ++ "\nOCCURRED IN\n" ++ msg)) return + +err :: (String -> b) -> (a -> b) -> Err a -> b +err d f e = case e of + Ok a -> f a + Bad s -> d s diff --git a/src-3.0/GF/GFCC/ComposOp.hs b/src-3.0/GF/GFCC/ComposOp.hs new file mode 100644 index 000000000..de2522bc7 --- /dev/null +++ b/src-3.0/GF/GFCC/ComposOp.hs @@ -0,0 +1,30 @@ +{-# OPTIONS_GHC -fglasgow-exts #-} +module GF.GFCC.ComposOp (Compos(..),composOp,composOpM,composOpM_,composOpMonoid, + composOpMPlus,composOpFold) where + +import Control.Monad.Identity +import Data.Monoid + +class Compos t where + compos :: (forall a. a -> m a) -> (forall a b. m (a -> b) -> m a -> m b) + -> (forall a. t a -> m (t a)) -> t c -> m (t c) + +composOp :: Compos t => (forall a. t a -> t a) -> t c -> t c +composOp f = runIdentity . composOpM (Identity . f) + +composOpM :: (Compos t, Monad m) => (forall a. t a -> m (t a)) -> t c -> m (t c) +composOpM = compos return ap + +composOpM_ :: (Compos t, Monad m) => (forall a. t a -> m ()) -> t c -> m () +composOpM_ = composOpFold (return ()) (>>) + +composOpMonoid :: (Compos t, Monoid m) => (forall a. t a -> m) -> t c -> m +composOpMonoid = composOpFold mempty mappend + +composOpMPlus :: (Compos t, MonadPlus m) => (forall a. t a -> m b) -> t c -> m b +composOpMPlus = composOpFold mzero mplus + +composOpFold :: Compos t => b -> (b -> b -> b) -> (forall a. t a -> b) -> t c -> b +composOpFold z c f = unC . compos (\_ -> C z) (\(C x) (C y) -> C (c x y)) (C . f) + +newtype C b a = C { unC :: b } diff --git a/src-3.0/GF/GFCC/DataGFCC.hs b/src-3.0/GF/GFCC/DataGFCC.hs new file mode 100644 index 000000000..077d62b19 --- /dev/null +++ b/src-3.0/GF/GFCC/DataGFCC.hs @@ -0,0 +1,152 @@ +module GF.GFCC.DataGFCC where + +import GF.GFCC.CId +import GF.Infra.CompactPrint +import GF.Text.UTF8 +import GF.Formalism.FCFG +import GF.Parsing.FCFG.PInfo + +import Data.Map +import Data.List + +-- internal datatypes for GFCC + +data GFCC = GFCC { + absname :: CId , + cncnames :: [CId] , + gflags :: Map CId String, -- value of a global flag + abstract :: Abstr , + concretes :: Map CId Concr + } + +data Abstr = Abstr { + aflags :: Map CId String, -- value of a flag + funs :: Map CId (Type,Exp), -- type and def of a fun + cats :: Map CId [Hypo], -- context of a cat + catfuns :: Map CId [CId] -- funs to a cat (redundant, for fast lookup) + } + +data Concr = Concr { + cflags :: Map CId String, -- value of a flag + lins :: Map CId Term, -- lin of a fun + opers :: Map CId Term, -- oper generated by subex elim + lincats :: Map CId Term, -- lin type of a cat + lindefs :: Map CId Term, -- lin default of a cat + printnames :: Map CId Term, -- printname of a cat or a fun + paramlincats :: Map CId Term, -- lin type of cat, with printable param names + parser :: Maybe FCFPInfo -- parser + } + +data Type = + DTyp [Hypo] CId [Exp] + deriving (Eq,Ord,Show) + +data Exp = + DTr [CId] Atom [Exp] + | EEq [Equation] + deriving (Eq,Ord,Show) + +data Atom = + AC CId + | AS String + | AI Integer + | AF Double + | AM Integer + | AV CId + deriving (Eq,Ord,Show) + +data Term = + R [Term] + | P Term Term + | S [Term] + | K Tokn + | V Int + | C Int + | F CId + | FV [Term] + | W String Term + | TM String + | RP Term Term + deriving (Eq,Ord,Show) + +data Tokn = + KS String + | KP [String] [Variant] + deriving (Eq,Ord,Show) + +data Variant = + Var [String] [String] + deriving (Eq,Ord,Show) + +data Hypo = + Hyp CId Type + deriving (Eq,Ord,Show) + +data Equation = + Equ [Exp] Exp + deriving (Eq,Ord,Show) + +-- print statistics + +statGFCC :: GFCC -> String +statGFCC gfcc = unlines [ + "Abstract\t" ++ pr (absname gfcc), + "Concretes\t" ++ unwords (lmap pr (cncnames gfcc)), + "Categories\t" ++ unwords (lmap pr (keys (cats (abstract gfcc)))) + ] + where pr (CId s) = s + +printCId :: CId -> String +printCId (CId s) = s + +-- merge two GFCCs; fails is differens absnames; priority to second arg + +unionGFCC :: GFCC -> GFCC -> GFCC +unionGFCC one two = case absname one of + CId "" -> two -- extending empty grammar + n | n == absname two -> one { -- extending grammar with same abstract + concretes = Data.Map.union (concretes two) (concretes one), + cncnames = Data.List.union (cncnames two) (cncnames one) + } + _ -> one -- abstracts don't match ---- print error msg + +emptyGFCC :: GFCC +emptyGFCC = GFCC { + absname = CId "", + cncnames = [] , + gflags = empty, + abstract = error "empty grammar, no abstract", + concretes = empty + } + +-- default map and filter are for Map here +lmap = Prelude.map +lfilter = Prelude.filter +mmap = Data.Map.map + +-- encode idenfifiers and strings in UTF8 + +utf8GFCC :: GFCC -> GFCC +utf8GFCC gfcc = gfcc { + concretes = mmap u8concr (concretes gfcc) + } + where + u8concr cnc = cnc { + lins = mmap u8term (lins cnc), + opers = mmap u8term (opers cnc) + } + u8term = convertStringsInTerm encodeUTF8 + +---- TODO: convert identifiers and flags + +convertStringsInTerm conv t = case t of + K (KS s) -> K (KS (conv s)) + W s r -> W (conv s) (convs r) + R ts -> R $ lmap convs ts + S ts -> S $ lmap convs ts + FV ts -> FV $ lmap convs ts + P u v -> P (convs u) (convs v) + _ -> t + where + convs = convertStringsInTerm conv + diff --git a/src-3.0/GF/GFCC/GFCC.cf b/src-3.0/GF/GFCC/GFCC.cf new file mode 100644 index 000000000..96d68649b --- /dev/null +++ b/src-3.0/GF/GFCC/GFCC.cf @@ -0,0 +1,81 @@ +Grm. Grammar ::= + "grammar" CId "(" [CId] ")" "(" [Flag] ")" ";" + Abstract ";" + [Concrete] ; + +Abs. Abstract ::= + "abstract" "{" + "flags" [Flag] + "fun" [FunDef] + "cat" [CatDef] + "}" ; + +Cnc. Concrete ::= + "concrete" CId "{" + "flags" [Flag] + "lin" [LinDef] + "oper" [LinDef] + "lincat" [LinDef] + "lindef" [LinDef] + "printname" [LinDef] + "param" [LinDef] -- lincats with param value names + "}" ; + +Flg. Flag ::= CId "=" String ; +Cat. CatDef ::= CId "[" [Hypo] "]" ; + +Fun. FunDef ::= CId ":" Type "=" Exp ; +Lin. LinDef ::= CId "=" Term ; + +DTyp. Type ::= "[" [Hypo] "]" CId [Exp] ; -- dependent type +DTr. Exp ::= "[" "(" [CId] ")" Atom [Exp] "]" ; -- term with bindings + +AC. Atom ::= CId ; +AS. Atom ::= String ; +AI. Atom ::= Integer ; +AF. Atom ::= Double ; +AM. Atom ::= "?" Integer ; + +R. Term ::= "[" [Term] "]" ; -- record/table +P. Term ::= "(" Term "!" Term ")" ; -- projection/selection +S. Term ::= "(" [Term] ")" ; -- concatenated sequence +K. Term ::= Tokn ; -- token +V. Term ::= "$" Integer ; -- argument +C. Term ::= Integer ; -- parameter value/label +F. Term ::= CId ; -- global constant +FV. Term ::= "[|" [Term] "|]" ; -- free variation +W. Term ::= "(" String "+" Term ")" ; -- prefix + suffix table +TM. Term ::= "?" ; -- lin of metavariable + +KS. Tokn ::= String ; +KP. Tokn ::= "[" "pre" [String] "[" [Variant] "]" "]" ; +Var. Variant ::= [String] "/" [String] ; + + +RP. Term ::= "(" Term "@" Term ")"; -- DEPRECATED: record parameter alias + +terminator Concrete ";" ; +terminator Flag ";" ; +terminator CatDef ";" ; +terminator FunDef ";" ; +terminator LinDef ";" ; +separator CId "," ; +separator Term "," ; +terminator Exp "" ; +terminator String "" ; +separator Variant "," ; + +token CId (('_' | letter) (letter | digit | '\'' | '_')*) ; + + +-- the following are needed if dependent types or HOAS or defs are present + +Hyp. Hypo ::= CId ":" Type ; +AV. Atom ::= "$" CId ; + +EEq. Exp ::= "{" [Equation] "}" ; -- list of pattern eqs; primitive: [] +Equ. Equation ::= [Exp] "->" Exp ; -- patterns are encoded as exps + +separator Hypo "," ; +terminator Equation ";" ; + diff --git a/src-3.0/GF/GFCC/Generate.hs b/src-3.0/GF/GFCC/Generate.hs new file mode 100644 index 000000000..63bdb3b9a --- /dev/null +++ b/src-3.0/GF/GFCC/Generate.hs @@ -0,0 +1,70 @@ +module GF.GFCC.Generate where + +import GF.GFCC.Macros +import GF.GFCC.DataGFCC +import GF.GFCC.CId + +import qualified Data.Map as M +import System.Random + +-- generate an infinite list of trees exhaustively +generate :: GFCC -> CId -> Maybe Int -> [Exp] +generate gfcc cat dp = concatMap (\i -> gener i cat) depths + where + gener 0 c = [tree (AC f) [] | (f, ([],_)) <- fns c] + gener i c = [ + tr | + (f, (cs,_)) <- fns c, + let alts = map (gener (i-1)) cs, + ts <- combinations alts, + let tr = tree (AC f) ts, + depth tr >= i + ] + fns c = [(f,catSkeleton ty) | (f,ty) <- functionsToCat gfcc c] + depths = maybe [0 ..] (\d -> [0..d]) dp + +-- generate an infinite list of trees randomly +genRandom :: StdGen -> GFCC -> CId -> [Exp] +genRandom gen gfcc cat = genTrees (randomRs (0.0, 1.0 :: Double) gen) cat where + + timeout = 47 -- give up + + genTrees ds0 cat = + let (ds,ds2) = splitAt (timeout+1) ds0 -- for time out, else ds + (t,k) = genTree ds cat + in (if k>timeout then id else (t:)) + (genTrees ds2 cat) -- else (drop k ds) + + genTree rs = gett rs where + gett ds (CId "String") = (tree (AS "foo") [], 1) + gett ds (CId "Int") = (tree (AI 12345) [], 1) + gett [] _ = (tree (AS "TIMEOUT") [], 1) ---- + gett ds cat = case fns cat of + [] -> (tree (AM 0) [],1) + fs -> let + d:ds2 = ds + (f,args) = getf d fs + (ts,k) = getts ds2 args + in (tree (AC f) ts, k+1) + getf d fs = let lg = (length fs) in + fs !! (floor (d * fromIntegral lg)) + getts ds cats = case cats of + c:cs -> let + (t, k) = gett ds c + (ts,ks) = getts (drop k ds) cs + in (t:ts, k + ks) + _ -> ([],0) + + fns cat = [(f,(fst (catSkeleton ty))) | (f,ty) <- functionsToCat gfcc cat] + + +{- +-- brute-force parsing method; only returns the first result +-- note: you cannot throw away rules with unknown words from the grammar +-- because it is not known which field in each rule may match the input + +searchParse :: Int -> GFCC -> CId -> [String] -> [Exp] +searchParse i gfcc cat ws = [t | t <- gen, s <- lins t, words s == ws] where + gen = take i $ generate gfcc cat + lins t = [linearize gfcc lang t | lang <- cncnames gfcc] +-} diff --git a/src-3.0/GF/GFCC/LexGFCC.hs b/src-3.0/GF/GFCC/LexGFCC.hs new file mode 100644 index 000000000..c86195e3d --- /dev/null +++ b/src-3.0/GF/GFCC/LexGFCC.hs @@ -0,0 +1,349 @@ +{-# OPTIONS -fglasgow-exts -cpp #-} +{-# LINE 3 "GF/GFCC/LexGFCC.x" #-} +{-# OPTIONS -fno-warn-incomplete-patterns #-} +module GF.GFCC.LexGFCC where + + + +#if __GLASGOW_HASKELL__ >= 603 +#include "ghcconfig.h" +#else +#include "config.h" +#endif +#if __GLASGOW_HASKELL__ >= 503 +import Data.Array +import Data.Char (ord) +import Data.Array.Base (unsafeAt) +#else +import Array +import Char (ord) +#endif +#if __GLASGOW_HASKELL__ >= 503 +import GHC.Exts +#else +import GlaExts +#endif +alex_base :: AlexAddr +alex_base = AlexA# "\x01\x00\x00\x00\x39\x00\x00\x00\x42\x00\x00\x00\x00\x00\x00\x00\xcb\xff\xff\xff\xeb\xff\xff\xff\x0b\x00\x00\x00\x9a\x00\x00\x00\x6a\x01\x00\x00\x00\x00\x00\x00\x15\x01\x00\x00\xd3\x00\x00\x00\x35\x00\x00\x00\xe5\x00\x00\x00\x3f\x00\x00\x00\xf0\x00\x00\x00\x1b\x01\x00\x00\xb8\x01\x00\x00"# + +alex_table :: AlexAddr +alex_table = AlexA# "\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x02\x00\x02\x00\x02\x00\x02\x00\x02\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x02\x00\x03\x00\x0a\x00\xff\xff\x03\x00\xff\xff\xff\xff\xff\xff\x03\x00\x03\x00\xff\xff\x03\x00\x03\x00\x06\x00\xff\xff\x03\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x03\x00\x03\x00\xff\xff\x03\x00\xff\xff\x03\x00\x03\x00\x02\x00\x02\x00\x02\x00\x02\x00\x02\x00\x03\x00\x03\x00\x03\x00\x00\x00\x02\x00\x02\x00\x02\x00\x02\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x04\x00\xff\xff\x03\x00\xff\xff\x07\x00\xff\xff\x02\x00\x0f\x00\x00\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0c\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x03\x00\x05\x00\x03\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x0a\x00\x00\x00\x00\x00\xff\xff\x07\x00\x0a\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\xff\xff\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x0d\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x09\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0a\x00\x00\x00\x00\x00\x11\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x0b\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x08\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x07\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x10\x00\x00\x00\x00\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x00\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x00\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"# + +alex_check :: AlexAddr +alex_check = AlexA# "\xff\xff\x00\x00\x01\x00\x02\x00\x03\x00\x04\x00\x05\x00\x06\x00\x07\x00\x08\x00\x09\x00\x0a\x00\x0b\x00\x0c\x00\x0d\x00\x0e\x00\x0f\x00\x10\x00\x11\x00\x12\x00\x13\x00\x14\x00\x15\x00\x16\x00\x17\x00\x18\x00\x19\x00\x1a\x00\x1b\x00\x1c\x00\x1d\x00\x1e\x00\x1f\x00\x20\x00\x21\x00\x22\x00\x23\x00\x24\x00\x25\x00\x26\x00\x27\x00\x28\x00\x29\x00\x2a\x00\x2b\x00\x2c\x00\x2d\x00\x2e\x00\x2f\x00\x30\x00\x31\x00\x32\x00\x33\x00\x34\x00\x35\x00\x36\x00\x37\x00\x38\x00\x39\x00\x3a\x00\x3b\x00\x3c\x00\x3d\x00\x3e\x00\x3f\x00\x40\x00\x09\x00\x0a\x00\x0b\x00\x0c\x00\x0d\x00\x7c\x00\x5d\x00\x3e\x00\xff\xff\x09\x00\x0a\x00\x0b\x00\x0c\x00\x0d\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x20\x00\xff\xff\xff\xff\x5b\x00\x5c\x00\x5d\x00\x5e\x00\x5f\x00\x60\x00\x20\x00\x2e\x00\xff\xff\x30\x00\x31\x00\x32\x00\x33\x00\x34\x00\x35\x00\x36\x00\x37\x00\x38\x00\x39\x00\x30\x00\x31\x00\x32\x00\x33\x00\x34\x00\x35\x00\x36\x00\x37\x00\x38\x00\x39\x00\xff\xff\xff\xff\xff\xff\x7b\x00\x7c\x00\x7d\x00\x7e\x00\x7f\x00\x80\x00\x81\x00\x82\x00\x83\x00\x84\x00\x85\x00\x86\x00\x87\x00\x88\x00\x89\x00\x8a\x00\x8b\x00\x8c\x00\x8d\x00\x8e\x00\x8f\x00\x90\x00\x91\x00\x92\x00\x93\x00\x94\x00\x95\x00\x96\x00\x97\x00\x98\x00\x99\x00\x9a\x00\x9b\x00\x9c\x00\x9d\x00\x9e\x00\x9f\x00\xa0\x00\xa1\x00\xa2\x00\xa3\x00\xa4\x00\xa5\x00\xa6\x00\xa7\x00\xa8\x00\xa9\x00\xaa\x00\xab\x00\xac\x00\xad\x00\xae\x00\xaf\x00\xb0\x00\xb1\x00\xb2\x00\xb3\x00\xb4\x00\xb5\x00\xb6\x00\xb7\x00\xb8\x00\xb9\x00\xba\x00\xbb\x00\xbc\x00\xbd\x00\xbe\x00\xbf\x00\x27\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x30\x00\x31\x00\x32\x00\x33\x00\x34\x00\x35\x00\x36\x00\x37\x00\x38\x00\x39\x00\xff\xff\xff\xff\xff\xff\xff\xff\xd7\x00\xff\xff\xff\xff\x41\x00\x42\x00\x43\x00\x44\x00\x45\x00\x46\x00\x47\x00\x48\x00\x49\x00\x4a\x00\x4b\x00\x4c\x00\x4d\x00\x4e\x00\x4f\x00\x50\x00\x51\x00\x52\x00\x53\x00\x54\x00\x55\x00\x56\x00\x57\x00\x58\x00\x59\x00\x5a\x00\x22\x00\xff\xff\xff\xff\xf7\x00\x5f\x00\x27\x00\x61\x00\x62\x00\x63\x00\x64\x00\x65\x00\x66\x00\x67\x00\x68\x00\x69\x00\x6a\x00\x6b\x00\x6c\x00\x6d\x00\x6e\x00\x6f\x00\x70\x00\x71\x00\x72\x00\x73\x00\x74\x00\x75\x00\x76\x00\x77\x00\x78\x00\x79\x00\x7a\x00\x30\x00\x31\x00\x32\x00\x33\x00\x34\x00\x35\x00\x36\x00\x37\x00\x38\x00\x39\x00\x0a\x00\x30\x00\x31\x00\x32\x00\x33\x00\x34\x00\x35\x00\x36\x00\x37\x00\x38\x00\x39\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x5c\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x22\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x6e\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x74\x00\xff\xff\xff\xff\x65\x00\x30\x00\x31\x00\x32\x00\x33\x00\x34\x00\x35\x00\x36\x00\x37\x00\x38\x00\x39\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xc0\x00\xc1\x00\xc2\x00\xc3\x00\xc4\x00\xc5\x00\xc6\x00\xc7\x00\xc8\x00\xc9\x00\xca\x00\xcb\x00\xcc\x00\xcd\x00\xce\x00\xcf\x00\xd0\x00\xd1\x00\xd2\x00\xd3\x00\xd4\x00\xd5\x00\xd6\x00\x5c\x00\xd8\x00\xd9\x00\xda\x00\xdb\x00\xdc\x00\xdd\x00\xde\x00\xdf\x00\xe0\x00\xe1\x00\xe2\x00\xe3\x00\xe4\x00\xe5\x00\xe6\x00\xe7\x00\xe8\x00\xe9\x00\xea\x00\xeb\x00\xec\x00\xed\x00\xee\x00\xef\x00\xf0\x00\xf1\x00\xf2\x00\xf3\x00\xf4\x00\xf5\x00\xf6\x00\x27\x00\xf8\x00\xf9\x00\xfa\x00\xfb\x00\xfc\x00\xfd\x00\xfe\x00\xff\x00\x30\x00\x31\x00\x32\x00\x33\x00\x34\x00\x35\x00\x36\x00\x37\x00\x38\x00\x39\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x41\x00\x42\x00\x43\x00\x44\x00\x45\x00\x46\x00\x47\x00\x48\x00\x49\x00\x4a\x00\x4b\x00\x4c\x00\x4d\x00\x4e\x00\x4f\x00\x50\x00\x51\x00\x52\x00\x53\x00\x54\x00\x55\x00\x56\x00\x57\x00\x58\x00\x59\x00\x5a\x00\xff\xff\xff\xff\xff\xff\xff\xff\x5f\x00\xff\xff\x61\x00\x62\x00\x63\x00\x64\x00\x65\x00\x66\x00\x67\x00\x68\x00\x69\x00\x6a\x00\x6b\x00\x6c\x00\x6d\x00\x6e\x00\x6f\x00\x70\x00\x71\x00\x72\x00\x73\x00\x74\x00\x75\x00\x76\x00\x77\x00\x78\x00\x79\x00\x7a\x00\x2d\x00\xff\xff\xff\xff\x30\x00\x31\x00\x32\x00\x33\x00\x34\x00\x35\x00\x36\x00\x37\x00\x38\x00\x39\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xc0\x00\xc1\x00\xc2\x00\xc3\x00\xc4\x00\xc5\x00\xc6\x00\xc7\x00\xc8\x00\xc9\x00\xca\x00\xcb\x00\xcc\x00\xcd\x00\xce\x00\xcf\x00\xd0\x00\xd1\x00\xd2\x00\xd3\x00\xd4\x00\xd5\x00\xd6\x00\xff\xff\xd8\x00\xd9\x00\xda\x00\xdb\x00\xdc\x00\xdd\x00\xde\x00\xdf\x00\xe0\x00\xe1\x00\xe2\x00\xe3\x00\xe4\x00\xe5\x00\xe6\x00\xe7\x00\xe8\x00\xe9\x00\xea\x00\xeb\x00\xec\x00\xed\x00\xee\x00\xef\x00\xf0\x00\xf1\x00\xf2\x00\xf3\x00\xf4\x00\xf5\x00\xf6\x00\xff\xff\xf8\x00\xf9\x00\xfa\x00\xfb\x00\xfc\x00\xfd\x00\xfe\x00\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"# + +alex_deflt :: AlexAddr +alex_deflt = AlexA# "\x08\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x0a\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"# + +alex_accept = listArray (0::Int,17) [[],[],[(AlexAccSkip)],[(AlexAcc (alex_action_1))],[(AlexAcc (alex_action_1))],[],[],[(AlexAcc (alex_action_2))],[(AlexAcc (alex_action_2))],[(AlexAcc (alex_action_4))],[],[],[(AlexAcc (alex_action_5))],[(AlexAcc (alex_action_6))],[(AlexAcc (alex_action_6))],[],[],[]] +{-# LINE 33 "GF/GFCC/LexGFCC.x" #-} + +tok f p s = f p s + +share :: String -> String +share = id + +data Tok = + TS !String -- reserved words and symbols + | TL !String -- string literals + | TI !String -- integer literals + | TV !String -- identifiers + | TD !String -- double precision float literals + | TC !String -- character literals + | T_CId !String + + deriving (Eq,Show,Ord) + +data Token = + PT Posn Tok + | Err Posn + deriving (Eq,Show,Ord) + +tokenPos (PT (Pn _ l _) _ :_) = "line " ++ show l +tokenPos (Err (Pn _ l _) :_) = "line " ++ show l +tokenPos _ = "end of file" + +posLineCol (Pn _ l c) = (l,c) +mkPosToken t@(PT p _) = (posLineCol p, prToken t) + +prToken t = case t of + PT _ (TS s) -> s + PT _ (TI s) -> s + PT _ (TV s) -> s + PT _ (TD s) -> s + PT _ (TC s) -> s + PT _ (T_CId s) -> s + + _ -> show t + +data BTree = N | B String Tok BTree BTree deriving (Show) + +eitherResIdent :: (String -> Tok) -> String -> Tok +eitherResIdent tv s = treeFind resWords + where + treeFind N = tv s + treeFind (B a t left right) | s < a = treeFind left + | s > a = treeFind right + | s == a = t + +resWords = b "lin" (b "flags" (b "cat" (b "abstract" N N) (b "concrete" N N)) (b "grammar" (b "fun" N N) N)) (b "param" (b "lindef" (b "lincat" N N) (b "oper" N N)) (b "printname" (b "pre" N N) N)) + where b s = B s (TS s) + +unescapeInitTail :: String -> String +unescapeInitTail = unesc . tail where + unesc s = case s of + '\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs + '\\':'n':cs -> '\n' : unesc cs + '\\':'t':cs -> '\t' : unesc cs + '"':[] -> [] + c:cs -> c : unesc cs + _ -> [] + +------------------------------------------------------------------- +-- Alex wrapper code. +-- A modified "posn" wrapper. +------------------------------------------------------------------- + +data Posn = Pn !Int !Int !Int + deriving (Eq, Show,Ord) + +alexStartPos :: Posn +alexStartPos = Pn 0 1 1 + +alexMove :: Posn -> Char -> Posn +alexMove (Pn a l c) '\t' = Pn (a+1) l (((c+7) `div` 8)*8+1) +alexMove (Pn a l c) '\n' = Pn (a+1) (l+1) 1 +alexMove (Pn a l c) _ = Pn (a+1) l (c+1) + +type AlexInput = (Posn, -- current position, + Char, -- previous char + String) -- current input string + +tokens :: String -> [Token] +tokens str = go (alexStartPos, '\n', str) + where + go :: (Posn, Char, String) -> [Token] + go inp@(pos, _, str) = + case alexScan inp 0 of + AlexEOF -> [] + AlexError (pos, _, _) -> [Err pos] + AlexSkip inp' len -> go inp' + AlexToken inp' len act -> act pos (take len str) : (go inp') + +alexGetChar :: AlexInput -> Maybe (Char,AlexInput) +alexGetChar (p, c, []) = Nothing +alexGetChar (p, _, (c:s)) = + let p' = alexMove p c + in p' `seq` Just (c, (p', c, s)) + +alexInputPrevChar :: AlexInput -> Char +alexInputPrevChar (p, c, s) = c + +alex_action_1 = tok (\p s -> PT p (TS $ share s)) +alex_action_2 = tok (\p s -> PT p (eitherResIdent (T_CId . share) s)) +alex_action_3 = tok (\p s -> PT p (eitherResIdent (TV . share) s)) +alex_action_4 = tok (\p s -> PT p (TL $ share $ unescapeInitTail s)) +alex_action_5 = tok (\p s -> PT p (TI $ share s)) +alex_action_6 = tok (\p s -> PT p (TD $ share s)) +{-# LINE 1 "GenericTemplate.hs" #-} +{-# LINE 1 "<built-in>" #-} +{-# LINE 1 "<command line>" #-} +{-# LINE 1 "GenericTemplate.hs" #-} +-- ----------------------------------------------------------------------------- +-- ALEX TEMPLATE +-- +-- This code is in the PUBLIC DOMAIN; you may copy it freely and use +-- it for any purpose whatsoever. + +-- ----------------------------------------------------------------------------- +-- INTERNALS and main scanner engine + + +{-# LINE 35 "GenericTemplate.hs" #-} + + + + + + + + + + + + +data AlexAddr = AlexA# Addr# + +#if __GLASGOW_HASKELL__ < 503 +uncheckedShiftL# = shiftL# +#endif + +{-# INLINE alexIndexInt16OffAddr #-} +alexIndexInt16OffAddr (AlexA# arr) off = +#ifdef WORDS_BIGENDIAN + narrow16Int# i + where + i = word2Int# ((high `uncheckedShiftL#` 8#) `or#` low) + high = int2Word# (ord# (indexCharOffAddr# arr (off' +# 1#))) + low = int2Word# (ord# (indexCharOffAddr# arr off')) + off' = off *# 2# +#else + indexInt16OffAddr# arr off +#endif + + + + + +{-# INLINE alexIndexInt32OffAddr #-} +alexIndexInt32OffAddr (AlexA# arr) off = +#ifdef WORDS_BIGENDIAN + narrow32Int# i + where + i = word2Int# ((b3 `uncheckedShiftL#` 24#) `or#` + (b2 `uncheckedShiftL#` 16#) `or#` + (b1 `uncheckedShiftL#` 8#) `or#` b0) + b3 = int2Word# (ord# (indexCharOffAddr# arr (off' +# 3#))) + b2 = int2Word# (ord# (indexCharOffAddr# arr (off' +# 2#))) + b1 = int2Word# (ord# (indexCharOffAddr# arr (off' +# 1#))) + b0 = int2Word# (ord# (indexCharOffAddr# arr off')) + off' = off *# 4# +#else + indexInt32OffAddr# arr off +#endif + + + + + +#if __GLASGOW_HASKELL__ < 503 +quickIndex arr i = arr ! i +#else +-- GHC >= 503, unsafeAt is available from Data.Array.Base. +quickIndex = unsafeAt +#endif + + + + +-- ----------------------------------------------------------------------------- +-- Main lexing routines + +data AlexReturn a + = AlexEOF + | AlexError !AlexInput + | AlexSkip !AlexInput !Int + | AlexToken !AlexInput !Int a + +-- alexScan :: AlexInput -> StartCode -> Maybe (AlexInput,Int,act) +alexScan input (I# (sc)) + = alexScanUser undefined input (I# (sc)) + +alexScanUser user input (I# (sc)) + = case alex_scan_tkn user input 0# input sc AlexNone of + (AlexNone, input') -> + case alexGetChar input of + Nothing -> + + + + AlexEOF + Just _ -> + + + + AlexError input' + + (AlexLastSkip input len, _) -> + + + + AlexSkip input len + + (AlexLastAcc k input len, _) -> + + + + AlexToken input len k + + +-- Push the input through the DFA, remembering the most recent accepting +-- state it encountered. + +alex_scan_tkn user orig_input len input s last_acc = + input `seq` -- strict in the input + case s of + -1# -> (last_acc, input) + _ -> alex_scan_tkn' user orig_input len input s last_acc + +alex_scan_tkn' user orig_input len input s last_acc = + let + new_acc = check_accs (alex_accept `quickIndex` (I# (s))) + in + new_acc `seq` + case alexGetChar input of + Nothing -> (new_acc, input) + Just (c, new_input) -> + + + + let + base = alexIndexInt32OffAddr alex_base s + (I# (ord_c)) = ord c + offset = (base +# ord_c) + check = alexIndexInt16OffAddr alex_check offset + + new_s = if (offset >=# 0#) && (check ==# ord_c) + then alexIndexInt16OffAddr alex_table offset + else alexIndexInt16OffAddr alex_deflt s + in + alex_scan_tkn user orig_input (len +# 1#) new_input new_s new_acc + + where + check_accs [] = last_acc + check_accs (AlexAcc a : _) = AlexLastAcc a input (I# (len)) + check_accs (AlexAccSkip : _) = AlexLastSkip input (I# (len)) + check_accs (AlexAccPred a pred : rest) + | pred user orig_input (I# (len)) input + = AlexLastAcc a input (I# (len)) + check_accs (AlexAccSkipPred pred : rest) + | pred user orig_input (I# (len)) input + = AlexLastSkip input (I# (len)) + check_accs (_ : rest) = check_accs rest + +data AlexLastAcc a + = AlexNone + | AlexLastAcc a !AlexInput !Int + | AlexLastSkip !AlexInput !Int + +data AlexAcc a user + = AlexAcc a + | AlexAccSkip + | AlexAccPred a (AlexAccPred user) + | AlexAccSkipPred (AlexAccPred user) + +type AlexAccPred user = user -> AlexInput -> Int -> AlexInput -> Bool + +-- ----------------------------------------------------------------------------- +-- Predicates on a rule + +alexAndPred p1 p2 user in1 len in2 + = p1 user in1 len in2 && p2 user in1 len in2 + +--alexPrevCharIsPred :: Char -> AlexAccPred _ +alexPrevCharIs c _ input _ _ = c == alexInputPrevChar input + +--alexPrevCharIsOneOfPred :: Array Char Bool -> AlexAccPred _ +alexPrevCharIsOneOf arr _ input _ _ = arr ! alexInputPrevChar input + +--alexRightContext :: Int -> AlexAccPred _ +alexRightContext (I# (sc)) user _ _ input = + case alex_scan_tkn user input 0# input sc AlexNone of + (AlexNone, _) -> False + _ -> True + -- TODO: there's no need to find the longest + -- match when checking the right context, just + -- the first match will do. + +-- used by wrappers +iUnbox (I# (i)) = i diff --git a/src-3.0/GF/GFCC/Linearize.hs b/src-3.0/GF/GFCC/Linearize.hs new file mode 100644 index 000000000..c66ff93c1 --- /dev/null +++ b/src-3.0/GF/GFCC/Linearize.hs @@ -0,0 +1,91 @@ +module GF.GFCC.Linearize where + +import GF.GFCC.Macros +import GF.GFCC.DataGFCC +import GF.GFCC.CId +import Data.Map +import Data.List + +import Debug.Trace + +-- linearization and computation of concrete GFCC Terms + +linearize :: GFCC -> CId -> Exp -> String +linearize mcfg lang = realize . linExp mcfg lang + +realize :: Term -> String +realize trm = case trm of + R ts -> realize (ts !! 0) + S ss -> unwords $ lmap realize ss + K t -> case t of + KS s -> s + KP s _ -> unwords s ---- prefix choice TODO + W s t -> s ++ realize t + FV ts -> realize (ts !! 0) ---- other variants TODO + RP _ r -> realize r ---- DEPREC + TM s -> s + _ -> "ERROR " ++ show trm ---- debug + +linExp :: GFCC -> CId -> Exp -> Term +linExp mcfg lang tree@(DTr xs at trees) = + addB $ case at of + AC fun -> comp (lmap lin trees) $ look fun + AS s -> R [kks (show s)] -- quoted + AI i -> R [kks (show i)] + --- [C lst, kks (show i), C size] where + --- lst = mod (fromInteger i) 10 ; size = if i < 10 then 0 else 1 + AF d -> R [kks (show d)] + AV x -> TM (prCId x) + AM i -> TM (show i) + where + lin = linExp mcfg lang + comp = compute mcfg lang + look = lookLin mcfg lang + addB t + | Data.List.null xs = t + | otherwise = case t of + R ts -> R $ ts ++ (Data.List.map (kks . prCId) xs) + TM s -> R $ t : (Data.List.map (kks . prCId) xs) + +compute :: GFCC -> CId -> [Term] -> Term -> Term +compute mcfg lang args = comp where + comp trm = case trm of + P r p -> proj (comp r) (comp p) + RP i t -> RP (comp i) (comp t) ---- DEPREC + W s t -> W s (comp t) + R ts -> R $ lmap comp ts + V i -> idx args i -- already computed + F c -> comp $ look c -- not computed (if contains argvar) + FV ts -> FV $ lmap comp ts + S ts -> S $ lfilter (/= S []) $ lmap comp ts + _ -> trm + + look = lookOper mcfg lang + + idx xs i = if i > length xs - 1 + then trace + ("too large " ++ show i ++ " for\n" ++ unlines (lmap show xs) ++ "\n") tm0 + else xs !! i + + proj r p = case (r,p) of + (_, FV ts) -> FV $ lmap (proj r) ts + (FV ts, _ ) -> FV $ lmap (\t -> proj t p) ts + (W s t, _) -> kks (s ++ getString (proj t p)) + _ -> comp $ getField r (getIndex p) + + getString t = case t of + K (KS s) -> s + _ -> error ("ERROR in grammar compiler: string from "++ show t) "ERR" + + getIndex t = case t of + C i -> i + RP p _ -> getIndex p ---- DEPREC + TM _ -> 0 -- default value for parameter + _ -> trace ("ERROR in grammar compiler: index from " ++ show t) 666 + + getField t i = case t of + R rs -> idx rs i + RP _ r -> getField r i ---- DEPREC + TM s -> TM s + _ -> error ("ERROR in grammar compiler: field from " ++ show t) t + diff --git a/src-3.0/GF/GFCC/Macros.hs b/src-3.0/GF/GFCC/Macros.hs new file mode 100644 index 000000000..4897aa667 --- /dev/null +++ b/src-3.0/GF/GFCC/Macros.hs @@ -0,0 +1,121 @@ +module GF.GFCC.Macros where + +import GF.GFCC.CId +import GF.GFCC.DataGFCC +import GF.Formalism.FCFG (FGrammar) +import GF.Parsing.FCFG.PInfo (FCFPInfo, fcfPInfoToFGrammar) +----import GF.GFCC.PrintGFCC +import Control.Monad +import Data.Map +import Data.Maybe +import Data.List + +-- operations for manipulating GFCC grammars and objects + +lookLin :: GFCC -> CId -> CId -> Term +lookLin gfcc lang fun = + lookMap tm0 fun $ lins $ lookMap (error "no lang") lang $ concretes gfcc + +lookOper :: GFCC -> CId -> CId -> Term +lookOper gfcc lang fun = + lookMap tm0 fun $ opers $ lookMap (error "no lang") lang $ concretes gfcc + +lookLincat :: GFCC -> CId -> CId -> Term +lookLincat gfcc lang fun = + lookMap tm0 fun $ lincats $ lookMap (error "no lang") lang $ concretes gfcc + +lookParamLincat :: GFCC -> CId -> CId -> Term +lookParamLincat gfcc lang fun = + lookMap tm0 fun $ paramlincats $ lookMap (error "no lang") lang $ concretes gfcc + +lookType :: GFCC -> CId -> Type +lookType gfcc f = + fst $ lookMap (error $ "lookType " ++ show f) f (funs (abstract gfcc)) + +lookParser :: GFCC -> CId -> Maybe FCFPInfo +lookParser gfcc lang = parser $ lookMap (error "no lang") lang $ concretes gfcc + +lookFCFG :: GFCC -> CId -> Maybe FGrammar +lookFCFG gfcc lang = fmap fcfPInfoToFGrammar $ lookParser gfcc lang + +lookStartCat :: GFCC -> String +lookStartCat gfcc = fromMaybe "S" $ msum $ Data.List.map (Data.Map.lookup (CId "startcat")) + [gflags gfcc, aflags (abstract gfcc)] + +lookGlobalFlag :: GFCC -> CId -> String +lookGlobalFlag gfcc f = + lookMap "?" f (gflags gfcc) + +lookAbsFlag :: GFCC -> CId -> String +lookAbsFlag gfcc f = + lookMap "?" f (aflags (abstract gfcc)) + +lookCncFlag :: GFCC -> CId -> CId -> String +lookCncFlag gfcc lang f = + lookMap "?" f $ cflags $ lookMap (error "no lang") lang $ concretes gfcc + +functionsToCat :: GFCC -> CId -> [(CId,Type)] +functionsToCat gfcc cat = + [(f,ty) | f <- fs, Just (ty,_) <- [Data.Map.lookup f $ funs $ abstract gfcc]] + where + fs = lookMap [] cat $ catfuns $ abstract gfcc + +depth :: Exp -> Int +depth tr = case tr of + DTr _ _ [] -> 1 + DTr _ _ ts -> maximum (lmap depth ts) + 1 + +tree :: Atom -> [Exp] -> Exp +tree = DTr [] + +cftype :: [CId] -> CId -> Type +cftype args val = DTyp [Hyp wildCId (cftype [] arg) | arg <- args] val [] + +catSkeleton :: Type -> ([CId],CId) +catSkeleton ty = case ty of + DTyp hyps val _ -> ([valCat ty | Hyp _ ty <- hyps],val) + +typeSkeleton :: Type -> ([(Int,CId)],CId) +typeSkeleton ty = case ty of + DTyp hyps val _ -> ([(contextLength ty, valCat ty) | Hyp _ ty <- hyps],val) + +valCat :: Type -> CId +valCat ty = case ty of + DTyp _ val _ -> val + +contextLength :: Type -> Int +contextLength ty = case ty of + DTyp hyps _ _ -> length hyps + +cid :: String -> CId +cid = CId + +wildCId :: CId +wildCId = cid "_" + +exp0 :: Exp +exp0 = tree (AM 0) [] + +primNotion :: Exp +primNotion = EEq [] + +term0 :: CId -> Term +term0 = TM . prCId + +tm0 :: Term +tm0 = TM "?" + +kks :: String -> Term +kks = K . KS + +-- lookup with default value +lookMap :: (Show i, Ord i) => a -> i -> Map i a -> a +lookMap d c m = maybe d id $ Data.Map.lookup c m + +--- from Operations +combinations :: [[a]] -> [[a]] +combinations t = case t of + [] -> [[]] + aa:uu -> [a:u | a <- aa, u <- combinations uu] + + diff --git a/src-3.0/GF/GFCC/OptimizeGFCC.hs b/src-3.0/GF/GFCC/OptimizeGFCC.hs new file mode 100644 index 000000000..394458041 --- /dev/null +++ b/src-3.0/GF/GFCC/OptimizeGFCC.hs @@ -0,0 +1,116 @@ +module GF.GFCC.OptimizeGFCC where + +import GF.GFCC.CId +import GF.GFCC.DataGFCC + +import GF.Data.Operations + +import Data.List +import qualified Data.Map as Map + + +-- back-end optimization: +-- suffix analysis followed by common subexpression elimination + +optGFCC :: GFCC -> GFCC +optGFCC gfcc = gfcc { + concretes = Map.map opt (concretes gfcc) + } + where + opt cnc = subex $ cnc { + lins = Map.map optTerm (lins cnc), + lindefs = Map.map optTerm (lindefs cnc), + printnames = Map.map optTerm (printnames cnc) + } + +-- analyse word form lists into prefix + suffixes +-- suffix sets can later be shared by subex elim + +optTerm :: Term -> Term +optTerm tr = case tr of + R ts@(_:_:_) | all isK ts -> mkSuff $ optToks [s | K (KS s) <- ts] + R ts -> R $ map optTerm ts + P t v -> P (optTerm t) v + _ -> tr + where + optToks ss = prf : suffs where + prf = pref (head ss) (tail ss) + suffs = map (drop (length prf)) ss + pref cand ss = case ss of + s1:ss2 -> if isPrefixOf cand s1 then pref cand ss2 else pref (init cand) ss + _ -> cand + isK t = case t of + K (KS _) -> True + _ -> False + mkSuff ("":ws) = R (map (K . KS) ws) + mkSuff (p:ws) = W p (R (map (K . KS) ws)) + + +-- common subexpression elimination + +---subex :: [(CId,Term)] -> [(CId,Term)] +subex :: Concr -> Concr +subex cnc = errVal cnc $ do + (tree,_) <- appSTM (getSubtermsMod cnc) (Map.empty,0) + return $ addSubexpConsts tree cnc + +type TermList = Map.Map Term (Int,Int) -- number of occs, id +type TermM a = STM (TermList,Int) a + +addSubexpConsts :: TermList -> Concr -> Concr +addSubexpConsts tree cnc = cnc { + opers = Map.fromList [(f,recomp f trm) | (f,trm) <- ops], + lins = rec lins, + lindefs = rec lindefs, + printnames = rec printnames + } + where + ops = [(fid id, trm) | (trm,(_,id)) <- Map.assocs tree] + mkOne (f,trm) = (f, recomp f trm) + recomp f t = case Map.lookup t tree of + Just (_,id) | fid id /= f -> F $ fid id -- not to replace oper itself + _ -> case t of + R ts -> R $ map (recomp f) ts + S ts -> S $ map (recomp f) ts + W s t -> W s (recomp f t) + P t p -> P (recomp f t) (recomp f p) + _ -> t + fid n = CId $ "_" ++ show n + rec field = Map.fromAscList [(f,recomp f trm) | (f,trm) <- Map.assocs (field cnc)] + + +getSubtermsMod :: Concr -> TermM TermList +getSubtermsMod cnc = do + mapM getSubterms (Map.assocs (lins cnc)) + mapM getSubterms (Map.assocs (lindefs cnc)) + mapM getSubterms (Map.assocs (printnames cnc)) + (tree0,_) <- readSTM + return $ Map.filter (\ (nu,_) -> nu > 1) tree0 + where + getSubterms (f,trm) = collectSubterms trm >> return () + +collectSubterms :: Term -> TermM () +collectSubterms t = case t of + R ts -> do + mapM collectSubterms ts + add t + S ts -> do + mapM collectSubterms ts + add t + W s u -> do + collectSubterms u + add t + P p u -> do + collectSubterms p + collectSubterms u + add t + _ -> return () + where + add t = do + (ts,i) <- readSTM + let + ((count,id),next) = case Map.lookup t ts of + Just (nu,id) -> ((nu+1,id), i) + _ -> ((1, i ), i+1) + writeSTM (Map.insert t (count,id) ts, next) + diff --git a/src-3.0/GF/GFCC/Raw/AbsGFCCRaw.hs b/src-3.0/GF/GFCC/Raw/AbsGFCCRaw.hs new file mode 100644 index 000000000..ab5f184a8 --- /dev/null +++ b/src-3.0/GF/GFCC/Raw/AbsGFCCRaw.hs @@ -0,0 +1,17 @@ +module GF.GFCC.Raw.AbsGFCCRaw where + +-- Haskell module generated by the BNF converter + +newtype CId = CId String deriving (Eq,Ord,Show) +data Grammar = + Grm [RExp] + deriving (Eq,Ord,Show) + +data RExp = + App CId [RExp] + | AInt Integer + | AStr String + | AFlt Double + | AMet + deriving (Eq,Ord,Show) + diff --git a/src-3.0/GF/GFCC/Raw/ConvertGFCC.hs b/src-3.0/GF/GFCC/Raw/ConvertGFCC.hs new file mode 100644 index 000000000..0b010d604 --- /dev/null +++ b/src-3.0/GF/GFCC/Raw/ConvertGFCC.hs @@ -0,0 +1,277 @@ +module GF.GFCC.Raw.ConvertGFCC (toGFCC,fromGFCC) where + +import GF.GFCC.DataGFCC +import GF.GFCC.Raw.AbsGFCCRaw + +import GF.Data.Assoc +import GF.Formalism.FCFG +import GF.Formalism.Utilities (NameProfile(..), Profile(..), SyntaxForest(..)) +import GF.Parsing.FCFG.PInfo (FCFPInfo(..), buildFCFPInfo) + +import qualified Data.Array as Array +import Data.Map + +pgfMajorVersion, pgfMinorVersion :: Integer +(pgfMajorVersion, pgfMinorVersion) = (1,0) + +-- convert parsed grammar to internal GFCC + +toGFCC :: Grammar -> GFCC +toGFCC (Grm [ + App (CId "pgf") (AInt v1 : AInt v2 : App a []:cs), + App (CId "flags") gfs, + ab@( + App (CId "abstract") [ + App (CId "fun") fs, + App (CId "cat") cts + ]), + App (CId "concrete") ccs + ]) = GFCC { + absname = a, + cncnames = [c | App c [] <- cs], + gflags = fromAscList [(f,v) | App f [AStr v] <- gfs], + abstract = + let + aflags = fromAscList [(f,v) | App f [AStr v] <- gfs] + lfuns = [(f,(toType typ,toExp def)) | App f [typ, def] <- fs] + funs = fromAscList lfuns + lcats = [(c, Prelude.map toHypo hyps) | App c hyps <- cts] + cats = fromAscList lcats + catfuns = fromAscList + [(cat,[f | (f, (DTyp _ c _,_)) <- lfuns, c==cat]) | (cat,_) <- lcats] + in Abstr aflags funs cats catfuns, + concretes = fromAscList [(lang, toConcr ts) | App lang ts <- ccs] + } + where + +toConcr :: [RExp] -> Concr +toConcr = foldl add (Concr { + cflags = empty, + lins = empty, + opers = empty, + lincats = empty, + lindefs = empty, + printnames = empty, + paramlincats = empty, + parser = Nothing + }) + where + add :: Concr -> RExp -> Concr + add cnc (App (CId "flags") ts) = cnc { cflags = fromAscList [(f,v) | App f [AStr v] <- ts] } + add cnc (App (CId "lin") ts) = cnc { lins = mkTermMap ts } + add cnc (App (CId "oper") ts) = cnc { opers = mkTermMap ts } + add cnc (App (CId "lincat") ts) = cnc { lincats = mkTermMap ts } + add cnc (App (CId "lindef") ts) = cnc { lindefs = mkTermMap ts } + add cnc (App (CId "printname") ts) = cnc { printnames = mkTermMap ts } + add cnc (App (CId "param") ts) = cnc { paramlincats = mkTermMap ts } + add cnc (App (CId "parser") ts) = cnc { parser = Just (toPInfo ts) } + +toPInfo :: [RExp] -> FCFPInfo +toPInfo [App (CId "rules") rs, App (CId "startupcats") cs] = buildFCFPInfo (rules, cats) + where + rules = lmap toFRule rs + cats = fromList [(c, lmap expToInt fs) | App c fs <- cs] + + toFRule :: RExp -> FRule + toFRule (App (CId "rule") + [n, + App (CId "cats") (rt:at), + App (CId "R") ls]) = FRule name args res lins + where + name = toFName n + args = lmap expToInt at + res = expToInt rt + lins = mkArray [mkArray [toSymbol s | s <- l] | App (CId "S") l <- ls] + +toFName :: RExp -> FName +toFName (App (CId "_A") [x]) = Name (CId "_") [Unify [expToInt x]] +toFName (App f ts) = Name f (lmap toProfile ts) + where + toProfile :: RExp -> Profile (SyntaxForest CId) + toProfile AMet = Unify [] + toProfile (App (CId "_A") [t]) = Unify [expToInt t] + toProfile (App (CId "_U") ts) = Unify [expToInt t | App (CId "_A") [t] <- ts] + toProfile t = Constant (toSyntaxForest t) + + toSyntaxForest :: RExp -> SyntaxForest CId + toSyntaxForest AMet = FMeta + toSyntaxForest (App n ts) = FNode n [lmap toSyntaxForest ts] + toSyntaxForest (AStr s) = FString s + toSyntaxForest (AInt i) = FInt i + toSyntaxForest (AFlt f) = FFloat f + +toSymbol :: RExp -> FSymbol +toSymbol (App (CId "P") [c,n,l]) = FSymCat (expToInt c) (expToInt l) (expToInt n) +toSymbol (AStr t) = FSymTok t + +toType :: RExp -> Type +toType e = case e of + App cat [App (CId "H") hypos, App (CId "X") exps] -> + DTyp (lmap toHypo hypos) cat (lmap toExp exps) + _ -> error $ "type " ++ show e + +toHypo :: RExp -> Hypo +toHypo e = case e of + App x [typ] -> Hyp x (toType typ) + _ -> error $ "hypo " ++ show e + +toExp :: RExp -> Exp +toExp e = case e of + App (CId "App") [App fun [], App (CId "B") xs, App (CId "X") exps] -> + DTr [x | App x [] <- xs] (AC fun) (lmap toExp exps) + App (CId "Eq") eqs -> + EEq [Equ (lmap toExp ps) (toExp v) | App (CId "E") (v:ps) <- eqs] + App (CId "Var") [App i []] -> DTr [] (AV i) [] + AMet -> DTr [] (AM 0) [] + AInt i -> DTr [] (AI i) [] + AFlt i -> DTr [] (AF i) [] + AStr i -> DTr [] (AS i) [] + _ -> error $ "exp " ++ show e + +toTerm :: RExp -> Term +toTerm e = case e of + App (CId "R") es -> R (lmap toTerm es) + App (CId "S") es -> S (lmap toTerm es) + App (CId "FV") es -> FV (lmap toTerm es) + App (CId "P") [e,v] -> P (toTerm e) (toTerm v) + App (CId "RP") [e,v] -> RP (toTerm e) (toTerm v) ---- + App (CId "W") [AStr s,v] -> W s (toTerm v) + App (CId "A") [AInt i] -> V (fromInteger i) + App f [] -> F f + AInt i -> C (fromInteger i) + AMet -> TM "?" + AStr s -> K (KS s) ---- + _ -> error $ "term " ++ show e + +------------------------------ +--- from internal to parser -- +------------------------------ + +fromGFCC :: GFCC -> Grammar +fromGFCC gfcc0 = Grm [ + app "pgf" (AInt pgfMajorVersion:AInt pgfMinorVersion + : App (absname gfcc) [] : lmap (flip App []) (cncnames gfcc)), + app "flags" [App f [AStr v] | (f,v) <- toList (gflags gfcc `union` aflags agfcc)], + app "abstract" [ + app "fun" [App f [fromType t,fromExp d] | (f,(t,d)) <- toList (funs agfcc)], + app "cat" [App f (lmap fromHypo hs) | (f,hs) <- toList (cats agfcc)] + ], + app "concrete" [App lang (fromConcrete c) | (lang,c) <- toList (concretes gfcc)] + ] + where + gfcc = utf8GFCC gfcc0 + app s = App (CId s) + agfcc = abstract gfcc + fromConcrete cnc = [ + app "flags" [App f [AStr v] | (f,v) <- toList (cflags cnc)], + app "lin" [App f [fromTerm v] | (f,v) <- toList (lins cnc)], + app "oper" [App f [fromTerm v] | (f,v) <- toList (opers cnc)], + app "lincat" [App f [fromTerm v] | (f,v) <- toList (lincats cnc)], + app "lindef" [App f [fromTerm v] | (f,v) <- toList (lindefs cnc)], + app "printname" [App f [fromTerm v] | (f,v) <- toList (printnames cnc)], + app "param" [App f [fromTerm v] | (f,v) <- toList (paramlincats cnc)] + ] ++ maybe [] (\p -> [fromPInfo p]) (parser cnc) + +fromType :: Type -> RExp +fromType e = case e of + DTyp hypos cat exps -> + App cat [ + App (CId "H") (lmap fromHypo hypos), + App (CId "X") (lmap fromExp exps)] + +fromHypo :: Hypo -> RExp +fromHypo e = case e of + Hyp x typ -> App x [fromType typ] + +fromExp :: Exp -> RExp +fromExp e = case e of + DTr xs (AC fun) exps -> + App (CId "App") [App fun [], App (CId "B") (lmap (flip App []) xs), App (CId "X") (lmap fromExp exps)] + DTr [] (AV x) [] -> App (CId "Var") [App x []] + DTr [] (AS s) [] -> AStr s + DTr [] (AF d) [] -> AFlt d + DTr [] (AI i) [] -> AInt (toInteger i) + DTr [] (AM _) [] -> AMet ---- + EEq eqs -> + App (CId "Eq") [App (CId "E") (lmap fromExp (v:ps)) | Equ ps v <- eqs] + _ -> error $ "exp " ++ show e + +fromTerm :: Term -> RExp +fromTerm e = case e of + R es -> app "R" (lmap fromTerm es) + S es -> app "S" (lmap fromTerm es) + FV es -> app "FV" (lmap fromTerm es) + P e v -> app "P" [fromTerm e, fromTerm v] + RP e v -> app "RP" [fromTerm e, fromTerm v] ---- + W s v -> app "W" [AStr s, fromTerm v] + C i -> AInt (toInteger i) + TM _ -> AMet + F f -> App f [] + V i -> App (CId "A") [AInt (toInteger i)] + K (KS s) -> AStr s ---- + K (KP d vs) -> app "FV" (str d : [str v | Var v _ <- vs]) ---- + where + app = App . CId + str v = app "S" (lmap AStr v) + +-- ** Parsing info + +fromPInfo :: FCFPInfo -> RExp +fromPInfo p = app "parser" [ + app "rules" [fromFRule rule | rule <- Array.elems (allRules p)], + app "startupcats" [App f (lmap intToExp cs) | (f,cs) <- toList (startupCats p)] + ] + +fromFRule :: FRule -> RExp +fromFRule (FRule n args res lins) = + app "rule" [fromFName n, + app "cats" (intToExp res:lmap intToExp args), + app "R" [app "S" [fromSymbol s | s <- Array.elems l] | l <- Array.elems lins] + ] + +fromFName :: FName -> RExp +fromFName n = case n of + Name (CId "_") [p] -> fromProfile p + Name f ps -> App f (lmap fromProfile ps) + where + fromProfile :: Profile (SyntaxForest CId) -> RExp + fromProfile (Unify []) = AMet + fromProfile (Unify [x]) = daughter x + fromProfile (Unify args) = app "_U" (lmap daughter args) + fromProfile (Constant forest) = fromSyntaxForest forest + + daughter n = app "_A" [intToExp n] + + fromSyntaxForest :: SyntaxForest CId -> RExp + fromSyntaxForest FMeta = AMet + -- FIXME: is there always just one element here? + fromSyntaxForest (FNode n [args]) = App n (lmap fromSyntaxForest args) + fromSyntaxForest (FString s) = AStr s + fromSyntaxForest (FInt i) = AInt i + fromSyntaxForest (FFloat f) = AFlt f + +fromSymbol :: FSymbol -> RExp +fromSymbol (FSymCat c l n) = app "P" [intToExp c, intToExp n, intToExp l] +fromSymbol (FSymTok t) = AStr t + +-- ** Utilities + +mkTermMap :: [RExp] -> Map CId Term +mkTermMap ts = fromAscList [(f,toTerm v) | App f [v] <- ts] + +app :: String -> [RExp] -> RExp +app = App . CId + +mkArray :: [a] -> Array.Array Int a +mkArray xs = Array.listArray (0, length xs - 1) xs + +expToInt :: Integral a => RExp -> a +expToInt (App (CId "neg") [AInt i]) = fromIntegral (negate i) +expToInt (AInt i) = fromIntegral i + +expToStr :: RExp -> String +expToStr (AStr s) = s + +intToExp :: Integral a => a -> RExp +intToExp x | x < 0 = App (CId "neg") [AInt (fromIntegral (negate x))] + | otherwise = AInt (fromIntegral x) diff --git a/src-3.0/GF/GFCC/Raw/GFCCRaw.cf b/src-3.0/GF/GFCC/Raw/GFCCRaw.cf new file mode 100644 index 000000000..bedaef685 --- /dev/null +++ b/src-3.0/GF/GFCC/Raw/GFCCRaw.cf @@ -0,0 +1,12 @@ +Grm. Grammar ::= [RExp] ; + +App. RExp ::= "(" CId [RExp] ")" ; +AId. RExp ::= CId ; +AInt. RExp ::= Integer ; +AStr. RExp ::= String ; +AFlt. RExp ::= Double ; +AMet. RExp ::= "?" ; + +terminator RExp "" ; + +token CId (('_' | letter) (letter | digit | '\'' | '_')*) ; diff --git a/src-3.0/GF/GFCC/Raw/ParGFCCRaw.hs b/src-3.0/GF/GFCC/Raw/ParGFCCRaw.hs new file mode 100644 index 000000000..b71904948 --- /dev/null +++ b/src-3.0/GF/GFCC/Raw/ParGFCCRaw.hs @@ -0,0 +1,99 @@ +module GF.GFCC.Raw.ParGFCCRaw (parseGrammar) where + +import GF.GFCC.Raw.AbsGFCCRaw + +import Control.Monad +import Data.Char + +parseGrammar :: String -> IO Grammar +parseGrammar s = case runP pGrammar s of + Just (x,"") -> return x + _ -> fail "Parse error" + +pGrammar :: P Grammar +pGrammar = liftM Grm pTerms + +pTerms :: P [RExp] +pTerms = liftM2 (:) (pTerm 1) pTerms <++ (skipSpaces >> return []) + +pTerm :: Int -> P RExp +pTerm n = skipSpaces >> (pParen <++ pApp <++ pNum <++ pStr <++ pMeta) + where pParen = between (char '(') (char ')') (pTerm 0) + pApp = liftM2 App pIdent (if n == 0 then pTerms else return []) + pStr = char '"' >> liftM AStr (manyTill (pEsc <++ get) (char '"')) + pEsc = char '\\' >> get + pNum = do x <- munch1 isDigit + ((char '.' >> munch1 isDigit >>= \y -> return (AFlt (read (x++"."++y)))) + <++ + return (AInt (read x))) + pMeta = char '?' >> return AMet + pIdent = liftM CId $ liftM2 (:) (satisfy isIdentFirst) (munch isIdentRest) + isIdentFirst c = c == '_' || isAlpha c + isIdentRest c = c == '_' || c == '\'' || isAlphaNum c + +-- Parser combinators with only left-biased choice + +newtype P a = P { runP :: String -> Maybe (a,String) } + +instance Monad P where + return x = P (\ts -> Just (x,ts)) + P p >>= f = P (\ts -> p ts >>= \ (x,ts') -> runP (f x) ts') + fail _ = pfail + +instance MonadPlus P where + mzero = pfail + mplus = (<++) + + +get :: P Char +get = P (\ts -> case ts of + [] -> Nothing + c:cs -> Just (c,cs)) + +look :: P String +look = P (\ts -> Just (ts,ts)) + +(<++) :: P a -> P a -> P a +P p <++ P q = P (\ts -> p ts `mplus` q ts) + +pfail :: P a +pfail = P (\ts -> Nothing) + +satisfy :: (Char -> Bool) -> P Char +satisfy p = do c <- get + if p c then return c else pfail + +char :: Char -> P Char +char c = satisfy (c==) + +string :: String -> P String +string this = look >>= scan this + where + scan [] _ = return this + scan (x:xs) (y:ys) | x == y = get >> scan xs ys + scan _ _ = pfail + +skipSpaces :: P () +skipSpaces = look >>= skip + where + skip (c:s) | isSpace c = get >> skip s + skip _ = return () + +manyTill :: P a -> P end -> P [a] +manyTill p end = scan + where scan = (end >> return []) <++ liftM2 (:) p scan + +munch :: (Char -> Bool) -> P String +munch p = munch1 p <++ return [] + +munch1 :: (Char -> Bool) -> P String +munch1 p = liftM2 (:) (satisfy p) (munch p) + +choice :: [P a] -> P a +choice = msum + +between :: P open -> P close -> P a -> P a +between open close p = do open + x <- p + close + return x diff --git a/src-3.0/GF/GFCC/Raw/PrintGFCCRaw.hs b/src-3.0/GF/GFCC/Raw/PrintGFCCRaw.hs new file mode 100644 index 000000000..d46d8096f --- /dev/null +++ b/src-3.0/GF/GFCC/Raw/PrintGFCCRaw.hs @@ -0,0 +1,36 @@ +module GF.GFCC.Raw.PrintGFCCRaw (printTree) where + +import GF.GFCC.Raw.AbsGFCCRaw + +import Data.List (intersperse) +import Numeric (showFFloat) + +printTree :: Grammar -> String +printTree g = prGrammar g "" + +prGrammar :: Grammar -> ShowS +prGrammar (Grm xs) = prRExpList xs + +prRExp :: Int -> RExp -> ShowS +prRExp _ (App x []) = prCId x +prRExp n (App x xs) = p (prCId x . showChar ' ' . prRExpList xs) + where p s = if n == 0 then s else showChar '(' . s . showChar ')' +prRExp _ (AInt x) = shows x +prRExp _ (AStr x) = showChar '"' . concatS (map mkEsc x) . showChar '"' +prRExp _ (AFlt x) = showFFloat Nothing x +prRExp _ AMet = showChar '?' + +mkEsc :: Char -> ShowS +mkEsc s = case s of + '"' -> showString "\\\"" + '\\' -> showString "\\\\" + _ -> showChar s + +prRExpList :: [RExp] -> ShowS +prRExpList = concatS . intersperse (showChar ' ') . map (prRExp 1) + +prCId :: CId -> ShowS +prCId (CId x) = showString x + +concatS :: [ShowS] -> ShowS +concatS = foldr (.) id diff --git a/src-3.0/GF/GFCC/ShowLinearize.hs b/src-3.0/GF/GFCC/ShowLinearize.hs new file mode 100644 index 000000000..f627dfd28 --- /dev/null +++ b/src-3.0/GF/GFCC/ShowLinearize.hs @@ -0,0 +1,87 @@ +module GF.GFCC.ShowLinearize ( + tableLinearize, + recordLinearize, + termLinearize, + allLinearize + ) where + +import GF.GFCC.Linearize +import GF.GFCC.Macros +import GF.GFCC.DataGFCC +import GF.GFCC.CId +--import GF.GFCC.PrintGFCC ---- + +import GF.Data.Operations +import Data.List + +-- printing linearizations in different ways with source parameters + +-- internal representation, only used internally in this module +data Record = + RR [(String,Record)] + | RT [(String,Record)] + | RFV [Record] + | RS String + | RCon String + +prRecord :: Record -> String +prRecord = prr where + prr t = case t of + RR fs -> concat $ + "{" : + (intersperse ";" (map (\ (l,v) -> unwords [l,"=", prr v]) fs)) ++ ["}"] + RT fs -> concat $ + "table {" : + (intersperse ";" (map (\ (l,v) -> unwords [l,"=>",prr v]) fs)) ++ ["}"] + RFV ts -> concat $ + "variants {" : (intersperse ";" (map prr ts)) ++ ["}"] + RS s -> prQuotedString s + RCon s -> s + +-- uses the encoding of record types in GFCC.paramlincat +mkRecord :: Term -> Term -> Record +mkRecord typ trm = case (typ,trm) of + (R rs, R ts) -> RR [(str lab, mkRecord ty t) | (P lab ty, t) <- zip rs ts] + (S [FV ps,ty],R ts) -> RT [(str par, mkRecord ty t) | (par, t) <- zip ps ts] + (_,W s (R ts)) -> mkRecord typ (R [K (KS (s ++ u)) | K (KS u) <- ts]) + (FV ps, C i) -> RCon $ str $ ps !! i + (S [], _) -> RS $ realize trm + _ -> RS $ show trm ---- printTree trm + where + str = realize + +-- show all branches, without labels and params +allLinearize :: GFCC -> CId -> Exp -> String +allLinearize gfcc lang = concat . map pr . tabularLinearize gfcc lang where + pr (p,vs) = unlines vs + +-- show all branches, with labels and params +tableLinearize :: GFCC -> CId -> Exp -> String +tableLinearize gfcc lang = unlines . map pr . tabularLinearize gfcc lang where + pr (p,vs) = p +++ ":" +++ unwords (intersperse "|" vs) + +-- create a table from labels+params to variants +tabularLinearize :: GFCC -> CId -> Exp -> [(String,[String])] +tabularLinearize gfcc lang = branches . recLinearize gfcc lang where + branches r = case r of + RR fs -> [(lab +++ b,s) | (lab,t) <- fs, (b,s) <- branches t] + RT fs -> [(lab +++ b,s) | (lab,t) <- fs, (b,s) <- branches t] + RFV rs -> [([], ss) | (_,ss) <- concatMap branches rs] + RS s -> [([], [s])] + RCon _ -> [] + +-- show record in GF-source-like syntax +recordLinearize :: GFCC -> CId -> Exp -> String +recordLinearize gfcc lang = prRecord . recLinearize gfcc lang + +-- create a GF-like record, forming the basis of all functions above +recLinearize :: GFCC -> CId -> Exp -> Record +recLinearize gfcc lang exp = mkRecord typ $ linExp gfcc lang exp where + typ = case exp of + DTr _ (AC f) _ -> lookParamLincat gfcc lang $ valCat $ lookType gfcc f + +-- show GFCC term +termLinearize :: GFCC -> CId -> Exp -> String +termLinearize gfcc lang = show . linExp gfcc lang + + diff --git a/src-3.0/GF/GFCC/SkelGFCC.hs b/src-3.0/GF/GFCC/SkelGFCC.hs new file mode 100644 index 000000000..6972fd3c3 --- /dev/null +++ b/src-3.0/GF/GFCC/SkelGFCC.hs @@ -0,0 +1,109 @@ +module GF.GFCC.SkelGFCC where + +-- Haskell module generated by the BNF converter + +import GF.GFCC.AbsGFCC +import GF.Data.ErrM +type Result = Err String + +failure :: Show a => a -> Result +failure x = Bad $ "Undefined case: " ++ show x + +transCId :: CId -> Result +transCId x = case x of + CId str -> failure x + + +transGrammar :: Grammar -> Result +transGrammar x = case x of + Grm cid cids abstract concretes -> failure x + + +transAbstract :: Abstract -> Result +transAbstract x = case x of + Abs flags fundefs catdefs -> failure x + + +transConcrete :: Concrete -> Result +transConcrete x = case x of + Cnc cid flags lindefs0 lindefs1 lindefs2 lindefs3 lindefs -> failure x + + +transFlag :: Flag -> Result +transFlag x = case x of + Flg cid str -> failure x + + +transCatDef :: CatDef -> Result +transCatDef x = case x of + Cat cid hypos -> failure x + + +transFunDef :: FunDef -> Result +transFunDef x = case x of + Fun cid type' exp -> failure x + + +transLinDef :: LinDef -> Result +transLinDef x = case x of + Lin cid term -> failure x + + +transType :: Type -> Result +transType x = case x of + DTyp hypos cid exps -> failure x + + +transExp :: Exp -> Result +transExp x = case x of + DTr cids atom exps -> failure x + EEq equations -> failure x + + +transAtom :: Atom -> Result +transAtom x = case x of + AC cid -> failure x + AS str -> failure x + AI n -> failure x + AF d -> failure x + AM n -> failure x + AV cid -> failure x + + +transTerm :: Term -> Result +transTerm x = case x of + R terms -> failure x + P term0 term -> failure x + S terms -> failure x + K tokn -> failure x + V n -> failure x + C n -> failure x + F cid -> failure x + FV terms -> failure x + W str term -> failure x + TM -> failure x + RP term0 term -> failure x + + +transTokn :: Tokn -> Result +transTokn x = case x of + KS str -> failure x + KP strs variants -> failure x + + +transVariant :: Variant -> Result +transVariant x = case x of + Var strs0 strs -> failure x + + +transHypo :: Hypo -> Result +transHypo x = case x of + Hyp cid type' -> failure x + + +transEquation :: Equation -> Result +transEquation x = case x of + Equ exps exp -> failure x + + + diff --git a/src-3.0/GF/GFCC/TestGFCC.hs b/src-3.0/GF/GFCC/TestGFCC.hs new file mode 100644 index 000000000..c379a687a --- /dev/null +++ b/src-3.0/GF/GFCC/TestGFCC.hs @@ -0,0 +1,58 @@ +-- automatically generated by BNF Converter +module Main where + + +import IO ( stdin, hGetContents ) +import System ( getArgs, getProgName ) + +import GF.GFCC.LexGFCC +import GF.GFCC.ParGFCC +import GF.GFCC.SkelGFCC +import GF.GFCC.PrintGFCC +import GF.GFCC.AbsGFCC + + + + +import GF.Data.ErrM + +type ParseFun a = [Token] -> Err a + +myLLexer = myLexer + +type Verbosity = Int + +putStrV :: Verbosity -> String -> IO () +putStrV v s = if v > 1 then putStrLn s else return () + +runFile :: (Print a, Show a) => Verbosity -> ParseFun a -> FilePath -> IO () +runFile v p f = putStrLn f >> readFile f >>= run v p + +run :: (Print a, Show a) => Verbosity -> ParseFun a -> String -> IO () +run v p s = let ts = myLLexer s in case p ts of + Bad s -> do putStrLn "\nParse Failed...\n" + putStrV v "Tokens:" + putStrV v $ show ts + putStrLn s + Ok tree -> do putStrLn "\nParse Successful!" + showTree v tree + + + +showTree :: (Show a, Print a) => Int -> a -> IO () +showTree v tree + = do + putStrV v $ "\n[Abstract Syntax]\n\n" ++ show tree + putStrV v $ "\n[Linearized tree]\n\n" ++ printTree tree + +main :: IO () +main = do args <- getArgs + case args of + [] -> hGetContents stdin >>= run 2 pGrammar + "-s":fs -> mapM_ (runFile 0 pGrammar) fs + fs -> mapM_ (runFile 2 pGrammar) fs + + + + + diff --git a/src-3.0/GF/GFCC/doc/Eng.gf b/src-3.0/GF/GFCC/doc/Eng.gf new file mode 100644 index 000000000..c64f46313 --- /dev/null +++ b/src-3.0/GF/GFCC/doc/Eng.gf @@ -0,0 +1,13 @@ +concrete Eng of Ex = { + lincat + S = {s : Str} ; + NP = {s : Str ; n : Num} ; + VP = {s : Num => Str} ; + param + Num = Sg | Pl ; + lin + Pred np vp = {s = np.s ++ vp.s ! np.n} ; + She = {s = "she" ; n = Sg} ; + They = {s = "they" ; n = Pl} ; + Sleep = {s = table {Sg => "sleeps" ; Pl => "sleep"}} ; +} diff --git a/src-3.0/GF/GFCC/doc/Ex.gf b/src-3.0/GF/GFCC/doc/Ex.gf new file mode 100644 index 000000000..bd0b03483 --- /dev/null +++ b/src-3.0/GF/GFCC/doc/Ex.gf @@ -0,0 +1,8 @@ +abstract Ex = { + cat + S ; NP ; VP ; + fun + Pred : NP -> VP -> S ; + She, They : NP ; + Sleep : VP ; +} diff --git a/src-3.0/GF/GFCC/doc/Swe.gf b/src-3.0/GF/GFCC/doc/Swe.gf new file mode 100644 index 000000000..1d6672371 --- /dev/null +++ b/src-3.0/GF/GFCC/doc/Swe.gf @@ -0,0 +1,13 @@ +concrete Swe of Ex = { + lincat + S = {s : Str} ; + NP = {s : Str} ; + VP = {s : Str} ; + param + Num = Sg | Pl ; + lin + Pred np vp = {s = np.s ++ vp.s} ; + She = {s = "hon"} ; + They = {s = "de"} ; + Sleep = {s = "sover"} ; +} diff --git a/src-3.0/GF/GFCC/doc/Test.gf b/src-3.0/GF/GFCC/doc/Test.gf new file mode 100644 index 000000000..5cd4c5474 --- /dev/null +++ b/src-3.0/GF/GFCC/doc/Test.gf @@ -0,0 +1,64 @@ +-- to test GFCC compilation + +flags coding=utf8 ; + +cat S ; NP ; N ; VP ; + +fun Pred : NP -> VP -> S ; +fun Pred2 : NP -> VP -> NP -> S ; +fun Det, Dets : N -> NP ; +fun Mina, Sina, Me, Te : NP ; +fun Raha, Paska, Pallo : N ; +fun Puhua, Munia, Sanoa : VP ; + +param Person = P1 | P2 | P3 ; +param Number = Sg | Pl ; +param Case = Nom | Part ; + +param NForm = NF Number Case ; +param VForm = VF Number Person ; + +lincat N = Noun ; +lincat VP = Verb ; + +oper Noun = {s : NForm => Str} ; +oper Verb = {s : VForm => Str} ; + +lincat NP = {s : Case => Str ; a : {n : Number ; p : Person}} ; + +lin Pred np vp = {s = np.s ! Nom ++ vp.s ! VF np.a.n np.a.p} ; +lin Pred2 np vp ob = {s = np.s ! Nom ++ vp.s ! VF np.a.n np.a.p ++ ob.s ! Part} ; +lin Det no = {s = \\c => no.s ! NF Sg c ; a = {n = Sg ; p = P3}} ; +lin Dets no = {s = \\c => no.s ! NF Pl c ; a = {n = Pl ; p = P3}} ; +lin Mina = {s = table Case ["minä" ; "minua"] ; a = {n = Sg ; p = P1}} ; +lin Te = {s = table Case ["te" ; "teitä"] ; a = {n = Pl ; p = P2}} ; +lin Sina = {s = table Case ["sinä" ; "sinua"] ; a = {n = Sg ; p = P2}} ; +lin Me = {s = table Case ["me" ; "meitä"] ; a = {n = Pl ; p = P1}} ; + +lin Raha = mkN "raha" ; +lin Paska = mkN "paska" ; +lin Pallo = mkN "pallo" ; +lin Puhua = mkV "puhu" ; +lin Munia = mkV "muni" ; +lin Sanoa = mkV "sano" ; + +oper mkN : Str -> Noun = \raha -> { + s = table { + NF Sg Nom => raha ; + NF Sg Part => raha + "a" ; + NF Pl Nom => raha + "t" ; + NF Pl Part => Predef.tk 1 raha + "oja" + } + } ; + +oper mkV : Str -> Verb = \puhu -> { + s = table { + VF Sg P1 => puhu + "n" ; + VF Sg P2 => puhu + "t" ; + VF Sg P3 => puhu + Predef.dp 1 puhu ; + VF Pl P1 => puhu + "mme" ; + VF Pl P2 => puhu + "tte" ; + VF Pl P3 => puhu + "vat" + } + } ; + diff --git a/src-3.0/GF/GFCC/doc/gfcc.html b/src-3.0/GF/GFCC/doc/gfcc.html new file mode 100644 index 000000000..8f8c478c0 --- /dev/null +++ b/src-3.0/GF/GFCC/doc/gfcc.html @@ -0,0 +1,809 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> +<HTML> +<HEAD> +<META NAME="generator" CONTENT="http://txt2tags.sf.net"> +<TITLE>The GFCC Grammar Format</TITLE> +</HEAD><BODY BGCOLOR="white" TEXT="black"> +<P ALIGN="center"><CENTER><H1>The GFCC Grammar Format</H1> +<FONT SIZE="4"> +<I>Aarne Ranta</I><BR> +October 5, 2007 +</FONT></CENTER> + +<P> +Author's address: +<A HREF="http://www.cs.chalmers.se/~aarne"><CODE>http://www.cs.chalmers.se/~aarne</CODE></A> +</P> +<P> +History: +</P> +<UL> +<LI>5 Oct 2007: new, better structured GFCC with full expressive power +<LI>19 Oct: translation of lincats, new figures on C++ +<LI>3 Oct 2006: first version +</UL> + +<H2>What is GFCC</H2> +<P> +GFCC is a low-level format for GF grammars. Its aim is to contain the minimum +that is needed to process GF grammars at runtime. This minimality has three +advantages: +</P> +<UL> +<LI>compact grammar files and run-time objects +<LI>time and space efficient processing +<LI>simple definition of interpreters +</UL> + +<P> +Thus we also want to call GFCC the <B>portable grammar format</B>. +</P> +<P> +The idea is that all embedded GF applications use GFCC. +The GF system would be primarily used as a compiler and as a grammar +development tool. +</P> +<P> +Since GFCC is implemented in BNFC, a parser of the format is readily +available for C, C++, C#, Haskell, Java, and OCaml. Also an XML +representation can be generated in BNFC. A +<A HREF="../">reference implementation</A> +of linearization and some other functions has been written in Haskell. +</P> +<H2>GFCC vs. GFC</H2> +<P> +GFCC is aimed to replace GFC as the run-time grammar format. GFC was designed +to be a run-time format, but also to +support separate compilation of grammars, i.e. +to store the results of compiling +individual GF modules. But this means that GFC has to contain extra information, +such as type annotations, which is only needed in compilation and not at +run-time. In particular, the pattern matching syntax and semantics of GFC is +complex and therefore difficult to implement in new platforms. +</P> +<P> +Actually, GFC is planned to be omitted also as the target format of +separate compilation, where plain GF (type annotated and partially evaluated) +will be used instead. GFC provides only marginal advantages as a target format +compared with GF, and it is therefore just extra weight to carry around this +format. +</P> +<P> +The main differences of GFCC compared with GFC (and GF) can be summarized as follows: +</P> +<UL> +<LI>there are no modules, and therefore no qualified names +<LI>a GFCC grammar is multilingual, and consists of a common abstract syntax + together with one concrete syntax per language +<LI>records and tables are replaced by arrays +<LI>record labels and parameter values are replaced by integers +<LI>record projection and table selection are replaced by array indexing +<LI>even though the format does support dependent types and higher-order abstract + syntax, there is no interpreted yet that does this +</UL> + +<P> +Here is an example of a GF grammar, consisting of three modules, +as translated to GFCC. The representations are aligned; thus they do not completely +reflect the order of judgements in GFCC files, which have different orders of +blocks of judgements, and alphabetical sorting. +</P> +<PRE> + grammar Ex(Eng,Swe); + + abstract Ex = { abstract { + cat cat + S ; NP ; VP ; NP[]; S[]; VP[]; + fun fun + Pred : NP -> VP -> S ; Pred=[(($ 0! 1),(($ 1! 0)!($ 0! 0)))]; + She, They : NP ; She=[0,"she"]; + Sleep : VP ; They=[1,"they"]; + Sleep=[["sleeps","sleep"]]; + } } ; + + concrete Eng of Ex = { concrete Eng { + lincat lincat + S = {s : Str} ; S=[()]; + NP = {s : Str ; n : Num} ; NP=[1,()]; + VP = {s : Num => Str} ; VP=[[(),()]]; + param + Num = Sg | Pl ; + lin lin + Pred np vp = { Pred=[(($ 0! 1),(($ 1! 0)!($ 0! 0)))]; + s = np.s ++ vp.s ! np.n} ; + She = {s = "she" ; n = Sg} ; She=[0,"she"]; + They = {s = "they" ; n = Pl} ; They = [1, "they"]; + Sleep = {s = table { Sleep=[["sleeps","sleep"]]; + Sg => "sleeps" ; + Pl => "sleep" + } + } ; + } } ; + + concrete Swe of Ex = { concrete Swe { + lincat lincat + S = {s : Str} ; S=[()]; + NP = {s : Str} ; NP=[()]; + VP = {s : Str} ; VP=[()]; + param + Num = Sg | Pl ; + lin lin + Pred np vp = { Pred = [(($0!0),($1!0))]; + s = np.s ++ vp.s} ; + She = {s = "hon"} ; She = ["hon"]; + They = {s = "de"} ; They = ["de"]; + Sleep = {s = "sover"} ; Sleep = ["sover"]; + } } ; +</PRE> +<P></P> +<H2>The syntax of GFCC files</H2> +<P> +The complete BNFC grammar, from which +the rules in this section are taken, is in the file +<A HREF="../DataGFCC.cf"><CODE>GF/GFCC/GFCC.cf</CODE></A>. +</P> +<H3>Top level</H3> +<P> +A grammar has a header telling the name of the abstract syntax +(often specifying an application domain), and the names of +the concrete languages. The abstract syntax and the concrete +syntaxes themselves follow. +</P> +<PRE> + Grm. Grammar ::= + "grammar" CId "(" [CId] ")" ";" + Abstract ";" + [Concrete] ; + + Abs. Abstract ::= + "abstract" "{" + "flags" [Flag] + "fun" [FunDef] + "cat" [CatDef] + "}" ; + + Cnc. Concrete ::= + "concrete" CId "{" + "flags" [Flag] + "lin" [LinDef] + "oper" [LinDef] + "lincat" [LinDef] + "lindef" [LinDef] + "printname" [LinDef] + "}" ; +</PRE> +<P> +This syntax organizes each module to a sequence of <B>fields</B>, such +as flags, linearizations, operations, linearization types, etc. +It is envisaged that particular applications can ignore some +of the fields, typically so that earlier fields are more +important than later ones. +</P> +<P> +The judgement forms have the following syntax. +</P> +<PRE> + Flg. Flag ::= CId "=" String ; + Cat. CatDef ::= CId "[" [Hypo] "]" ; + Fun. FunDef ::= CId ":" Type "=" Exp ; + Lin. LinDef ::= CId "=" Term ; +</PRE> +<P> +For the run-time system, the reference implementation in Haskell +uses a structure that gives efficient look-up: +</P> +<PRE> + data GFCC = GFCC { + absname :: CId , + cncnames :: [CId] , + abstract :: Abstr , + concretes :: Map CId Concr + } + + data Abstr = Abstr { + aflags :: Map CId String, -- value of a flag + funs :: Map CId (Type,Exp), -- type and def of a fun + cats :: Map CId [Hypo], -- context of a cat + catfuns :: Map CId [CId] -- funs yielding a cat (redundant, for fast lookup) + } + + data Concr = Concr { + flags :: Map CId String, -- value of a flag + lins :: Map CId Term, -- lin of a fun + opers :: Map CId Term, -- oper generated by subex elim + lincats :: Map CId Term, -- lin type of a cat + lindefs :: Map CId Term, -- lin default of a cat + printnames :: Map CId Term -- printname of a cat or a fun + } +</PRE> +<P> +These definitions are from <A HREF="../DataGFCC.hs"><CODE>GF/GFCC/DataGFCC.hs</CODE></A>. +</P> +<P> +Identifiers (<CODE>CId</CODE>) are like <CODE>Ident</CODE> in GF, except that +the compiler produces constants prefixed with <CODE>_</CODE> in +the common subterm elimination optimization. +</P> +<PRE> + token CId (('_' | letter) (letter | digit | '\'' | '_')*) ; +</PRE> +<P></P> +<H3>Abstract syntax</H3> +<P> +Types are first-order function types built from argument type +contexts and value types. +category symbols. Syntax trees (<CODE>Exp</CODE>) are +rose trees with nodes consisting of a head (<CODE>Atom</CODE>) and +bound variables (<CODE>CId</CODE>). +</P> +<PRE> + DTyp. Type ::= "[" [Hypo] "]" CId [Exp] ; + DTr. Exp ::= "[" "(" [CId] ")" Atom [Exp] "]" ; + Hyp. Hypo ::= CId ":" Type ; +</PRE> +<P> +The head Atom is either a function +constant, a bound variable, or a metavariable, or a string, integer, or float +literal. +</P> +<PRE> + AC. Atom ::= CId ; + AS. Atom ::= String ; + AI. Atom ::= Integer ; + AF. Atom ::= Double ; + AM. Atom ::= "?" Integer ; +</PRE> +<P> +The context-free types and trees of the "old GFCC" are special +cases, which can be defined as follows: +</P> +<PRE> + Typ. Type ::= [CId] "->" CId + Typ args val = DTyp [Hyp (CId "_") arg | arg <- args] val + + Tr. Exp ::= "(" CId [Exp] ")" + Tr fun exps = DTr [] fun exps +</PRE> +<P> +To store semantic (<CODE>def</CODE>) definitions by cases, the following expression +form is provided, but it is only meaningful in the last field of a function +declaration in an abstract syntax: +</P> +<PRE> + EEq. Exp ::= "{" [Equation] "}" ; + Equ. Equation ::= [Exp] "->" Exp ; +</PRE> +<P> +Notice that expressions are used to encode patterns. Primitive notions +(the default semantics in GF) are encoded as empty sets of equations +(<CODE>[]</CODE>). For a constructor (canonical form) of a category <CODE>C</CODE>, we +aim to use the encoding as the application <CODE>(_constr C)</CODE>. +</P> +<H3>Concrete syntax</H3> +<P> +Linearization terms (<CODE>Term</CODE>) are built as follows. +Constructor names are shown to make the later code +examples readable. +</P> +<PRE> + R. Term ::= "[" [Term] "]" ; -- array (record/table) + P. Term ::= "(" Term "!" Term ")" ; -- access to field (projection/selection) + S. Term ::= "(" [Term] ")" ; -- concatenated sequence + K. Term ::= Tokn ; -- token + V. Term ::= "$" Integer ; -- argument (subtree) + C. Term ::= Integer ; -- array index (label/parameter value) + FV. Term ::= "[|" [Term] "|]" ; -- free variation + TM. Term ::= "?" ; -- linearization of metavariable +</PRE> +<P> +Tokens are strings or (maybe obsolescent) prefix-dependent +variant lists. +</P> +<PRE> + KS. Tokn ::= String ; + KP. Tokn ::= "[" "pre" [String] "[" [Variant] "]" "]" ; + Var. Variant ::= [String] "/" [String] ; +</PRE> +<P> +Two special forms of terms are introduced by the compiler +as optimizations. They can in principle be eliminated, but +their presence makes grammars much more compact. Their semantics +will be explained in a later section. +</P> +<PRE> + F. Term ::= CId ; -- global constant + W. Term ::= "(" String "+" Term ")" ; -- prefix + suffix table +</PRE> +<P> +There is also a deprecated form of "record parameter alias", +</P> +<PRE> + RP. Term ::= "(" Term "@" Term ")"; -- DEPRECATED +</PRE> +<P> +which will be removed when the migration to new GFCC is complete. +</P> +<H2>The semantics of concrete syntax terms</H2> +<P> +The code in this section is from <A HREF="../Linearize.hs"><CODE>GF/GFCC/Linearize.hs</CODE></A>. +</P> +<H3>Linearization and realization</H3> +<P> +The linearization algorithm is essentially the same as in +GFC: a tree is linearized by evaluating its linearization term +in the environment of the linearizations of the subtrees. +Literal atoms are linearized in the obvious way. +The function also needs to know the language (i.e. concrete syntax) +in which linearization is performed. +</P> +<PRE> + linExp :: GFCC -> CId -> Exp -> Term + linExp gfcc lang tree@(DTr _ at trees) = case at of + AC fun -> comp (Prelude.map lin trees) $ look fun + AS s -> R [kks (show s)] -- quoted + AI i -> R [kks (show i)] + AF d -> R [kks (show d)] + AM -> TM + where + lin = linExp gfcc lang + comp = compute gfcc lang + look = lookLin gfcc lang +</PRE> +<P> +TODO: bindings must be supported. +</P> +<P> +The result of linearization is usually a record, which is realized as +a string using the following algorithm. +</P> +<PRE> + realize :: Term -> String + realize trm = case trm of + R (t:_) -> realize t + S ss -> unwords $ Prelude.map realize ss + K (KS s) -> s + K (KP s _) -> unwords s ---- prefix choice TODO + W s t -> s ++ realize t + FV (t:_) -> realize t + TM -> "?" +</PRE> +<P> +Notice that realization always picks the first field of a record. +If a linearization type has more than one field, the first field +does not necessarily contain the desired string. +Also notice that the order of record fields in GFCC is not necessarily +the same as in GF source. +</P> +<H3>Term evaluation</H3> +<P> +Evaluation follows call-by-value order, with two environments +needed: +</P> +<UL> +<LI>the grammar (a concrete syntax) to give the global constants +<LI>an array of terms to give the subtree linearizations +</UL> + +<P> +The code is presented in one-level pattern matching, to +enable reimplementations in languages that do not permit +deep patterns (such as Java and C++). +</P> +<PRE> + compute :: GFCC -> CId -> [Term] -> Term -> Term + compute gfcc lang args = comp where + comp trm = case trm of + P r p -> proj (comp r) (comp p) + W s t -> W s (comp t) + R ts -> R $ Prelude.map comp ts + V i -> idx args (fromInteger i) -- already computed + F c -> comp $ look c -- not computed (if contains V) + FV ts -> FV $ Prelude.map comp ts + S ts -> S $ Prelude.filter (/= S []) $ Prelude.map comp ts + _ -> trm + + look = lookOper gfcc lang + + idx xs i = xs !! i + + proj r p = case (r,p) of + (_, FV ts) -> FV $ Prelude.map (proj r) ts + (W s t, _) -> kks (s ++ getString (proj t p)) + _ -> comp $ getField r (getIndex p) + + getString t = case t of + K (KS s) -> s + _ -> trace ("ERROR in grammar compiler: string from "++ show t) "ERR" + + getIndex t = case t of + C i -> fromInteger i + RP p _ -> getIndex p + TM -> 0 -- default value for parameter + _ -> trace ("ERROR in grammar compiler: index from " ++ show t) 0 + + getField t i = case t of + R rs -> idx rs i + RP _ r -> getField r i + TM -> TM + _ -> trace ("ERROR in grammar compiler: field from " ++ show t) t +</PRE> +<P></P> +<H3>The special term constructors</H3> +<P> +The three forms introduced by the compiler may a need special +explanation. +</P> +<P> +Global constants +</P> +<PRE> + Term ::= CId ; +</PRE> +<P> +are shorthands for complex terms. They are produced by the +compiler by (iterated) <B>common subexpression elimination</B>. +They are often more powerful than hand-devised code sharing in the source +code. They could be computed off-line by replacing each identifier by +its definition. +</P> +<P> +<B>Prefix-suffix tables</B> +</P> +<PRE> + Term ::= "(" String "+" Term ")" ; +</PRE> +<P> +represent tables of word forms divided to the longest common prefix +and its array of suffixes. In the example grammar above, we have +</P> +<PRE> + Sleep = [("sleep" + ["s",""])] +</PRE> +<P> +which in fact is equal to the array of full forms +</P> +<PRE> + ["sleeps", "sleep"] +</PRE> +<P> +The power of this construction comes from the fact that suffix sets +tend to be repeated in a language, and can therefore be collected +by common subexpression elimination. It is this technique that +explains the used syntax rather than the more accurate +</P> +<PRE> + "(" String "+" [String] ")" +</PRE> +<P> +since we want the suffix part to be a <CODE>Term</CODE> for the optimization to +take effect. +</P> +<H2>Compiling to GFCC</H2> +<P> +Compilation to GFCC is performed by the GF grammar compiler, and +GFCC interpreters need not know what it does. For grammar writers, +however, it might be interesting to know what happens to the grammars +in the process. +</P> +<P> +The compilation phases are the following +</P> +<OL> +<LI>type check and partially evaluate GF source +<LI>create a symbol table mapping the GF parameter and record types to + fixed-size arrays, and parameter values and record labels to integers +<LI>traverse the linearization rules replacing parameters and labels by integers +<LI>reorganize the created GF grammar so that it has just one abstract syntax + and one concrete syntax per language +<LI>TODO: apply UTF8 encoding to the grammar, if not yet applied (this is told by the + <CODE>coding</CODE> flag) +<LI>translate the GF grammar object to a GFCC grammar object, using a simple + compositional mapping +<LI>perform the word-suffix optimization on GFCC linearization terms +<LI>perform subexpression elimination on each concrete syntax module +<LI>print out the GFCC code +</OL> + +<H3>Problems in GFCC compilation</H3> +<P> +Two major problems had to be solved in compiling GF to GFCC: +</P> +<UL> +<LI>consistent order of tables and records, to permit the array translation +<LI>run-time variables in complex parameter values. +</UL> + +<P> +The current implementation is still experimental and may fail +to generate correct code. Any errors remaining are likely to be +related to the two problems just mentioned. +</P> +<P> +The order problem is solved in slightly different ways for tables and records. +In both cases, <B>eta expansion</B> is used to establish a +canonical order. Tables are ordered by applying the preorder induced +by <CODE>param</CODE> definitions. Records are ordered by sorting them by labels. +This means that +e.g. the <CODE>s</CODE> field will in general no longer appear as the first +field, even if it does so in the GF source code. But relying on the +order of fields in a labelled record would be misplaced anyway. +</P> +<P> +The canonical form of records is further complicated by lock fields, +i.e. dummy fields of form <CODE>lock_C = <></CODE>, which are added to grammar +libraries to force intensionality of linearization types. The problem +is that the absence of a lock field only generates a warning, not +an error. Therefore a GF grammar can contain objects of the same +type with and without a lock field. This problem was solved in GFCC +generation by just removing all lock fields (defined as fields whose +type is the empty record type). This has the further advantage of +(slightly) reducing the grammar size. More importantly, it is safe +to remove lock fields, because they are never used in computation, +and because intensional types are only needed in grammars reused +as libraries, not in grammars used at runtime. +</P> +<P> +While the order problem is rather bureaucratic in nature, run-time +variables are an interesting problem. They arise in the presence +of complex parameter values, created by argument-taking constructors +and parameter records. To give an example, consider the GF parameter +type system +</P> +<PRE> + Number = Sg | Pl ; + Person = P1 | P2 | P3 ; + Agr = Ag Number Person ; +</PRE> +<P> +The values can be translated to integers in the expected way, +</P> +<PRE> + Sg = 0, Pl = 1 + P1 = 0, P2 = 1, P3 = 2 + Ag Sg P1 = 0, Ag Sg P2 = 1, Ag Sg P3 = 2, + Ag Pl P1 = 3, Ag Pl P2 = 4, Ag Pl P3 = 5 +</PRE> +<P> +However, an argument of <CODE>Agr</CODE> can be a run-time variable, as in +</P> +<PRE> + Ag np.n P3 +</PRE> +<P> +This expression must first be translated to a case expression, +</P> +<PRE> + case np.n of { + 0 => 2 ; + 1 => 5 + } +</PRE> +<P> +which can then be translated to the GFCC term +</P> +<PRE> + ([2,5] ! ($0 ! $1)) +</PRE> +<P> +assuming that the variable <CODE>np</CODE> is the first argument and that its +<CODE>Number</CODE> field is the second in the record. +</P> +<P> +This transformation of course has to be performed recursively, since +there can be several run-time variables in a parameter value: +</P> +<PRE> + Ag np.n np.p +</PRE> +<P> +A similar transformation would be possible to deal with the double +role of parameter records discussed above. Thus the type +</P> +<PRE> + RNP = {n : Number ; p : Person} +</PRE> +<P> +could be uniformly translated into the set <CODE>{0,1,2,3,4,5}</CODE> +as <CODE>Agr</CODE> above. Selections would be simple instances of indexing. +But any projection from the record should be translated into +a case expression, +</P> +<PRE> + rnp.n ===> + case rnp of { + 0 => 0 ; + 1 => 0 ; + 2 => 0 ; + 3 => 1 ; + 4 => 1 ; + 5 => 1 + } +</PRE> +<P> +To avoid the code bloat resulting from this, we have chosen to +deal with records by a <B>currying</B> transformation: +</P> +<PRE> + table {n : Number ; p : Person} {... ...} + ===> + table Number {Sg => table Person {...} ; table Person {...}} +</PRE> +<P> +This is performed when GFCC is generated. Selections with +records have to be treated likewise, +</P> +<PRE> + t ! r ===> t ! r.n ! r.p +</PRE> +<P></P> +<H3>The representation of linearization types</H3> +<P> +Linearization types (<CODE>lincat</CODE>) are not needed when generating with +GFCC, but they have been added to enable parser generation directly from +GFCC. The linearization type definitions are shown as a part of the +concrete syntax, by using terms to represent types. Here is the table +showing how different linearization types are encoded. +</P> +<PRE> + P* = max(P) -- parameter type + {r1 : T1 ; ... ; rn : Tn}* = [T1*,...,Tn*] -- record + (P => T)* = [T* ,...,T*] -- table, size(P) cases + Str* = () +</PRE> +<P> +For example, the linearization type <CODE>present/CatEng.NP</CODE> is +translated as follows: +</P> +<PRE> + NP = { + a : { -- 6 = 2*3 values + n : {ParamX.Number} ; -- 2 values + p : {ParamX.Person} -- 3 values + } ; + s : {ResEng.Case} => Str -- 3 values + } + + __NP = [[1,2],[(),(),()]] +</PRE> +<P></P> +<H3>Running the compiler and the GFCC interpreter</H3> +<P> +GFCC generation is a part of the +<A HREF="http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html">developers' version</A> +of GF since September 2006. To invoke the compiler, the flag +<CODE>-printer=gfcc</CODE> to the command +<CODE>pm = print_multi</CODE> is used. It is wise to recompile the grammar from +source, since previously compiled libraries may not obey the canonical +order of records. +Here is an example, performed in +<A HREF="../../../../../examples/bronzeage">example/bronzeage</A>. +</P> +<PRE> + i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageEng.gf + i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageGer.gf + strip + pm -printer=gfcc | wf bronze.gfcc +</PRE> +<P> +There is also an experimental batch compiler, which does not use the GFC +format or the record aliases. It can be produced by +</P> +<PRE> + make gfc +</PRE> +<P> +in <CODE>GF/src</CODE>, and invoked by +</P> +<PRE> + gfc --make FILES +</PRE> +<P></P> +<H2>The reference interpreter</H2> +<P> +The reference interpreter written in Haskell consists of the following files: +</P> +<PRE> + -- source file for BNFC + GFCC.cf -- labelled BNF grammar of gfcc + + -- files generated by BNFC + AbsGFCC.hs -- abstrac syntax datatypes + ErrM.hs -- error monad used internally + LexGFCC.hs -- lexer of gfcc files + ParGFCC.hs -- parser of gfcc files and syntax trees + PrintGFCC.hs -- printer of gfcc files and syntax trees + + -- hand-written files + DataGFCC.hs -- grammar datatype, post-parser grammar creation + Linearize.hs -- linearization and evaluation + Macros.hs -- utilities abstracting away from GFCC datatypes + Generate.hs -- random and exhaustive generation, generate-and-test parsing + API.hs -- functionalities accessible in embedded GF applications + Generate.hs -- random and exhaustive generation + Shell.hs -- main function - a simple command interpreter +</PRE> +<P> +It is included in the +<A HREF="http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html">developers' version</A> +of GF, in the subdirectories <A HREF="../"><CODE>GF/src/GF/GFCC</CODE></A> and +<A HREF="../../Devel"><CODE>GF/src/GF/Devel</CODE></A>. +</P> +<P> +As of September 2007, default parsing in main GF uses GFCC (implemented by Krasimir +Angelov). The interpreter uses the relevant modules +</P> +<PRE> + GF/Conversions/SimpleToFCFG.hs -- generate parser from GFCC + GF/Parsing/FCFG.hs -- run the parser +</PRE> +<P></P> +<P> +To compile the interpreter, type +</P> +<PRE> + make gfcc +</PRE> +<P> +in <CODE>GF/src</CODE>. To run it, type +</P> +<PRE> + ./gfcc <GFCC-file> +</PRE> +<P> +The available commands are +</P> +<UL> +<LI><CODE>gr <Cat> <Int></CODE>: generate a number of random trees in category. + and show their linearizations in all languages +<LI><CODE>grt <Cat> <Int></CODE>: generate a number of random trees in category. + and show the trees and their linearizations in all languages +<LI><CODE>gt <Cat> <Int></CODE>: generate a number of trees in category from smallest, + and show their linearizations in all languages +<LI><CODE>gtt <Cat> <Int></CODE>: generate a number of trees in category from smallest, + and show the trees and their linearizations in all languages +<LI><CODE>p <Lang> <Cat> <String></CODE>: parse a string into a set of trees +<LI><CODE>lin <Tree></CODE>: linearize tree in all languages, also showing full records +<LI><CODE>q</CODE>: terminate the system cleanly +</UL> + +<H2>Embedded formats</H2> +<UL> +<LI>JavaScript: compiler of linearization and abstract syntax +<P></P> +<LI>Haskell: compiler of abstract syntax and interpreter with parsing, + linearization, and generation +<P></P> +<LI>C: compiler of linearization (old GFCC) +<P></P> +<LI>C++: embedded interpreter supporting linearization (old GFCC) +</UL> + +<H2>Some things to do</H2> +<P> +Support for dependent types, higher-order abstract syntax, and +semantic definition in GFCC generation and interpreters. +</P> +<P> +Replacing the entire GF shell by one based on GFCC. +</P> +<P> +Interpreter in Java. +</P> +<P> +Hand-written parsers for GFCC grammars to reduce code size +(and efficiency?) of interpreters. +</P> +<P> +Binary format and/or file compression of GFCC output. +</P> +<P> +Syntax editor based on GFCC. +</P> +<P> +Rewriting of resource libraries in order to exploit the +word-suffix sharing better (depth-one tables, as in FM). +</P> + +<!-- html code generated by txt2tags 2.3 (http://txt2tags.sf.net) --> +<!-- cmdline: txt2tags -thtml gfcc.txt --> +</BODY></HTML> diff --git a/src-3.0/GF/GFCC/doc/gfcc.txt b/src-3.0/GF/GFCC/doc/gfcc.txt new file mode 100644 index 000000000..5dcf2fbdc --- /dev/null +++ b/src-3.0/GF/GFCC/doc/gfcc.txt @@ -0,0 +1,712 @@ +The GFCC Grammar Format +Aarne Ranta +December 14, 2007 + +Author's address: +[``http://www.cs.chalmers.se/~aarne`` http://www.cs.chalmers.se/~aarne] + +% to compile: txt2tags -thtml --toc gfcc.txt + +History: +- 14 Dec 2007: simpler, Lisp-like concrete syntax of GFCC +- 5 Oct 2007: new, better structured GFCC with full expressive power +- 19 Oct: translation of lincats, new figures on C++ +- 3 Oct 2006: first version + + +==What is GFCC== + +GFCC is a low-level format for GF grammars. Its aim is to contain the minimum +that is needed to process GF grammars at runtime. This minimality has three +advantages: +- compact grammar files and run-time objects +- time and space efficient processing +- simple definition of interpreters + + +Thus we also want to call GFCC the **portable grammar format**. + +The idea is that all embedded GF applications use GFCC. +The GF system would be primarily used as a compiler and as a grammar +development tool. + +Since GFCC is implemented in BNFC, a parser of the format is readily +available for C, C++, C#, Haskell, Java, and OCaml. Also an XML +representation can be generated in BNFC. A +[reference implementation ../] +of linearization and some other functions has been written in Haskell. + + +==GFCC vs. GFC== + +GFCC is aimed to replace GFC as the run-time grammar format. GFC was designed +to be a run-time format, but also to +support separate compilation of grammars, i.e. +to store the results of compiling +individual GF modules. But this means that GFC has to contain extra information, +such as type annotations, which is only needed in compilation and not at +run-time. In particular, the pattern matching syntax and semantics of GFC is +complex and therefore difficult to implement in new platforms. + +Actually, GFC is planned to be omitted also as the target format of +separate compilation, where plain GF (type annotated and partially evaluated) +will be used instead. GFC provides only marginal advantages as a target format +compared with GF, and it is therefore just extra weight to carry around this +format. + +The main differences of GFCC compared with GFC (and GF) can be +summarized as follows: +- there are no modules, and therefore no qualified names +- a GFCC grammar is multilingual, and consists of a common abstract syntax + together with one concrete syntax per language +- records and tables are replaced by arrays +- record labels and parameter values are replaced by integers +- record projection and table selection are replaced by array indexing +- even though the format does support dependent types and higher-order abstract + syntax, there is no interpreted yet that does this + + + +Here is an example of a GF grammar, consisting of three modules, +as translated to GFCC. The representations are aligned; +thus they do not completely +reflect the order of judgements in GFCC files, which have different orders of +blocks of judgements, and alphabetical sorting. +``` + grammar Ex(Eng,Swe); + +abstract Ex = { abstract { + cat cat + S ; NP ; VP ; NP[]; S[]; VP[]; + fun fun + Pred : NP -> VP -> S ; Pred=[(($ 0! 1),(($ 1! 0)!($ 0! 0)))]; + She, They : NP ; She=[0,"she"]; + Sleep : VP ; They=[1,"they"]; + Sleep=[["sleeps","sleep"]]; +} } ; + +concrete Eng of Ex = { concrete Eng { + lincat lincat + S = {s : Str} ; S=[()]; + NP = {s : Str ; n : Num} ; NP=[1,()]; + VP = {s : Num => Str} ; VP=[[(),()]]; + param + Num = Sg | Pl ; + lin lin + Pred np vp = { Pred=[(($ 0! 1),(($ 1! 0)!($ 0! 0)))]; + s = np.s ++ vp.s ! np.n} ; + She = {s = "she" ; n = Sg} ; She=[0,"she"]; + They = {s = "they" ; n = Pl} ; They = [1, "they"]; + Sleep = {s = table { Sleep=[["sleeps","sleep"]]; + Sg => "sleeps" ; + Pl => "sleep" + } + } ; +} } ; + +concrete Swe of Ex = { concrete Swe { + lincat lincat + S = {s : Str} ; S=[()]; + NP = {s : Str} ; NP=[()]; + VP = {s : Str} ; VP=[()]; + param + Num = Sg | Pl ; + lin lin + Pred np vp = { Pred = [(($0!0),($1!0))]; + s = np.s ++ vp.s} ; + She = {s = "hon"} ; She = ["hon"]; + They = {s = "de"} ; They = ["de"]; + Sleep = {s = "sover"} ; Sleep = ["sover"]; +} } ; +``` + +==The syntax of GFCC files== + +The complete BNFC grammar, from which +the rules in this section are taken, is in the file +[``GF/GFCC/GFCC.cf`` ../DataGFCC.cf]. + + +===Top level=== + +A grammar has a header telling the name of the abstract syntax +(often specifying an application domain), and the names of +the concrete languages. The abstract syntax and the concrete +syntaxes themselves follow. +``` + Grm. Grammar ::= + "grammar" CId "(" [CId] ")" ";" + Abstract ";" + [Concrete] ; + + Abs. Abstract ::= + "abstract" "{" + "flags" [Flag] + "fun" [FunDef] + "cat" [CatDef] + "}" ; + + Cnc. Concrete ::= + "concrete" CId "{" + "flags" [Flag] + "lin" [LinDef] + "oper" [LinDef] + "lincat" [LinDef] + "lindef" [LinDef] + "printname" [LinDef] + "}" ; +``` +This syntax organizes each module to a sequence of **fields**, such +as flags, linearizations, operations, linearization types, etc. +It is envisaged that particular applications can ignore some +of the fields, typically so that earlier fields are more +important than later ones. + +The judgement forms have the following syntax. +``` + Flg. Flag ::= CId "=" String ; + Cat. CatDef ::= CId "[" [Hypo] "]" ; + Fun. FunDef ::= CId ":" Type "=" Exp ; + Lin. LinDef ::= CId "=" Term ; +``` +For the run-time system, the reference implementation in Haskell +uses a structure that gives efficient look-up: +``` + data GFCC = GFCC { + absname :: CId , + cncnames :: [CId] , + abstract :: Abstr , + concretes :: Map CId Concr + } + + data Abstr = Abstr { + aflags :: Map CId String, -- value of a flag + funs :: Map CId (Type,Exp), -- type and def of a fun + cats :: Map CId [Hypo], -- context of a cat + catfuns :: Map CId [CId] -- funs yielding a cat (redundant, for fast lookup) + } + + data Concr = Concr { + flags :: Map CId String, -- value of a flag + lins :: Map CId Term, -- lin of a fun + opers :: Map CId Term, -- oper generated by subex elim + lincats :: Map CId Term, -- lin type of a cat + lindefs :: Map CId Term, -- lin default of a cat + printnames :: Map CId Term -- printname of a cat or a fun + } +``` +These definitions are from [``GF/GFCC/DataGFCC.hs`` ../DataGFCC.hs]. + +Identifiers (``CId``) are like ``Ident`` in GF, except that +the compiler produces constants prefixed with ``_`` in +the common subterm elimination optimization. +``` + token CId (('_' | letter) (letter | digit | '\'' | '_')*) ; +``` + + +===Abstract syntax=== + +Types are first-order function types built from argument type +contexts and value types. +category symbols. Syntax trees (``Exp``) are +rose trees with nodes consisting of a head (``Atom``) and +bound variables (``CId``). +``` + DTyp. Type ::= "[" [Hypo] "]" CId [Exp] ; + DTr. Exp ::= "[" "(" [CId] ")" Atom [Exp] "]" ; + Hyp. Hypo ::= CId ":" Type ; +``` +The head Atom is either a function +constant, a bound variable, or a metavariable, or a string, integer, or float +literal. +``` + AC. Atom ::= CId ; + AS. Atom ::= String ; + AI. Atom ::= Integer ; + AF. Atom ::= Double ; + AM. Atom ::= "?" Integer ; +``` +The context-free types and trees of the "old GFCC" are special +cases, which can be defined as follows: +``` + Typ. Type ::= [CId] "->" CId + Typ args val = DTyp [Hyp (CId "_") arg | arg <- args] val + + Tr. Exp ::= "(" CId [Exp] ")" + Tr fun exps = DTr [] fun exps +``` +To store semantic (``def``) definitions by cases, the following expression +form is provided, but it is only meaningful in the last field of a function +declaration in an abstract syntax: +``` + EEq. Exp ::= "{" [Equation] "}" ; + Equ. Equation ::= [Exp] "->" Exp ; +``` +Notice that expressions are used to encode patterns. Primitive notions +(the default semantics in GF) are encoded as empty sets of equations +(``[]``). For a constructor (canonical form) of a category ``C``, we +aim to use the encoding as the application ``(_constr C)``. + + + +===Concrete syntax=== + +Linearization terms (``Term``) are built as follows. +Constructor names are shown to make the later code +examples readable. +``` + R. Term ::= "[" [Term] "]" ; -- array (record/table) + P. Term ::= "(" Term "!" Term ")" ; -- access to field (projection/selection) + S. Term ::= "(" [Term] ")" ; -- concatenated sequence + K. Term ::= Tokn ; -- token + V. Term ::= "$" Integer ; -- argument (subtree) + C. Term ::= Integer ; -- array index (label/parameter value) + FV. Term ::= "[|" [Term] "|]" ; -- free variation + TM. Term ::= "?" ; -- linearization of metavariable +``` +Tokens are strings or (maybe obsolescent) prefix-dependent +variant lists. +``` + KS. Tokn ::= String ; + KP. Tokn ::= "[" "pre" [String] "[" [Variant] "]" "]" ; + Var. Variant ::= [String] "/" [String] ; +``` +Two special forms of terms are introduced by the compiler +as optimizations. They can in principle be eliminated, but +their presence makes grammars much more compact. Their semantics +will be explained in a later section. +``` + F. Term ::= CId ; -- global constant + W. Term ::= "(" String "+" Term ")" ; -- prefix + suffix table +``` +There is also a deprecated form of "record parameter alias", +``` + RP. Term ::= "(" Term "@" Term ")"; -- DEPRECATED +``` +which will be removed when the migration to new GFCC is complete. + + + +==The semantics of concrete syntax terms== + +The code in this section is from [``GF/GFCC/Linearize.hs`` ../Linearize.hs]. + + +===Linearization and realization=== + +The linearization algorithm is essentially the same as in +GFC: a tree is linearized by evaluating its linearization term +in the environment of the linearizations of the subtrees. +Literal atoms are linearized in the obvious way. +The function also needs to know the language (i.e. concrete syntax) +in which linearization is performed. +``` + linExp :: GFCC -> CId -> Exp -> Term + linExp gfcc lang tree@(DTr _ at trees) = case at of + AC fun -> comp (Prelude.map lin trees) $ look fun + AS s -> R [kks (show s)] -- quoted + AI i -> R [kks (show i)] + AF d -> R [kks (show d)] + AM -> TM + where + lin = linExp gfcc lang + comp = compute gfcc lang + look = lookLin gfcc lang +``` +TODO: bindings must be supported. + +The result of linearization is usually a record, which is realized as +a string using the following algorithm. +``` + realize :: Term -> String + realize trm = case trm of + R (t:_) -> realize t + S ss -> unwords $ Prelude.map realize ss + K (KS s) -> s + K (KP s _) -> unwords s ---- prefix choice TODO + W s t -> s ++ realize t + FV (t:_) -> realize t + TM -> "?" +``` +Notice that realization always picks the first field of a record. +If a linearization type has more than one field, the first field +does not necessarily contain the desired string. +Also notice that the order of record fields in GFCC is not necessarily +the same as in GF source. + + +===Term evaluation=== + +Evaluation follows call-by-value order, with two environments +needed: +- the grammar (a concrete syntax) to give the global constants +- an array of terms to give the subtree linearizations + + +The code is presented in one-level pattern matching, to +enable reimplementations in languages that do not permit +deep patterns (such as Java and C++). +``` +compute :: GFCC -> CId -> [Term] -> Term -> Term +compute gfcc lang args = comp where + comp trm = case trm of + P r p -> proj (comp r) (comp p) + W s t -> W s (comp t) + R ts -> R $ Prelude.map comp ts + V i -> idx args (fromInteger i) -- already computed + F c -> comp $ look c -- not computed (if contains V) + FV ts -> FV $ Prelude.map comp ts + S ts -> S $ Prelude.filter (/= S []) $ Prelude.map comp ts + _ -> trm + + look = lookOper gfcc lang + + idx xs i = xs !! i + + proj r p = case (r,p) of + (_, FV ts) -> FV $ Prelude.map (proj r) ts + (FV ts, _ ) -> FV $ Prelude.map (\t -> proj t p) ts + (W s t, _) -> kks (s ++ getString (proj t p)) + _ -> comp $ getField r (getIndex p) + + getString t = case t of + K (KS s) -> s + _ -> trace ("ERROR in grammar compiler: string from "++ show t) "ERR" + + getIndex t = case t of + C i -> fromInteger i + RP p _ -> getIndex p + TM -> 0 -- default value for parameter + _ -> trace ("ERROR in grammar compiler: index from " ++ show t) 0 + + getField t i = case t of + R rs -> idx rs i + RP _ r -> getField r i + TM -> TM + _ -> trace ("ERROR in grammar compiler: field from " ++ show t) t +``` + +===The special term constructors=== + +The three forms introduced by the compiler may a need special +explanation. + +Global constants +``` + Term ::= CId ; +``` +are shorthands for complex terms. They are produced by the +compiler by (iterated) **common subexpression elimination**. +They are often more powerful than hand-devised code sharing in the source +code. They could be computed off-line by replacing each identifier by +its definition. + +**Prefix-suffix tables** +``` + Term ::= "(" String "+" Term ")" ; +``` +represent tables of word forms divided to the longest common prefix +and its array of suffixes. In the example grammar above, we have +``` + Sleep = [("sleep" + ["s",""])] +``` +which in fact is equal to the array of full forms +``` + ["sleeps", "sleep"] +``` +The power of this construction comes from the fact that suffix sets +tend to be repeated in a language, and can therefore be collected +by common subexpression elimination. It is this technique that +explains the used syntax rather than the more accurate +``` + "(" String "+" [String] ")" +``` +since we want the suffix part to be a ``Term`` for the optimization to +take effect. + + + +==Compiling to GFCC== + +Compilation to GFCC is performed by the GF grammar compiler, and +GFCC interpreters need not know what it does. For grammar writers, +however, it might be interesting to know what happens to the grammars +in the process. + +The compilation phases are the following ++ type check and partially evaluate GF source ++ create a symbol table mapping the GF parameter and record types to + fixed-size arrays, and parameter values and record labels to integers ++ traverse the linearization rules replacing parameters and labels by integers ++ reorganize the created GF grammar so that it has just one abstract syntax + and one concrete syntax per language ++ TODO: apply UTF8 encoding to the grammar, if not yet applied (this is told by the + ``coding`` flag) ++ translate the GF grammar object to a GFCC grammar object, using a simple + compositional mapping ++ perform the word-suffix optimization on GFCC linearization terms ++ perform subexpression elimination on each concrete syntax module ++ print out the GFCC code + + + + +===Problems in GFCC compilation=== + +Two major problems had to be solved in compiling GF to GFCC: +- consistent order of tables and records, to permit the array translation +- run-time variables in complex parameter values. + + +The current implementation is still experimental and may fail +to generate correct code. Any errors remaining are likely to be +related to the two problems just mentioned. + +The order problem is solved in slightly different ways for tables and records. +In both cases, **eta expansion** is used to establish a +canonical order. Tables are ordered by applying the preorder induced +by ``param`` definitions. Records are ordered by sorting them by labels. +This means that +e.g. the ``s`` field will in general no longer appear as the first +field, even if it does so in the GF source code. But relying on the +order of fields in a labelled record would be misplaced anyway. + +The canonical form of records is further complicated by lock fields, +i.e. dummy fields of form ``lock_C = <>``, which are added to grammar +libraries to force intensionality of linearization types. The problem +is that the absence of a lock field only generates a warning, not +an error. Therefore a GF grammar can contain objects of the same +type with and without a lock field. This problem was solved in GFCC +generation by just removing all lock fields (defined as fields whose +type is the empty record type). This has the further advantage of +(slightly) reducing the grammar size. More importantly, it is safe +to remove lock fields, because they are never used in computation, +and because intensional types are only needed in grammars reused +as libraries, not in grammars used at runtime. + +While the order problem is rather bureaucratic in nature, run-time +variables are an interesting problem. They arise in the presence +of complex parameter values, created by argument-taking constructors +and parameter records. To give an example, consider the GF parameter +type system +``` + Number = Sg | Pl ; + Person = P1 | P2 | P3 ; + Agr = Ag Number Person ; +``` +The values can be translated to integers in the expected way, +``` + Sg = 0, Pl = 1 + P1 = 0, P2 = 1, P3 = 2 + Ag Sg P1 = 0, Ag Sg P2 = 1, Ag Sg P3 = 2, + Ag Pl P1 = 3, Ag Pl P2 = 4, Ag Pl P3 = 5 +``` +However, an argument of ``Agr`` can be a run-time variable, as in +``` + Ag np.n P3 +``` +This expression must first be translated to a case expression, +``` + case np.n of { + 0 => 2 ; + 1 => 5 + } +``` +which can then be translated to the GFCC term +``` + ([2,5] ! ($0 ! $1)) +``` +assuming that the variable ``np`` is the first argument and that its +``Number`` field is the second in the record. + +This transformation of course has to be performed recursively, since +there can be several run-time variables in a parameter value: +``` + Ag np.n np.p +``` +A similar transformation would be possible to deal with the double +role of parameter records discussed above. Thus the type +``` + RNP = {n : Number ; p : Person} +``` +could be uniformly translated into the set ``{0,1,2,3,4,5}`` +as ``Agr`` above. Selections would be simple instances of indexing. +But any projection from the record should be translated into +a case expression, +``` + rnp.n ===> + case rnp of { + 0 => 0 ; + 1 => 0 ; + 2 => 0 ; + 3 => 1 ; + 4 => 1 ; + 5 => 1 + } +``` +To avoid the code bloat resulting from this, we have chosen to +deal with records by a **currying** transformation: +``` + table {n : Number ; p : Person} {... ...} + ===> + table Number {Sg => table Person {...} ; table Person {...}} +``` +This is performed when GFCC is generated. Selections with +records have to be treated likewise, +``` + t ! r ===> t ! r.n ! r.p +``` + + +===The representation of linearization types=== + +Linearization types (``lincat``) are not needed when generating with +GFCC, but they have been added to enable parser generation directly from +GFCC. The linearization type definitions are shown as a part of the +concrete syntax, by using terms to represent types. Here is the table +showing how different linearization types are encoded. +``` + P* = max(P) -- parameter type + {r1 : T1 ; ... ; rn : Tn}* = [T1*,...,Tn*] -- record + (P => T)* = [T* ,...,T*] -- table, size(P) cases + Str* = () +``` +For example, the linearization type ``present/CatEng.NP`` is +translated as follows: +``` + NP = { + a : { -- 6 = 2*3 values + n : {ParamX.Number} ; -- 2 values + p : {ParamX.Person} -- 3 values + } ; + s : {ResEng.Case} => Str -- 3 values + } + + __NP = [[1,2],[(),(),()]] +``` + + + + +===Running the compiler and the GFCC interpreter=== + +GFCC generation is a part of the +[developers' version http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html] +of GF since September 2006. To invoke the compiler, the flag +``-printer=gfcc`` to the command +``pm = print_multi`` is used. It is wise to recompile the grammar from +source, since previously compiled libraries may not obey the canonical +order of records. +Here is an example, performed in +[example/bronzeage ../../../../../examples/bronzeage]. +``` + i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageEng.gf + i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageGer.gf + strip + pm -printer=gfcc | wf bronze.gfcc +``` +There is also an experimental batch compiler, which does not use the GFC +format or the record aliases. It can be produced by +``` + make gfc +``` +in ``GF/src``, and invoked by +``` + gfc --make FILES +``` + + + + +==The reference interpreter== + +The reference interpreter written in Haskell consists of the following files: +``` + -- source file for BNFC + GFCC.cf -- labelled BNF grammar of gfcc + + -- files generated by BNFC + AbsGFCC.hs -- abstrac syntax datatypes + ErrM.hs -- error monad used internally + LexGFCC.hs -- lexer of gfcc files + ParGFCC.hs -- parser of gfcc files and syntax trees + PrintGFCC.hs -- printer of gfcc files and syntax trees + + -- hand-written files + DataGFCC.hs -- grammar datatype, post-parser grammar creation + Linearize.hs -- linearization and evaluation + Macros.hs -- utilities abstracting away from GFCC datatypes + Generate.hs -- random and exhaustive generation, generate-and-test parsing + API.hs -- functionalities accessible in embedded GF applications + Generate.hs -- random and exhaustive generation + Shell.hs -- main function - a simple command interpreter +``` +It is included in the +[developers' version http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html] +of GF, in the subdirectories [``GF/src/GF/GFCC`` ../] and +[``GF/src/GF/Devel`` ../../Devel]. + +As of September 2007, default parsing in main GF uses GFCC (implemented by Krasimir +Angelov). The interpreter uses the relevant modules +``` + GF/Conversions/SimpleToFCFG.hs -- generate parser from GFCC + GF/Parsing/FCFG.hs -- run the parser +``` + + +To compile the interpreter, type +``` + make gfcc +``` +in ``GF/src``. To run it, type +``` + ./gfcc <GFCC-file> +``` +The available commands are +- ``gr <Cat> <Int>``: generate a number of random trees in category. + and show their linearizations in all languages +- ``grt <Cat> <Int>``: generate a number of random trees in category. + and show the trees and their linearizations in all languages +- ``gt <Cat> <Int>``: generate a number of trees in category from smallest, + and show their linearizations in all languages +- ``gtt <Cat> <Int>``: generate a number of trees in category from smallest, + and show the trees and their linearizations in all languages +- ``p <Lang> <Cat> <String>``: parse a string into a set of trees +- ``lin <Tree>``: linearize tree in all languages, also showing full records +- ``q``: terminate the system cleanly + + + +==Embedded formats== + +- JavaScript: compiler of linearization and abstract syntax + +- Haskell: compiler of abstract syntax and interpreter with parsing, + linearization, and generation + +- C: compiler of linearization (old GFCC) + +- C++: embedded interpreter supporting linearization (old GFCC) + + + +==Some things to do== + +Support for dependent types, higher-order abstract syntax, and +semantic definition in GFCC generation and interpreters. + +Replacing the entire GF shell by one based on GFCC. + +Interpreter in Java. + +Hand-written parsers for GFCC grammars to reduce code size +(and efficiency?) of interpreters. + +Binary format and/or file compression of GFCC output. + +Syntax editor based on GFCC. + +Rewriting of resource libraries in order to exploit the +word-suffix sharing better (depth-one tables, as in FM). + diff --git a/src-3.0/GF/GFCC/doc/old-GFCC.cf b/src-3.0/GF/GFCC/doc/old-GFCC.cf new file mode 100644 index 000000000..65657a259 --- /dev/null +++ b/src-3.0/GF/GFCC/doc/old-GFCC.cf @@ -0,0 +1,50 @@ +Grm. Grammar ::= Header ";" Abstract ";" [Concrete] ; +Hdr. Header ::= "grammar" CId "(" [CId] ")" ; +Abs. Abstract ::= "abstract" "{" [AbsDef] "}" ; +Cnc. Concrete ::= "concrete" CId "{" [CncDef] "}" ; + +Fun. AbsDef ::= CId ":" Type "=" Exp ; +--AFl. AbsDef ::= "%" CId "=" String ; -- flag +Lin. CncDef ::= CId "=" Term ; +--CFl. CncDef ::= "%" CId "=" String ; -- flag + +Typ. Type ::= [CId] "->" CId ; +Tr. Exp ::= "(" Atom [Exp] ")" ; +AC. Atom ::= CId ; +AS. Atom ::= String ; +AI. Atom ::= Integer ; +AF. Atom ::= Double ; +AM. Atom ::= "?" ; +trA. Exp ::= Atom ; +define trA a = Tr a [] ; + +R. Term ::= "[" [Term] "]" ; -- record/table +P. Term ::= "(" Term "!" Term ")" ; -- projection/selection +S. Term ::= "(" [Term] ")" ; -- sequence with ++ +K. Term ::= Tokn ; -- token +V. Term ::= "$" Integer ; -- argument +C. Term ::= Integer ; -- parameter value/label +F. Term ::= CId ; -- global constant +FV. Term ::= "[|" [Term] "|]" ; -- free variation +W. Term ::= "(" String "+" Term ")" ; -- prefix + suffix table +RP. Term ::= "(" Term "@" Term ")"; -- record parameter alias +TM. Term ::= "?" ; -- lin of metavariable + +L. Term ::= "(" CId "->" Term ")" ; -- lambda abstracted table +BV. Term ::= "#" CId ; -- lambda-bound variable + +KS. Tokn ::= String ; +KP. Tokn ::= "[" "pre" [String] "[" [Variant] "]" "]" ; +Var. Variant ::= [String] "/" [String] ; + + +terminator Concrete ";" ; +terminator AbsDef ";" ; +terminator CncDef ";" ; +separator CId "," ; +separator Term "," ; +terminator Exp "" ; +terminator String "" ; +separator Variant "," ; + +token CId (('_' | letter) (letter | digit | '\'' | '_')*) ; diff --git a/src-3.0/GF/GFCC/doc/old-gfcc.txt b/src-3.0/GF/GFCC/doc/old-gfcc.txt new file mode 100644 index 000000000..6ffd9bd64 --- /dev/null +++ b/src-3.0/GF/GFCC/doc/old-gfcc.txt @@ -0,0 +1,656 @@ +The GFCC Grammar Format +Aarne Ranta +October 19, 2006 + +Author's address: +[``http://www.cs.chalmers.se/~aarne`` http://www.cs.chalmers.se/~aarne] + +% to compile: txt2tags -thtml --toc gfcc.txt + +History: +- 19 Oct: translation of lincats, new figures on C++ +- 3 Oct 2006: first version + + +==What is GFCC== + +GFCC is a low-level format for GF grammars. Its aim is to contain the minimum +that is needed to process GF grammars at runtime. This minimality has three +advantages: +- compact grammar files and run-time objects +- time and space efficient processing +- simple definition of interpreters + + +The idea is that all embedded GF applications are compiled to GFCC. +The GF system would be primarily used as a compiler and as a grammar +development tool. + +Since GFCC is implemented in BNFC, a parser of the format is readily +available for C, C++, Haskell, Java, and OCaml. Also an XML +representation is generated in BNFC. A +[reference implementation ../] +of linearization and some other functions has been written in Haskell. + + +==GFCC vs. GFC== + +GFCC is aimed to replace GFC as the run-time grammar format. GFC was designed +to be a run-time format, but also to +support separate compilation of grammars, i.e. +to store the results of compiling +individual GF modules. But this means that GFC has to contain extra information, +such as type annotations, which is only needed in compilation and not at +run-time. In particular, the pattern matching syntax and semantics of GFC is +complex and therefore difficult to implement in new platforms. + +The main differences of GFCC compared with GFC can be summarized as follows: +- there are no modules, and therefore no qualified names +- a GFCC grammar is multilingual, and consists of a common abstract syntax + together with one concrete syntax per language +- records and tables are replaced by arrays +- record labels and parameter values are replaced by integers +- record projection and table selection are replaced by array indexing +- there is (so far) no support for dependent types or higher-order abstract + syntax (which would be easy to add, but make interpreters much more difficult + to write) + + +Here is an example of a GF grammar, consisting of three modules, +as translated to GFCC. The representations are aligned, with the exceptions +due to the alphabetical sorting of GFCC grammars. +``` + grammar Ex(Eng,Swe); + +abstract Ex = { abstract { + cat + S ; NP ; VP ; + fun + Pred : NP -> VP -> S ; Pred : NP,VP -> S = (Pred); + She, They : NP ; She : -> NP = (She); + Sleep : VP ; Sleep : -> VP = (Sleep); + They : -> NP = (They); +} } ; + +concrete Eng of Ex = { concrete Eng { + lincat + S = {s : Str} ; + NP = {s : Str ; n : Num} ; + VP = {s : Num => Str} ; + param + Num = Sg | Pl ; + lin + Pred np vp = { Pred = [(($0!1),(($1!0)!($0!0)))]; + s = np.s ++ vp.s ! np.n} ; + She = {s = "she" ; n = Sg} ; She = [0, "she"]; + They = {s = "they" ; n = Pl} ; + Sleep = {s = table { Sleep = [("sleep" + ["s",""])]; + Sg => "sleeps" ; + Pl => "sleep" They = [1, "they"]; + } } ; + } ; +} + +concrete Swe of Ex = { concrete Swe { + lincat + S = {s : Str} ; + NP = {s : Str} ; + VP = {s : Str} ; + param + Num = Sg | Pl ; + lin + Pred np vp = { Pred = [(($0!0),($1!0))]; + s = np.s ++ vp.s} ; + She = {s = "hon"} ; She = ["hon"]; + They = {s = "de"} ; They = ["de"]; + Sleep = {s = "sover"} ; Sleep = ["sover"]; +} } ; +``` + +==The syntax of GFCC files== + +===Top level=== + +A grammar has a header telling the name of the abstract syntax +(often specifying an application domain), and the names of +the concrete languages. The abstract syntax and the concrete +syntaxes themselves follow. +``` + Grammar ::= Header ";" Abstract ";" [Concrete] ; + Header ::= "grammar" CId "(" [CId] ")" ; + Abstract ::= "abstract" "{" [AbsDef] "}" ; + Concrete ::= "concrete" CId "{" [CncDef] "}" ; +``` +Abstract syntax judgements give typings and semantic definitions. +Concrete syntax judgements give linearizations. +``` + AbsDef ::= CId ":" Type "=" Exp ; + CncDef ::= CId "=" Term ; +``` +Also flags are possible, local to each "module" (i.e. abstract and concretes). +``` + AbsDef ::= "%" CId "=" String ; + CncDef ::= "%" CId "=" String ; +``` +For the run-time system, the reference implementation in Haskell +uses a structure that gives efficient look-up: +``` + data GFCC = GFCC { + absname :: CId , + cncnames :: [CId] , + abstract :: Abstr , + concretes :: Map CId Concr + } + + data Abstr = Abstr { + funs :: Map CId Type, -- find the type of a fun + cats :: Map CId [CId] -- find the funs giving a cat + } + + type Concr = Map CId Term +``` + + +===Abstract syntax=== + +Types are first-order function types built from +category symbols. Syntax trees (``Exp``) are +rose trees with the head (``Atom``) either a function +constant, a metavariable, or a string, integer, or float +literal. +``` + Type ::= [CId] "->" CId ; + Exp ::= "(" Atom [Exp] ")" ; + Atom ::= CId ; -- function constant + Atom ::= "?" ; -- metavariable + Atom ::= String ; -- string literal + Atom ::= Integer ; -- integer literal + Atom ::= Double ; -- float literal +``` + + +===Concrete syntax=== + +Linearization terms (``Term``) are built as follows. +Constructor names are shown to make the later code +examples readable. +``` + R. Term ::= "[" [Term] "]" ; -- array + P. Term ::= "(" Term "!" Term ")" ; -- access to indexed field + S. Term ::= "(" [Term] ")" ; -- sequence with ++ + K. Term ::= Tokn ; -- token + V. Term ::= "$" Integer ; -- argument + C. Term ::= Integer ; -- array index + FV. Term ::= "[|" [Term] "|]" ; -- free variation + TM. Term ::= "?" ; -- linearization of metavariable +``` +Tokens are strings or (maybe obsolescent) prefix-dependent +variant lists. +``` + KS. Tokn ::= String ; + KP. Tokn ::= "[" "pre" [String] "[" [Variant] "]" "]" ; + Var. Variant ::= [String] "/" [String] ; +``` +Three special forms of terms are introduced by the compiler +as optimizations. They can in principle be eliminated, but +their presence makes grammars much more compact. Their semantics +will be explained in a later section. +``` + F. Term ::= CId ; -- global constant + W. Term ::= "(" String "+" Term ")" ; -- prefix + suffix table + RP. Term ::= "(" Term "@" Term ")"; -- record parameter alias +``` +Identifiers are like ``Ident`` in GF and GFC, except that +the compiler produces constants prefixed with ``_`` in +the common subterm elimination optimization. +``` + token CId (('_' | letter) (letter | digit | '\'' | '_')*) ; +``` + + +==The semantics of concrete syntax terms== + +===Linearization and realization=== + +The linearization algorithm is essentially the same as in +GFC: a tree is linearized by evaluating its linearization term +in the environment of the linearizations of the subtrees. +Literal atoms are linearized in the obvious way. +The function also needs to know the language (i.e. concrete syntax) +in which linearization is performed. +``` + linExp :: GFCC -> CId -> Exp -> Term + linExp mcfg lang tree@(Tr at trees) = case at of + AC fun -> comp (Prelude.map lin trees) $ look fun + AS s -> R [kks (show s)] -- quoted + AI i -> R [kks (show i)] + AF d -> R [kks (show d)] + AM -> TM + where + lin = linExp mcfg lang + comp = compute mcfg lang + look = lookLin mcfg lang +``` +The result of linearization is usually a record, which is realized as +a string using the following algorithm. +``` + realize :: Term -> String + realize trm = case trm of + R (t:_) -> realize t + S ss -> unwords $ Prelude.map realize ss + K (KS s) -> s + K (KP s _) -> unwords s ---- prefix choice TODO + W s t -> s ++ realize t + FV (t:_) -> realize t + TM -> "?" +``` +Since the order of record fields is not necessarily +the same as in GF source, +this realization does not work securely for +categories whose lincats more than one field. + + +===Term evaluation=== + +Evaluation follows call-by-value order, with two environments +needed: +- the grammar (a concrete syntax) to give the global constants +- an array of terms to give the subtree linearizations + + +The code is presented in one-level pattern matching, to +enable reimplementations in languages that do not permit +deep patterns (such as Java and C++). +``` +compute :: GFCC -> CId -> [Term] -> Term -> Term +compute mcfg lang args = comp where + comp trm = case trm of + P r p -> proj (comp r) (comp p) + RP i t -> RP (comp i) (comp t) + W s t -> W s (comp t) + R ts -> R $ Prelude.map comp ts + V i -> idx args (fromInteger i) -- already computed + F c -> comp $ look c -- not computed (if contains V) + FV ts -> FV $ Prelude.map comp ts + S ts -> S $ Prelude.filter (/= S []) $ Prelude.map comp ts + _ -> trm + + look = lookLin mcfg lang + + idx xs i = xs !! i + + proj r p = case (r,p) of + (_, FV ts) -> FV $ Prelude.map (proj r) ts + (W s t, _) -> kks (s ++ getString (proj t p)) + _ -> comp $ getField r (getIndex p) + + getString t = case t of + K (KS s) -> s + _ -> trace ("ERROR in grammar compiler: string from "++ show t) "ERR" + + getIndex t = case t of + C i -> fromInteger i + RP p _ -> getIndex p + TM -> 0 -- default value for parameter + _ -> trace ("ERROR in grammar compiler: index from " ++ show t) 0 + + getField t i = case t of + R rs -> idx rs i + RP _ r -> getField r i + TM -> TM + _ -> trace ("ERROR in grammar compiler: field from " ++ show t) t +``` + +===The special term constructors=== + +The three forms introduced by the compiler may a need special +explanation. + +Global constants +``` + Term ::= CId ; +``` +are shorthands for complex terms. They are produced by the +compiler by (iterated) common subexpression elimination. +They are often more powerful than hand-devised code sharing in the source +code. They could be computed off-line by replacing each identifier by +its definition. + +Prefix-suffix tables +``` + Term ::= "(" String "+" Term ")" ; +``` +represent tables of word forms divided to the longest common prefix +and its array of suffixes. In the example grammar above, we have +``` + Sleep = [("sleep" + ["s",""])] +``` +which in fact is equal to the array of full forms +``` + ["sleeps", "sleep"] +``` +The power of this construction comes from the fact that suffix sets +tend to be repeated in a language, and can therefore be collected +by common subexpression elimination. It is this technique that +explains the used syntax rather than the more accurate +``` + "(" String "+" [String] ")" +``` +since we want the suffix part to be a ``Term`` for the optimization to +take effect. + +The most curious construct of GFCC is the parameter array alias, +``` + Term ::= "(" Term "@" Term ")"; +``` +This form is used as the value of parameter records, such as the type +``` + {n : Number ; p : Person} +``` +The problem with parameter records is their double role. +They can be used like parameter values, as indices in selection, +``` + VP.s ! {n = Sg ; p = P3} +``` +but also as records, from which parameters can be projected: +``` + {n = Sg ; p = P3}.n +``` +Whichever use is selected as primary, a prohibitively complex +case expression must be generated at compilation to GFCC to get the +other use. The adopted +solution is to generate a pair containing both a parameter value index +and an array of indices of record fields. For instance, if we have +``` + param Number = Sg | Pl ; Person = P1 | P2 | P3 ; +``` +we get the encoding +``` + {n = Sg ; p = P3} ---> (2 @ [0,2]) +``` +The GFCC computation rules are essentially +``` + (t ! (i @ _)) = (t ! i) + ((_ @ r) ! j) =(r ! j) +``` + + +==Compiling to GFCC== + +Compilation to GFCC is performed by the GF grammar compiler, and +GFCC interpreters need not know what it does. For grammar writers, +however, it might be interesting to know what happens to the grammars +in the process. + +The compilation phases are the following ++ translate GF source to GFC, as always in GF ++ undo GFC back-end optimizations ++ perform the ``values`` optimization to normalize tables ++ create a symbol table mapping the GFC parameter and record types to + fixed-size arrays, and parameter values and record labels to integers ++ traverse the linearization rules replacing parameters and labels by integers ++ reorganize the created GFC grammar so that it has just one abstract syntax + and one concrete syntax per language ++ apply UTF8 encoding to the grammar, if not yet applied (this is told by the + ``coding`` flag) ++ translate the GFC syntax tree to a GFCC syntax tree, using a simple + compositional mapping ++ perform the word-suffix optimization on GFCC linearization terms ++ perform subexpression elimination on each concrete syntax module ++ print out the GFCC code + + +Notice that a major part of the compilation is done within GFC, so that +GFC-related tasks (such as parser generation) could be performed by +using the old algorithms. + + +===Problems in GFCC compilation=== + +Two major problems had to be solved in compiling GFC to GFCC: +- consistent order of tables and records, to permit the array translation +- run-time variables in complex parameter values. + + +The current implementation is still experimental and may fail +to generate correct code. Any errors remaining are likely to be +related to the two problems just mentioned. + +The order problem is solved in different ways for tables and records. +For tables, the ``values`` optimization of GFC already manages to +maintain a canonical order. But this order can be destroyed by the +``share`` optimization. To make sure that GFCC compilation works properly, +it is safest to recompile the GF grammar by using the ``values`` +optimization flag. + +Records can be canonically ordered by sorting them by labels. +In fact, this was done in connection of the GFCC work as a part +of the GFC generation, to guarantee consistency. This means that +e.g. the ``s`` field will in general no longer appear as the first +field, even if it does so in the GF source code. But relying on the +order of fields in a labelled record would be misplaced anyway. + +The canonical form of records is further complicated by lock fields, +i.e. dummy fields of form ``lock_C = <>``, which are added to grammar +libraries to force intensionality of linearization types. The problem +is that the absence of a lock field only generates a warning, not +an error. Therefore a GFC grammar can contain objects of the same +type with and without a lock field. This problem was solved in GFCC +generation by just removing all lock fields (defined as fields whose +type is the empty record type). This has the further advantage of +(slightly) reducing the grammar size. More importantly, it is safe +to remove lock fields, because they are never used in computation, +and because intensional types are only needed in grammars reused +as libraries, not in grammars used at runtime. + +While the order problem is rather bureaucratic in nature, run-time +variables are an interesting problem. They arise in the presence +of complex parameter values, created by argument-taking constructors +and parameter records. To give an example, consider the GF parameter +type system +``` + Number = Sg | Pl ; + Person = P1 | P2 | P3 ; + Agr = Ag Number Person ; +``` +The values can be translated to integers in the expected way, +``` + Sg = 0, Pl = 1 + P1 = 0, P2 = 1, P3 = 2 + Ag Sg P1 = 0, Ag Sg P2 = 1, Ag Sg P3 = 2, + Ag Pl P1 = 3, Ag Pl P2 = 4, Ag Pl P3 = 5 +``` +However, an argument of ``Agr`` can be a run-time variable, as in +``` + Ag np.n P3 +``` +This expression must first be translated to a case expression, +``` + case np.n of { + 0 => 2 ; + 1 => 5 + } +``` +which can then be translated to the GFCC term +``` + ([2,5] ! ($0 ! $1)) +``` +assuming that the variable ``np`` is the first argument and that its +``Number`` field is the second in the record. + +This transformation of course has to be performed recursively, since +there can be several run-time variables in a parameter value: +``` + Ag np.n np.p +``` +A similar transformation would be possible to deal with the double +role of parameter records discussed above. Thus the type +``` + RNP = {n : Number ; p : Person} +``` +could be uniformly translated into the set ``{0,1,2,3,4,5}`` +as ``Agr`` above. Selections would be simple instances of indexing. +But any projection from the record should be translated into +a case expression, +``` + rnp.n ===> + case rnp of { + 0 => 0 ; + 1 => 0 ; + 2 => 0 ; + 3 => 1 ; + 4 => 1 ; + 5 => 1 + } +``` +To avoid the code bloat resulting from this, we chose the alias representation +which is easy enough to deal with in interpreters. + + +===The representation of linearization types=== + +Linearization types (``lincat``) are not needed when generating with +GFCC, but they have been added to enable parser generation directly from +GFCC. The linearization type definitions are shown as a part of the +concrete syntax, by using terms to represent types. Here is the table +showing how different linearization types are encoded. +``` + P* = size(P) -- parameter type + {_ : I ; __ : R}* = (I* @ R*) -- record of parameters + {r1 : T1 ; ... ; rn : Tn}* = [T1*,...,Tn*] -- other record + (P => T)* = [T* ,...,T*] -- size(P) times + Str* = () +``` +The category symbols are prefixed with two underscores (``__``). +For example, the linearization type ``present/CatEng.NP`` is +translated as follows: +``` + NP = { + a : { -- 6 = 2*3 values + n : {ParamX.Number} ; -- 2 values + p : {ParamX.Person} -- 3 values + } ; + s : {ResEng.Case} => Str -- 3 values + } + + __NP = [(6@[2,3]),[(),(),()]] +``` + + + + +===Running the compiler and the GFCC interpreter=== + +GFCC generation is a part of the +[developers' version http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html] +of GF since September 2006. To invoke the compiler, the flag +``-printer=gfcc`` to the command +``pm = print_multi`` is used. It is wise to recompile the grammar from +source, since previously compiled libraries may not obey the canonical +order of records. To ``strip`` the grammar before +GFCC translation removes unnecessary interface references. +Here is an example, performed in +[example/bronzeage ../../../../../examples/bronzeage]. +``` + i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageEng.gf + i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageGer.gf + strip + pm -printer=gfcc | wf bronze.gfcc +``` + + + +==The reference interpreter== + +The reference interpreter written in Haskell consists of the following files: +``` + -- source file for BNFC + GFCC.cf -- labelled BNF grammar of gfcc + + -- files generated by BNFC + AbsGFCC.hs -- abstrac syntax of gfcc + ErrM.hs -- error monad used internally + LexGFCC.hs -- lexer of gfcc files + ParGFCC.hs -- parser of gfcc files and syntax trees + PrintGFCC.hs -- printer of gfcc files and syntax trees + + -- hand-written files + DataGFCC.hs -- post-parser grammar creation, linearization and evaluation + GenGFCC.hs -- random and exhaustive generation, generate-and-test parsing + RunGFCC.hs -- main function - a simple command interpreter +``` +It is included in the +[developers' version http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html] +of GF, in the subdirectory [``GF/src/GF/Canon/GFCC`` ../]. + +To compile the interpreter, type +``` + make gfcc +``` +in ``GF/src``. To run it, type +``` + ./gfcc <GFCC-file> +``` +The available commands are +- ``gr <Cat> <Int>``: generate a number of random trees in category. + and show their linearizations in all languages +- ``grt <Cat> <Int>``: generate a number of random trees in category. + and show the trees and their linearizations in all languages +- ``gt <Cat> <Int>``: generate a number of trees in category from smallest, + and show their linearizations in all languages +- ``gtt <Cat> <Int>``: generate a number of trees in category from smallest, + and show the trees and their linearizations in all languages +- ``p <Int> <Cat> <String>``: "parse", i.e. generate trees until match or + until the given number have been generated +- ``<Tree>``: linearize tree in all languages, also showing full records +- ``quit``: terminate the system cleanly + + +==Interpreter in C++== + +A base-line interpreter in C++ has been started. +Its main functionality is random generation of trees and linearization of them. + +Here are some results from running the different interpreters, compared +to running the same grammar in GF, saved in ``.gfcm`` format. +The grammar contains the English, German, and Norwegian +versions of Bronzeage. The experiment was carried out on +Ubuntu Linux laptop with 1.5 GHz Intel centrino processor. + +|| | GF | gfcc(hs) | gfcc++ | +| program size | 7249k | 803k | 113k +| grammar size | 336k | 119k | 119k +| read grammar | 1150ms | 510ms | 100ms +| generate 222 | 9500ms | 450ms | 800ms +| memory | 21M | 10M | 20M + + + +To summarize: +- going from GF to gfcc is a major win in both code size and efficiency +- going from Haskell to C++ interpreter is not a win yet, because of a space + leak in the C++ version + + + +==Some things to do== + +Interpreter in Java. + +Parsing via MCFG +- the FCFG format can possibly be simplified +- parser grammars should be saved in files to make interpreters easier + + +Hand-written parsers for GFCC grammars to reduce code size +(and efficiency?) of interpreters. + +Binary format and/or file compression of GFCC output. + +Syntax editor based on GFCC. + +Rewriting of resource libraries in order to exploit the +word-suffix sharing better (depth-one tables, as in FM). + + + diff --git a/src-3.0/GF/GFCC/doc/syntax.txt b/src-3.0/GF/GFCC/doc/syntax.txt new file mode 100644 index 000000000..db8f7c149 --- /dev/null +++ b/src-3.0/GF/GFCC/doc/syntax.txt @@ -0,0 +1,180 @@ +GFCC Syntax + + +==Syntax of GFCC files== + +The parser syntax is very simple, as defined in BNF: +``` + Grm. Grammar ::= [RExp] ; + + App. RExp ::= "(" CId [RExp] ")" ; + AId. RExp ::= CId ; + AInt. RExp ::= Integer ; + AStr. RExp ::= String ; + AFlt. RExp ::= Double ; + AMet. RExp ::= "?" ; + + terminator RExp "" ; + + token CId (('_' | letter) (letter | digit | '\'' | '_')*) ; +``` +While a parser and a printer can be generated for many languages +from this grammar by using the BNF Converter, a parser is also +easy to write by hand using recursive descent. + + +==Syntax of well-formed GFCC code== + +Here is a summary of well-formed syntax, +with a comment on the semantics of each construction. +``` + Grammar ::= + ("grammar" CId CId*) -- abstract syntax name and concrete syntax names + "(" "flags" Flag* ")" -- global and abstract flags + "(" "abstract" Abstract ")" -- abstract syntax + "(" "concrete" Concrete* ")" -- concrete syntaxes + + Abstract ::= + "(" "fun" FunDef* ")" -- function definitions + "(" "cat" CatDef* ")" -- category definitions + + Concrete ::= + "(" CId -- language name + "flags" Flag* -- concrete flags + "lin" LinDef* -- linearization rules + "oper" LinDef* -- operations (macros) + "lincat" LinDef* -- linearization type definitions + "lindef" LinDef* -- linearization default definitions + "printname" LinDef* -- printname definitions + "param" LinDef* -- lincats with labels and parameter value names + ")" + + Flag ::= "(" CId String ")" -- flag and value + FunDef ::= "(" CId Type Exp ")" -- function, type, and definition + CatDef ::= "(" CId Hypo* ")" -- category and context + LinDef ::= "(" CId Term ")" -- function and definition + + Type ::= + "(" CId -- value category + "(" "H" Hypo* ")" -- argument context + "(" "X" Exp* ")" ")" -- arguments (of dependent value type) + + Exp ::= + "(" CId -- function + "(" "B" CId* ")" -- bindings + "(" "X" Exp* ")" ")" -- arguments + | CId -- variable + | "?" -- metavariable + | "(" "Eq" Equation* ")" -- group of pattern equations + | Integer -- integer literal (non-negative) + | Float -- floating-point literal (non-negative) + | String -- string literal (in double quotes) + + Hypo ::= "(" CId Type ")" -- variable and type + + Equation ::= "(" "E" Exp Exp* ")" -- value and pattern list + + Term ::= + "(" "R" Term* ")" -- array (record or table) + | "(" "S" Term* ")" -- concatenated sequence + | "(" "FV" Term* ")" -- free variant list + | "(" "P" Term Term ")" -- access to index (projection or selection) + | "(" "W" String Term ")" -- token prefix with suffix list + | "(" "A" Integer ")" -- pointer to subtree + | String -- token (in double quotes) + | Integer -- index in array + | CId -- macro constant + | "?" -- metavariable +``` + + +==GFCC interpreter== + +The first phase in interpreting GFCC is to parse a GFCC file and +build an internal abstract syntax representation, as specified +in the previous section. + +With this representation, linearization can be performed by +a straightforward function from expressions (``Exp``) to terms +(``Term``). All expressions except groups of pattern equations +can be linearized. + +Here is a reference Haskell implementation of linearization: +``` + linExp :: GFCC -> CId -> Exp -> Term + linExp gfcc lang tree@(DTr _ at trees) = case at of + AC fun -> comp (map lin trees) $ look fun + AS s -> R [K (show s)] -- quoted + AI i -> R [K (show i)] + AF d -> R [K (show d)] + AM -> TM + where + lin = linExp gfcc lang + comp = compute gfcc lang + look = lookLin gfcc lang +``` +TODO: bindings must be supported. + +Terms resulting from linearization are evaluated in +call-by-value order, with two environments needed: +- the grammar (a concrete syntax) to give the global constants +- an array of terms to give the subtree linearizations + + +The Haskell implementation works as follows: +``` +compute :: GFCC -> CId -> [Term] -> Term -> Term +compute gfcc lang args = comp where + comp trm = case trm of + P r p -> proj (comp r) (comp p) + W s t -> W s (comp t) + R ts -> R $ map comp ts + V i -> idx args (fromInteger i) -- already computed + F c -> comp $ look c -- not computed (if contains V) + FV ts -> FV $ Prelude.map comp ts + S ts -> S $ Prelude.filter (/= S []) $ Prelude.map comp ts + _ -> trm + + look = lookOper gfcc lang + + idx xs i = xs !! i + + proj r p = case (r,p) of + (_, FV ts) -> FV $ Prelude.map (proj r) ts + (FV ts, _ ) -> FV $ Prelude.map (\t -> proj t p) ts + (W s t, _) -> kks (s ++ getString (proj t p)) + _ -> comp $ getField r (getIndex p) + + getString t = case t of + K (KS s) -> s + _ -> trace ("ERROR in grammar compiler: string from "++ show t) "ERR" + + getIndex t = case t of + C i -> fromInteger i + RP p _ -> getIndex p + TM -> 0 -- default value for parameter + _ -> trace ("ERROR in grammar compiler: index from " ++ show t) 0 + + getField t i = case t of + R rs -> idx rs i + RP _ r -> getField r i + TM -> TM + _ -> trace ("ERROR in grammar compiler: field from " ++ show t) t +``` +The result of linearization is usually a record, which is realized as +a string using the following algorithm. +``` + realize :: Term -> String + realize trm = case trm of + R (t:_) -> realize t + S ss -> unwords $ map realize ss + K s -> s + W s t -> s ++ realize t + FV (t:_) -> realize t -- TODO: all variants + TM -> "?" +``` +Notice that realization always picks the first field of a record. +If a linearization type has more than one field, the first field +does not necessarily contain the desired string. +Also notice that the order of record fields in GFCC is not necessarily +the same as in GF source. |
