---------------------------------------------------------------------- -- | -- Module : Treebank -- Maintainer : AR -- Stability : (stable) -- Portability : (portable) -- -- Generate multilingual treebanks. AR 8\/2\/2006 -- -- (c) Aarne Ranta 2006 under GNU GPL -- -- Purpose: to generate treebanks. ----------------------------------------------------------------------------- module GF.UseGrammar.Treebank ( mkTreebank, testTreebank, treesTreebank, getTreebank, lookupTreebank, pre2treebank ) where import GF.Compile.ShellState import GF.UseGrammar.Linear (linTree2string) import GF.UseGrammar.Custom import GF.UseGrammar.GetTree (string2tree) import GF.Grammar.TypeCheck (annotate) import GF.Canon.CMacros (noMark) import GF.Grammar.Grammar (Trm) import GF.Grammar.MMacros (exp2tree) import GF.Grammar.Macros (zIdent) import GF.Grammar.PrGrammar (prt_) import GF.Grammar.Values (tree2exp) import GF.Data.Operations import GF.Infra.Option import qualified GF.Grammar.Abstract as A import qualified Data.Map as M -- Generate a treebank with a multilingual grammar. AR 8/2/2006 -- (c) Aarne Ranta 2006 under GNU GPL -- | the main functions mkTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res mkTreebank opts sh com trees = putInXML opts "treebank" comm (concatMap mkItem tris) where mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t ++ concatMap (mkLin t) langs) -- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs) mkTree t = putInXML opts "tree" [] (puts $ showTree t) mkLin t lg = putInXML opts "lin" (lang lg) (puts $ linearize sh lg t) langs = [prt_ l | l <- allLanguages sh] comm = "" --- " command=" ++ show com +++ "abstract=" ++ show abstr abstr = "" --- "Abs" ---- cat i = " number=" ++ show (show i) --- " cat=" ++ show "S" ---- lang lg = " lang=" ++ show (prt_ (zIdent lg)) tris = zip trees [1..] testTreebank :: Options -> ShellState -> String -> Res testTreebank opts sh = putInXML opts "testtreebank" [] . concatMap testOne . getTreebanks . lines where testOne (e,lang,str0) = do let tr = annot gr e let str = linearize sh lang tr if str == str0 then ret else putInXML opts "diff" [] $ concat [ putInXML opts "tree" [] (puts $ showTree tr), putInXML opts "old" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str0, putInXML opts "new" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str ] gr = firstStateGrammar sh treesTreebank :: Options -> String -> [String] treesTreebank _ = terms . getTreebank . lines where terms ts = [t | (t,_) <- ts] -- string vs. IO type Res = [String] -- IO () puts :: String -> Res puts = return -- putStrLn ret = [] -- return () -- type PreTreebank = [(String,[(String,String)])] getTreebanks :: [String] -> [(String,String,String)] getTreebanks = concatMap grps . getTreebank where grps (t,lls) = [(t,x,y) | (x,y) <- lls] getTreebank :: [String] -> PreTreebank getTreebank ll = case ll of l:ls@(_:_:_) -> let (l1,l2) = getItem ls (tr,lins) = getTree l1 lglins = getLins lins in (tr,lglins) : getTreebank l2 _ -> [] where getItem = span ((/=" String -> [(String,String)] lookupTreebank tb s = maybe [] id $ M.lookup s tb pre2treebank :: PreTreebank -> Treebank pre2treebank tb = M.fromListWith (++) [(s,[(l,t)]) | (t,ls) <- tb, (l,s) <- ls] annot :: StateGrammar -> String -> A.Tree annot gr s = errVal (error "illegal tree") $ do let t = tree2exp $ string2tree gr s annotate (grammar gr) t putInXML :: Options -> String -> String -> Res -> Res putInXML opts tag attrs io = (ifXML $ puts $ tagXML $ tag ++ attrs) ++ io ++ (ifXML $ puts $ tagXML $ '/':tag) where ifXML c = if oElem showXML opts then c else [] tagXML :: String -> String tagXML s = "<" ++ s ++ ">" --- these handy functions are borrowed from EmbedAPI linearize mgr lang = untok . linTree2string noMark (canModules mgr) (zIdent lang) where sgr = stateGrammarOfLangOpt False mgr (zIdent lang) untok = customOrDefault (stateOptions sgr) useUntokenizer customUntokenizer sgr showTree t = prt_ $ tree2exp t