summaryrefslogtreecommitdiff
path: root/src-3.0/GF/UseGrammar/Treebank.hs
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
committeraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
commit055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch)
tree0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/UseGrammar/Treebank.hs
parent915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff)
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
Diffstat (limited to 'src-3.0/GF/UseGrammar/Treebank.hs')
-rw-r--r--src-3.0/GF/UseGrammar/Treebank.hs251
1 files changed, 251 insertions, 0 deletions
diff --git a/src-3.0/GF/UseGrammar/Treebank.hs b/src-3.0/GF/UseGrammar/Treebank.hs
new file mode 100644
index 000000000..841a9c6dc
--- /dev/null
+++ b/src-3.0/GF/UseGrammar/Treebank.hs
@@ -0,0 +1,251 @@
+----------------------------------------------------------------------
+-- |
+-- Module : Treebank
+-- Maintainer : AR
+-- Stability : (stable)
+-- Portability : (portable)
+--
+-- Generate multilingual treebanks. AR 8\/2\/2006
+--
+-- (c) Aarne Ranta 2006 under GNU GPL
+--
+-- Purpose: to generate treebanks.
+-----------------------------------------------------------------------------
+
+module GF.UseGrammar.Treebank (
+ mkMultiTreebank,
+ mkUniTreebank,
+ multi2uniTreebank,
+ uni2multiTreebank,
+ testMultiTreebank,
+ treesTreebank,
+ getTreebank,
+ getUniTreebank,
+ readUniTreebanks,
+ readMultiTreebank,
+ lookupTreebank,
+ assocsTreebank,
+ isWordInTreebank,
+ printAssoc,
+ mkCompactTreebank
+ ) where
+
+import GF.Compile.ShellState
+import GF.UseGrammar.Linear -- (linTree2string)
+import GF.UseGrammar.Custom
+import GF.UseGrammar.GetTree (string2tree)
+import GF.Grammar.TypeCheck (annotate)
+import GF.Canon.CMacros (noMark)
+import GF.Grammar.Grammar (Trm)
+import GF.Grammar.MMacros (exp2tree)
+import GF.Grammar.Macros (zIdent)
+import GF.Grammar.PrGrammar (prt_,prt)
+import GF.Grammar.Values (tree2exp)
+import GF.Data.Operations
+import GF.Infra.Option
+import GF.Infra.Ident (Ident)
+import GF.Infra.UseIO
+import qualified GF.Grammar.Abstract as A
+import qualified Data.Map as M
+import qualified Data.Set as S
+import qualified Data.List as L
+import Control.Monad (liftM)
+import System.FilePath
+
+-- Generate a treebank with a multilingual grammar. AR 8/2/2006
+-- (c) Aarne Ranta 2006 under GNU GPL
+
+-- keys are trees; format: XML file
+type MultiTreebank = [(String,[(String,String)])] -- tree,lang,lin
+
+-- keys are strings; format: string TAB tree TAB ... TAB tree
+type UniTreebank = Treebank -- M.Map String [String] -- string,tree
+
+-- both formats can be read from both kinds of files
+readUniTreebanks :: FilePath -> IO [(Ident,UniTreebank)]
+readUniTreebanks file = do
+ s <- readFileIf file
+ return $ if isMultiTreebank s
+ then multi2uniTreebank $ getTreebank $ lines s
+ else
+ let tb = getUniTreebank $ lines s
+ in [(zIdent (dropExtension file),tb)]
+
+readMultiTreebank :: FilePath -> IO MultiTreebank
+readMultiTreebank file = do
+ s <- readFileIf file
+ return $ if isMultiTreebank s
+ then getTreebank $ lines s
+ else uni2multiTreebank (zIdent (dropExtension file)) $ getUniTreebank $ lines s
+
+isMultiTreebank :: String -> Bool
+isMultiTreebank s = take 10 s == "<treebank>"
+
+multi2uniTreebank :: MultiTreebank -> [(Ident,UniTreebank)]
+multi2uniTreebank mt@((_,lls):_) = [(zIdent la, mkTb la) | (la,_) <- lls] where
+ mkTb la = M.fromListWith (++) [(s,[t]) | (t,lls) <- mt, (l,s) <- lls, l==la]
+multi2uniTreebank [] = []
+
+uni2multiTreebank :: Ident -> UniTreebank -> MultiTreebank
+uni2multiTreebank la tb =
+ [(t,[(prt_ la, s)]) | (s,ts) <- assocsTreebank tb, t <- ts]
+
+-- | the main functions
+
+-- builds a treebank where trees are the keys, and writes a file (opt. XML)
+mkMultiTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res
+mkMultiTreebank opts sh com trees
+ | oElem (iOpt "compact") opts = mkCompactTreebank opts sh trees
+mkMultiTreebank opts sh com trees =
+ putInXML opts "treebank" comm (concatMap mkItem tris) where
+ mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t ++ concatMap (mkLin t) langs)
+-- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs)
+ mkTree t = putInXML opts "tree" [] (puts $ showTree t)
+ mkLin t lg = putInXML opts "lin" (lang lg) (puts $ linearize opts sh lg t)
+
+ langs = [prt_ l | l <- allLanguages sh]
+ comm = "" --- " command=" ++ show com +++ "abstract=" ++ show abstr
+ abstr = "" --- "Abs" ----
+ cat i = " number=" ++ show (show i) --- " cat=" ++ show "S" ----
+ lang lg = " lang=" ++ show (prt_ (zIdent lg))
+ tris = zip trees [1..]
+
+-- builds a unilingual treebank where strings are the keys into an internal treebank
+
+mkUniTreebank :: Options -> ShellState -> Language -> [A.Tree] -> Treebank
+mkUniTreebank opts sh lg trees = M.fromListWith (++) [(lin t, [prt_ t]) | t <- trees]
+ where
+ lang = prt_ lg
+ lin t = linearize opts sh lang t
+
+-- reads a treebank and linearizes its trees again, printing all differences
+testMultiTreebank :: Options -> ShellState -> String -> Res
+testMultiTreebank opts sh = putInXML opts "testtreebank" [] .
+ concatMap testOne .
+ getTreebanks . lines
+ where
+ testOne (e,lang,str0) = do
+ let tr = annot gr e
+ let str = linearize opts sh lang tr
+ if str == str0 then ret else putInXML opts "diff" [] $ concat [
+ putInXML opts "tree" [] (puts $ showTree tr),
+ putInXML opts "old" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str0,
+ putInXML opts "new" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str
+ ]
+ gr = firstStateGrammar sh
+
+-- writes all the trees of the treebank
+treesTreebank :: Options -> String -> [String]
+treesTreebank _ = terms . getTreebank . lines where
+ terms ts = [t | (t,_) <- ts]
+
+-- string vs. IO
+type Res = [String] -- IO ()
+puts :: String -> Res
+puts = return -- putStrLn
+ret = [] -- return ()
+--
+
+-- here strings are keys
+assocsTreebank :: UniTreebank -> [(String,[String])]
+assocsTreebank = M.assocs
+
+isWordInTreebank :: UniTreebank -> String -> Bool
+isWordInTreebank tb w = S.member w (S.fromList (concatMap words (M.keys tb)))
+
+printAssoc (s, ts) = s ++ concat ["\t" ++ t | t <- ts]
+
+getTreebanks :: [String] -> [(String,String,String)]
+getTreebanks = concatMap grps . getTreebank where
+ grps (t,lls) = [(t,x,y) | (x,y) <- lls]
+
+getTreebank :: [String] -> MultiTreebank
+getTreebank ll = case ll of
+ l:ls@(_:_:_) ->
+ let (l1,l2) = getItem ls
+ (tr,lins) = getTree l1
+ lglins = getLins lins
+ in (tr,lglins) : getTreebank l2
+ _ -> []
+ where
+ getItem = span ((/="</item") . take 6)
+
+ getTree (_:ss) =
+ let (t1,t2) = span ((/="</tree") . take 6) ss in (last t1, drop 1 t2)
+
+ getLins (beg:str:end:ss) = (getLang beg, str):getLins ss
+ getLins _ = []
+
+ getLang = takeWhile (/='"') . tail . dropWhile (/='"')
+
+getUniTreebank :: [String] -> UniTreebank
+getUniTreebank ls = M.fromListWith (++) [(s, ts) | s:ts <- map chop ls] where
+ chop = chunks '\t'
+
+lookupTreebank :: Treebank -> String -> [String]
+lookupTreebank tb s = maybe [] id $ M.lookup s tb
+
+annot :: StateGrammar -> String -> A.Tree
+annot gr s = errVal (error "illegal tree") $ do
+ let t = tree2exp $ string2tree gr s
+ annotate (grammar gr) t
+
+putInXML :: Options -> String -> String -> Res -> Res
+putInXML opts tag attrs io =
+ (ifXML $ puts $ tagXML $ tag ++ attrs) ++
+ io ++
+ (ifXML $ puts $ tagXML $ '/':tag)
+ where
+ ifXML c = if oElem showXML opts then c else []
+
+
+tagXML :: String -> String
+tagXML s = "<" ++ s ++ ">"
+
+-- print the treebank in a compact format:
+-- first a sorted list of all words, referrable by index
+-- then the linearization of each tree, as sequences of word indices
+-- this format is usable in embedded translation systems.
+
+mkCompactTreebank :: Options -> ShellState -> [A.Tree] -> [String]
+mkCompactTreebank opts sh = printCompactTreebank . mkJustMultiTreebank opts sh
+
+printCompactTreebank :: (MultiTreebank,[String]) -> [String]
+printCompactTreebank (tb,lgs) = (stat:langs:unwords ws : "\n" : linss) where
+ ws = L.sort $ L.nub $ concat $ map (concatMap (words . snd) . snd) tb
+
+ linss = map (unwords . pad) linss0
+ linss0 = map (map (show . encode) . words) allExs
+ allExs = concat [[snd (ls !! i) | (_,ls) <- tb] | i <- [0..length lgs - 1]]
+ encode w = maybe undefined id $ M.lookup w wmap
+ wmap = M.fromAscList $ zip ws [1..]
+ stat = unwords $ map show [length ws, length lgs, length tb, smax]
+ langs = unwords lgs
+ smax = maximum $ map length linss0
+ pad ws = ws ++ replicate (smax - length ws) "0"
+
+-- [(String,[(String,String)])] -- tree,lang,lin
+mkJustMultiTreebank :: Options -> ShellState -> [A.Tree] -> (MultiTreebank,[String])
+mkJustMultiTreebank opts sh ts =
+ ([(prt_ t, [(la, lin la t) | la <- langs]) | t <- ts],langs) where
+ langs = map prt_ $ allLanguages sh
+ lin = linearize opts sh
+
+
+--- these handy functions are borrowed from EmbedAPI
+
+linearize opts mgr lang = lin where
+ sgr = stateGrammarOfLangOpt False mgr zlang
+ cgr = canModules mgr
+ zlang = zIdent lang
+ untok = customOrDefault (addOptions opts (stateOptions sgr)) useUntokenizer customUntokenizer sgr
+ lin
+ | oElem showRecord opts = err id id . liftM prt . linearizeNoMark cgr zlang
+ | oElem tableLin opts =
+ err id id . liftM (unlines . map untok . prLinTable True) . allLinTables True cgr zlang
+ | oElem showAll opts =
+ err id id . liftM (unlines . map untok . prLinTable False) . allLinTables False cgr zlang
+
+ | otherwise = untok . linTree2string noMark cgr zlang
+
+showTree t = prt_ $ tree2exp t