summaryrefslogtreecommitdiff
path: root/src/GF/UseGrammar/Treebank.hs
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2008-06-25 16:43:48 +0000
committeraarne <aarne@cs.chalmers.se>2008-06-25 16:43:48 +0000
commitb96b36f43de3e2f8b58d5f539daa6f6d47f25870 (patch)
tree0992334be13cec6538a1dea22fbbf26ad6bdf224 /src/GF/UseGrammar/Treebank.hs
parentfe367412e0aeb4ad5c02de68e6eca382e0f96984 (diff)
removed src for 2.9
Diffstat (limited to 'src/GF/UseGrammar/Treebank.hs')
-rw-r--r--src/GF/UseGrammar/Treebank.hs251
1 files changed, 0 insertions, 251 deletions
diff --git a/src/GF/UseGrammar/Treebank.hs b/src/GF/UseGrammar/Treebank.hs
deleted file mode 100644
index 841a9c6dc..000000000
--- a/src/GF/UseGrammar/Treebank.hs
+++ /dev/null
@@ -1,251 +0,0 @@
-----------------------------------------------------------------------
--- |
--- Module : Treebank
--- Maintainer : AR
--- Stability : (stable)
--- Portability : (portable)
---
--- Generate multilingual treebanks. AR 8\/2\/2006
---
--- (c) Aarne Ranta 2006 under GNU GPL
---
--- Purpose: to generate treebanks.
------------------------------------------------------------------------------
-
-module GF.UseGrammar.Treebank (
- mkMultiTreebank,
- mkUniTreebank,
- multi2uniTreebank,
- uni2multiTreebank,
- testMultiTreebank,
- treesTreebank,
- getTreebank,
- getUniTreebank,
- readUniTreebanks,
- readMultiTreebank,
- lookupTreebank,
- assocsTreebank,
- isWordInTreebank,
- printAssoc,
- mkCompactTreebank
- ) where
-
-import GF.Compile.ShellState
-import GF.UseGrammar.Linear -- (linTree2string)
-import GF.UseGrammar.Custom
-import GF.UseGrammar.GetTree (string2tree)
-import GF.Grammar.TypeCheck (annotate)
-import GF.Canon.CMacros (noMark)
-import GF.Grammar.Grammar (Trm)
-import GF.Grammar.MMacros (exp2tree)
-import GF.Grammar.Macros (zIdent)
-import GF.Grammar.PrGrammar (prt_,prt)
-import GF.Grammar.Values (tree2exp)
-import GF.Data.Operations
-import GF.Infra.Option
-import GF.Infra.Ident (Ident)
-import GF.Infra.UseIO
-import qualified GF.Grammar.Abstract as A
-import qualified Data.Map as M
-import qualified Data.Set as S
-import qualified Data.List as L
-import Control.Monad (liftM)
-import System.FilePath
-
--- Generate a treebank with a multilingual grammar. AR 8/2/2006
--- (c) Aarne Ranta 2006 under GNU GPL
-
--- keys are trees; format: XML file
-type MultiTreebank = [(String,[(String,String)])] -- tree,lang,lin
-
--- keys are strings; format: string TAB tree TAB ... TAB tree
-type UniTreebank = Treebank -- M.Map String [String] -- string,tree
-
--- both formats can be read from both kinds of files
-readUniTreebanks :: FilePath -> IO [(Ident,UniTreebank)]
-readUniTreebanks file = do
- s <- readFileIf file
- return $ if isMultiTreebank s
- then multi2uniTreebank $ getTreebank $ lines s
- else
- let tb = getUniTreebank $ lines s
- in [(zIdent (dropExtension file),tb)]
-
-readMultiTreebank :: FilePath -> IO MultiTreebank
-readMultiTreebank file = do
- s <- readFileIf file
- return $ if isMultiTreebank s
- then getTreebank $ lines s
- else uni2multiTreebank (zIdent (dropExtension file)) $ getUniTreebank $ lines s
-
-isMultiTreebank :: String -> Bool
-isMultiTreebank s = take 10 s == "<treebank>"
-
-multi2uniTreebank :: MultiTreebank -> [(Ident,UniTreebank)]
-multi2uniTreebank mt@((_,lls):_) = [(zIdent la, mkTb la) | (la,_) <- lls] where
- mkTb la = M.fromListWith (++) [(s,[t]) | (t,lls) <- mt, (l,s) <- lls, l==la]
-multi2uniTreebank [] = []
-
-uni2multiTreebank :: Ident -> UniTreebank -> MultiTreebank
-uni2multiTreebank la tb =
- [(t,[(prt_ la, s)]) | (s,ts) <- assocsTreebank tb, t <- ts]
-
--- | the main functions
-
--- builds a treebank where trees are the keys, and writes a file (opt. XML)
-mkMultiTreebank :: Options -> ShellState -> String -> [A.Tree] -> Res
-mkMultiTreebank opts sh com trees
- | oElem (iOpt "compact") opts = mkCompactTreebank opts sh trees
-mkMultiTreebank opts sh com trees =
- putInXML opts "treebank" comm (concatMap mkItem tris) where
- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t ++ concatMap (mkLin t) langs)
--- mkItem(t,i)= putInXML opts "item" (cat i) (mkTree t >>mapM_ (mkLin t) langs)
- mkTree t = putInXML opts "tree" [] (puts $ showTree t)
- mkLin t lg = putInXML opts "lin" (lang lg) (puts $ linearize opts sh lg t)
-
- langs = [prt_ l | l <- allLanguages sh]
- comm = "" --- " command=" ++ show com +++ "abstract=" ++ show abstr
- abstr = "" --- "Abs" ----
- cat i = " number=" ++ show (show i) --- " cat=" ++ show "S" ----
- lang lg = " lang=" ++ show (prt_ (zIdent lg))
- tris = zip trees [1..]
-
--- builds a unilingual treebank where strings are the keys into an internal treebank
-
-mkUniTreebank :: Options -> ShellState -> Language -> [A.Tree] -> Treebank
-mkUniTreebank opts sh lg trees = M.fromListWith (++) [(lin t, [prt_ t]) | t <- trees]
- where
- lang = prt_ lg
- lin t = linearize opts sh lang t
-
--- reads a treebank and linearizes its trees again, printing all differences
-testMultiTreebank :: Options -> ShellState -> String -> Res
-testMultiTreebank opts sh = putInXML opts "testtreebank" [] .
- concatMap testOne .
- getTreebanks . lines
- where
- testOne (e,lang,str0) = do
- let tr = annot gr e
- let str = linearize opts sh lang tr
- if str == str0 then ret else putInXML opts "diff" [] $ concat [
- putInXML opts "tree" [] (puts $ showTree tr),
- putInXML opts "old" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str0,
- putInXML opts "new" (" lang=" ++ show (prt_ (zIdent lang))) $ puts str
- ]
- gr = firstStateGrammar sh
-
--- writes all the trees of the treebank
-treesTreebank :: Options -> String -> [String]
-treesTreebank _ = terms . getTreebank . lines where
- terms ts = [t | (t,_) <- ts]
-
--- string vs. IO
-type Res = [String] -- IO ()
-puts :: String -> Res
-puts = return -- putStrLn
-ret = [] -- return ()
---
-
--- here strings are keys
-assocsTreebank :: UniTreebank -> [(String,[String])]
-assocsTreebank = M.assocs
-
-isWordInTreebank :: UniTreebank -> String -> Bool
-isWordInTreebank tb w = S.member w (S.fromList (concatMap words (M.keys tb)))
-
-printAssoc (s, ts) = s ++ concat ["\t" ++ t | t <- ts]
-
-getTreebanks :: [String] -> [(String,String,String)]
-getTreebanks = concatMap grps . getTreebank where
- grps (t,lls) = [(t,x,y) | (x,y) <- lls]
-
-getTreebank :: [String] -> MultiTreebank
-getTreebank ll = case ll of
- l:ls@(_:_:_) ->
- let (l1,l2) = getItem ls
- (tr,lins) = getTree l1
- lglins = getLins lins
- in (tr,lglins) : getTreebank l2
- _ -> []
- where
- getItem = span ((/="</item") . take 6)
-
- getTree (_:ss) =
- let (t1,t2) = span ((/="</tree") . take 6) ss in (last t1, drop 1 t2)
-
- getLins (beg:str:end:ss) = (getLang beg, str):getLins ss
- getLins _ = []
-
- getLang = takeWhile (/='"') . tail . dropWhile (/='"')
-
-getUniTreebank :: [String] -> UniTreebank
-getUniTreebank ls = M.fromListWith (++) [(s, ts) | s:ts <- map chop ls] where
- chop = chunks '\t'
-
-lookupTreebank :: Treebank -> String -> [String]
-lookupTreebank tb s = maybe [] id $ M.lookup s tb
-
-annot :: StateGrammar -> String -> A.Tree
-annot gr s = errVal (error "illegal tree") $ do
- let t = tree2exp $ string2tree gr s
- annotate (grammar gr) t
-
-putInXML :: Options -> String -> String -> Res -> Res
-putInXML opts tag attrs io =
- (ifXML $ puts $ tagXML $ tag ++ attrs) ++
- io ++
- (ifXML $ puts $ tagXML $ '/':tag)
- where
- ifXML c = if oElem showXML opts then c else []
-
-
-tagXML :: String -> String
-tagXML s = "<" ++ s ++ ">"
-
--- print the treebank in a compact format:
--- first a sorted list of all words, referrable by index
--- then the linearization of each tree, as sequences of word indices
--- this format is usable in embedded translation systems.
-
-mkCompactTreebank :: Options -> ShellState -> [A.Tree] -> [String]
-mkCompactTreebank opts sh = printCompactTreebank . mkJustMultiTreebank opts sh
-
-printCompactTreebank :: (MultiTreebank,[String]) -> [String]
-printCompactTreebank (tb,lgs) = (stat:langs:unwords ws : "\n" : linss) where
- ws = L.sort $ L.nub $ concat $ map (concatMap (words . snd) . snd) tb
-
- linss = map (unwords . pad) linss0
- linss0 = map (map (show . encode) . words) allExs
- allExs = concat [[snd (ls !! i) | (_,ls) <- tb] | i <- [0..length lgs - 1]]
- encode w = maybe undefined id $ M.lookup w wmap
- wmap = M.fromAscList $ zip ws [1..]
- stat = unwords $ map show [length ws, length lgs, length tb, smax]
- langs = unwords lgs
- smax = maximum $ map length linss0
- pad ws = ws ++ replicate (smax - length ws) "0"
-
--- [(String,[(String,String)])] -- tree,lang,lin
-mkJustMultiTreebank :: Options -> ShellState -> [A.Tree] -> (MultiTreebank,[String])
-mkJustMultiTreebank opts sh ts =
- ([(prt_ t, [(la, lin la t) | la <- langs]) | t <- ts],langs) where
- langs = map prt_ $ allLanguages sh
- lin = linearize opts sh
-
-
---- these handy functions are borrowed from EmbedAPI
-
-linearize opts mgr lang = lin where
- sgr = stateGrammarOfLangOpt False mgr zlang
- cgr = canModules mgr
- zlang = zIdent lang
- untok = customOrDefault (addOptions opts (stateOptions sgr)) useUntokenizer customUntokenizer sgr
- lin
- | oElem showRecord opts = err id id . liftM prt . linearizeNoMark cgr zlang
- | oElem tableLin opts =
- err id id . liftM (unlines . map untok . prLinTable True) . allLinTables True cgr zlang
- | oElem showAll opts =
- err id id . liftM (unlines . map untok . prLinTable False) . allLinTables False cgr zlang
-
- | otherwise = untok . linTree2string noMark cgr zlang
-
-showTree t = prt_ $ tree2exp t