summaryrefslogtreecommitdiff
path: root/treebanks/talbanken/Format.hs
diff options
context:
space:
mode:
Diffstat (limited to 'treebanks/talbanken/Format.hs')
-rw-r--r--treebanks/talbanken/Format.hs106
1 files changed, 106 insertions, 0 deletions
diff --git a/treebanks/talbanken/Format.hs b/treebanks/talbanken/Format.hs
new file mode 100644
index 000000000..dc22a016a
--- /dev/null
+++ b/treebanks/talbanken/Format.hs
@@ -0,0 +1,106 @@
+{-# LANGUAGE TypeSynonymInstances, FlexibleInstances #-}
+module Format where
+import Prelude hiding (words,id)
+import Text.XML.HXT.Core
+import Data.Ord
+import Data.List hiding (words)
+import Data.Maybe
+import qualified Data.Map as M
+import System.Environment
+import Debug.Trace as Debug
+import Data.Char
+import qualified Data.Tree as T
+import Control.Monad.State
+
+-- Functions for parsing XML to a format read by the other Haskell files
+
+data Word = W {id :: Id, word :: String, pos :: Tag}
+data PhrTag = Ph {idPh :: Id, cat :: Tag, tags :: [(Tag,Id)]}
+data Sentence = Sent {rootS :: Id, words :: [Word], info :: [PhrTag], ws :: Int}
+type Tag = String
+type Id = String
+
+
+instance XmlPickler [Word] where
+ xpickle = xpWords
+
+instance XmlPickler Sentence where
+ xpickle = xpSentence
+
+xpSentences :: PU [Sentence]
+xpSentences = xpElem "corpus"
+ $ xpWrap (snd, \a -> ((),a))
+ $ xpPair (xpElem "head" $ xpUnit) (xpElem "body" $ xpList $ xpSentence)
+xpTags :: PU [PhrTag]
+xpTags = xpList $ xpElem "nt"
+ $ xpWrap (uncurry3 Ph,\p -> (idPh p,cat p,tags p))
+ $ xpTriple (xpAttr "id" xpText) (xpAttr "cat" xpText)
+ (xpList $ xpTagMap)
+
+xpTagMap :: PU (Tag,String)
+xpTagMap = xpElem "edge"
+ $ xpPair (xpAttr "label" xpText)
+ (xpAttr "idref" xpText)
+
+xpSentence :: PU Sentence
+xpSentence = xpElem "s"
+ $ xpWrap (makeSentence,\s -> (rootS s,words s, info s))
+ $ xpElem "graph"
+ $ xpTriple (xpAttr "root" xpText)
+ ( xpElem "terminals" xpWords)
+ ( xpElem "nonterminals" xpTags)
+ where makeSentence (r,ws,tgs) = Sent r ws tgs (length ws)
+
+
+xpWords :: PU [Word]
+xpWords = xpList $ xpElem "t"
+ $ xpWrap (uncurry3 W,\t -> (id t, word t,pos t))
+ $ xpTriple (xpAttr "id" xpText)
+ (xpAttr "word" xpText)
+ (xpAttr "pos" xpText)
+
+mainF src =
+ runX (xunpickleDocument xpSentences [withInputEncoding utf8
+ , withRemoveWS yes] src
+ >>> arrIO (putStrLn . unlines . map (show . toTree)))
+
+runPickle f src =
+ runX (xunpickleDocument xpSentences [withInputEncoding utf8
+ , withRemoveWS yes] src
+ >>> arrIO (return . map f))
+
+parse = runPickle toStringTree
+parseIdTree = runPickle toTree
+
+toStringTree :: Sentence -> (String,T.Tree String)
+toStringTree = second (fmap snd) . toTree
+
+toTree :: Sentence -> (String,T.Tree (Id,String))
+toTree s@(Sent root ws inf _) = (root,toTree' root s)
+
+toTree' :: String -> Sentence -> T.Tree (Id,String)
+toTree' nr s@(Sent root ws inf _) =
+ case (lookup' nr ws,lookup'' nr inf) of
+ (Just w,_) -> putWord w
+ (_,Just p) -> putPhrase p
+ _ -> error $ "Error in toTree' "++show nr++" could not be found"
+ where putWord (W i w p) = T.Node (i,p) [T.Node (i,w) []]
+ putPhrase (Ph i c t) = T.Node (i,c)
+ $ map (\(tag,next) -> T.Node (next,tag) [toTree' next s]) t
+
+ lookup' y (w@(W x _ _):xs) | y ==x = Just w
+ | otherwise = lookup' y xs
+ lookup' y [] = Nothing
+
+ lookup'' y (w@(Ph x _ _):xs) | y ==x = Just w
+ | otherwise = lookup'' y xs
+ lookup'' y [] = Nothing
+
+
+treeToSentence :: [T.Tree String] -> String
+treeToSentence ts = unwords $ map extractS ts
+ where extractS (T.Node ws []) = ws
+ extractS (T.Node c ts) = unwords $ map extractS ts
+
+showa :: T.Tree String -> String
+showa (T.Node root ts) = "("++root++" "++concatMap showa ts++" )"