diff options
Diffstat (limited to 'treebanks/talbanken/Format.hs')
| -rw-r--r-- | treebanks/talbanken/Format.hs | 106 |
1 files changed, 106 insertions, 0 deletions
diff --git a/treebanks/talbanken/Format.hs b/treebanks/talbanken/Format.hs new file mode 100644 index 000000000..dc22a016a --- /dev/null +++ b/treebanks/talbanken/Format.hs @@ -0,0 +1,106 @@ +{-# LANGUAGE TypeSynonymInstances, FlexibleInstances #-} +module Format where +import Prelude hiding (words,id) +import Text.XML.HXT.Core +import Data.Ord +import Data.List hiding (words) +import Data.Maybe +import qualified Data.Map as M +import System.Environment +import Debug.Trace as Debug +import Data.Char +import qualified Data.Tree as T +import Control.Monad.State + +-- Functions for parsing XML to a format read by the other Haskell files + +data Word = W {id :: Id, word :: String, pos :: Tag} +data PhrTag = Ph {idPh :: Id, cat :: Tag, tags :: [(Tag,Id)]} +data Sentence = Sent {rootS :: Id, words :: [Word], info :: [PhrTag], ws :: Int} +type Tag = String +type Id = String + + +instance XmlPickler [Word] where + xpickle = xpWords + +instance XmlPickler Sentence where + xpickle = xpSentence + +xpSentences :: PU [Sentence] +xpSentences = xpElem "corpus" + $ xpWrap (snd, \a -> ((),a)) + $ xpPair (xpElem "head" $ xpUnit) (xpElem "body" $ xpList $ xpSentence) +xpTags :: PU [PhrTag] +xpTags = xpList $ xpElem "nt" + $ xpWrap (uncurry3 Ph,\p -> (idPh p,cat p,tags p)) + $ xpTriple (xpAttr "id" xpText) (xpAttr "cat" xpText) + (xpList $ xpTagMap) + +xpTagMap :: PU (Tag,String) +xpTagMap = xpElem "edge" + $ xpPair (xpAttr "label" xpText) + (xpAttr "idref" xpText) + +xpSentence :: PU Sentence +xpSentence = xpElem "s" + $ xpWrap (makeSentence,\s -> (rootS s,words s, info s)) + $ xpElem "graph" + $ xpTriple (xpAttr "root" xpText) + ( xpElem "terminals" xpWords) + ( xpElem "nonterminals" xpTags) + where makeSentence (r,ws,tgs) = Sent r ws tgs (length ws) + + +xpWords :: PU [Word] +xpWords = xpList $ xpElem "t" + $ xpWrap (uncurry3 W,\t -> (id t, word t,pos t)) + $ xpTriple (xpAttr "id" xpText) + (xpAttr "word" xpText) + (xpAttr "pos" xpText) + +mainF src = + runX (xunpickleDocument xpSentences [withInputEncoding utf8 + , withRemoveWS yes] src + >>> arrIO (putStrLn . unlines . map (show . toTree))) + +runPickle f src = + runX (xunpickleDocument xpSentences [withInputEncoding utf8 + , withRemoveWS yes] src + >>> arrIO (return . map f)) + +parse = runPickle toStringTree +parseIdTree = runPickle toTree + +toStringTree :: Sentence -> (String,T.Tree String) +toStringTree = second (fmap snd) . toTree + +toTree :: Sentence -> (String,T.Tree (Id,String)) +toTree s@(Sent root ws inf _) = (root,toTree' root s) + +toTree' :: String -> Sentence -> T.Tree (Id,String) +toTree' nr s@(Sent root ws inf _) = + case (lookup' nr ws,lookup'' nr inf) of + (Just w,_) -> putWord w + (_,Just p) -> putPhrase p + _ -> error $ "Error in toTree' "++show nr++" could not be found" + where putWord (W i w p) = T.Node (i,p) [T.Node (i,w) []] + putPhrase (Ph i c t) = T.Node (i,c) + $ map (\(tag,next) -> T.Node (next,tag) [toTree' next s]) t + + lookup' y (w@(W x _ _):xs) | y ==x = Just w + | otherwise = lookup' y xs + lookup' y [] = Nothing + + lookup'' y (w@(Ph x _ _):xs) | y ==x = Just w + | otherwise = lookup'' y xs + lookup'' y [] = Nothing + + +treeToSentence :: [T.Tree String] -> String +treeToSentence ts = unwords $ map extractS ts + where extractS (T.Node ws []) = ws + extractS (T.Node c ts) = unwords $ map extractS ts + +showa :: T.Tree String -> String +showa (T.Node root ts) = "("++root++" "++concatMap showa ts++" )" |
