summaryrefslogtreecommitdiff
path: root/treebanks/talbanken/Format.hs
blob: dc22a016a86b9bcd6c4104107654e0e09188753e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
{-# LANGUAGE TypeSynonymInstances, FlexibleInstances #-}
module Format where
import Prelude hiding (words,id)
import Text.XML.HXT.Core 
import Data.Ord
import Data.List hiding (words)
import Data.Maybe
import qualified Data.Map as M
import System.Environment
import Debug.Trace as Debug
import Data.Char
import qualified Data.Tree as T
import Control.Monad.State
 
-- Functions for parsing XML to a format read by the other Haskell files

data Word     = W  {id :: Id, word :: String, pos :: Tag}
data PhrTag   = Ph {idPh :: Id, cat :: Tag, tags :: [(Tag,Id)]}
data Sentence = Sent {rootS :: Id, words :: [Word], info :: [PhrTag], ws :: Int}
type Tag      = String
type Id       = String


instance XmlPickler [Word] where
  xpickle = xpWords

instance XmlPickler Sentence where
  xpickle = xpSentence

xpSentences :: PU [Sentence] 
xpSentences = xpElem "corpus" 
              $ xpWrap (snd, \a -> ((),a))
              $ xpPair (xpElem "head" $ xpUnit) (xpElem "body" $ xpList $ xpSentence)
xpTags :: PU [PhrTag]
xpTags = xpList $ xpElem "nt"
         $ xpWrap (uncurry3 Ph,\p -> (idPh p,cat p,tags p))
         $ xpTriple (xpAttr "id" xpText) (xpAttr "cat" xpText)
                     (xpList $ xpTagMap)

xpTagMap :: PU (Tag,String)
xpTagMap = xpElem "edge"
           $ xpPair (xpAttr "label" xpText)
                    (xpAttr "idref" xpText)

xpSentence :: PU Sentence  
xpSentence = xpElem "s"
             $ xpWrap (makeSentence,\s -> (rootS s,words s, info s))
             $ xpElem "graph"  
             $  xpTriple (xpAttr "root" xpText)
                         ( xpElem "terminals" xpWords)
                         ( xpElem "nonterminals" xpTags)
  where makeSentence (r,ws,tgs) = Sent r ws tgs (length ws)


xpWords :: PU [Word]
xpWords = xpList $ xpElem "t"  
          $ xpWrap (uncurry3 W,\t -> (id t, word t,pos t)) 
          $ xpTriple (xpAttr "id" xpText)
                     (xpAttr "word" xpText)
                     (xpAttr "pos" xpText)
                     
mainF src =   
  runX (xunpickleDocument xpSentences [withInputEncoding utf8
                                     , withRemoveWS yes] src
        >>> arrIO (putStrLn . unlines . map (show . toTree))) 

runPickle f src = 
  runX (xunpickleDocument xpSentences [withInputEncoding utf8
                                     , withRemoveWS yes] src
        >>> arrIO (return . map f)) 

parse = runPickle toStringTree
parseIdTree = runPickle toTree

toStringTree :: Sentence -> (String,T.Tree String)
toStringTree = second (fmap snd) . toTree

toTree :: Sentence -> (String,T.Tree (Id,String))
toTree s@(Sent root ws inf _) = (root,toTree' root s)

toTree' :: String -> Sentence -> T.Tree (Id,String)
toTree' nr s@(Sent root ws inf _) = 
     case (lookup' nr ws,lookup'' nr inf) of
       (Just w,_) -> putWord w
       (_,Just p) -> putPhrase p
       _          -> error $ "Error in toTree' "++show nr++" could not be found"
  where putWord (W i w p)    = T.Node (i,p) [T.Node (i,w) []]
        putPhrase (Ph i c t) = T.Node (i,c) 
                                $ map (\(tag,next) -> T.Node (next,tag)  [toTree' next s]) t

        lookup' y (w@(W x _ _):xs) | y ==x     = Just w
                                   | otherwise = lookup' y xs
        lookup' y [] = Nothing

        lookup'' y (w@(Ph x _ _):xs) | y ==x     = Just w
                                     | otherwise = lookup'' y xs
        lookup'' y [] = Nothing
 

treeToSentence :: [T.Tree String] -> String
treeToSentence ts = unwords $ map extractS ts
  where extractS (T.Node ws []) = ws
        extractS (T.Node c  ts) = unwords $ map extractS ts

showa :: T.Tree String -> String
showa (T.Node root ts) = "("++root++" "++concatMap showa ts++" )"