1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
|
module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank,readTag) where
import Data.Char
import qualified Data.Map as Map
type Tag = String
type Mods = String
type Fn = String
type Index = Int
type Id = String
type Word = String
type Lemma = String
data ParseTree
= Phrase Tag Mods Fn Index [ParseTree]
| Word Id Tag Word Lemma
| App String [ParseTree]
| Lit String
deriving Eq
data ParseTreePos
= Root
| At ParseTreePos ([ParseTree] -> ParseTree) [ParseTree]
instance Show ParseTree where
show (Phrase tag mods fn idx ts)
| tag == "" = "["++fn++show idx++" "++unwords (map show ts)++"]"
| fn == "" && idx == 0 = "["++tag++mods++" "++unwords (map show ts)++"]"
| otherwise = "["++tag++mods++":"++fn++show idx++" "++unwords (map show ts)++"]"
show (Word _ tag w _) = "["++tag++" "++w++"]"
show (App f ts)
| null ts = f
| otherwise = "("++f++" "++unwords (map show ts)++")"
show (Lit s) = show s
readTreebank ls = readLines Root (map words ls)
readLines p [] = []
readLines p ([id,_,tag,w,l,parse]:ls) =
readParse (Word id tag (readWord w) l) p parse ls
readParse w p [] ls = readLines p ls
readParse w p ('[':cs) ls =
case readTag w cs of
(fn,cs) -> readParse w (At p fn []) cs ls
readParse w (At p fn ts) ('.':cs) ls =
readParse w (At p fn (w:ts)) cs ls
readParse w (At p fn ts) cs ls =
case readTag w cs of
(_,']':cs) -> let t = fn (reverse ts)
in case p of
Root -> t : readLines p ls
At p fn ts -> readParse w (At p fn (t:ts)) cs ls
_ -> readError w
readTag w cs@(c1:c2:_) -- word tag on phrase level
| isUpper c1 && isUpper c2 =
case break (\c -> not (isLetter c || isDigit c)) cs of
(tag,cs) -> case break (\c -> not (elem c "?*%!\"=+-&@")) cs of
(mods,cs) -> case cs of
(':':c:cs) | isLetter c -> case break (not . isDigit) cs of
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
| isDigit c -> case break (not . isDigit) (c:cs) of
(ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
_ -> (Phrase tag mods "" 0,cs)
readTag w (c:cs) -- phrase tag
| isUpper c = let tag = [c]
in case break (\c -> not (isLetter c || isDigit c || elem c "?*%!\"=+-&@")) cs of
(mods,cs) -> case cs of
(':':c:cs) | isLetter c -> case break (not . isDigit) cs of
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
| isDigit c -> case break (not . isDigit) (c:cs) of
(ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
_ -> (Phrase tag mods "" 0,cs)
| isLower c = let tag = []
mods = []
in case break (not . isDigit) cs of
(ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
| isDigit c = let tag = []
mods = []
in case break (not . isDigit) cs of
(ds,cs) -> (Phrase tag mods [] (read ds),cs)
readTag w cs = readError w
readError (Word id _ _ _) = error id
readWord w0 = replaceEntities w2
where
w1 | head w0 == '+' = tail w0
| otherwise = w0
w2 | last w1 == '+' = init w1
| otherwise = w1
replaceEntities [] = []
replaceEntities ('<':cs) =
let (e,'>':cs1) = break (=='>') cs
in case Map.lookup e entity_names of
Just c -> c : replaceEntities cs1
Nothing -> "<"++e++">"++ replaceEntities cs1
replaceEntities (c: cs) = c : replaceEntities cs
entity_names = Map.fromList
[("agr",'α')
,("agrave",'à')
,("apos",'\'')
,("auml",'ä')
,("bgr",'β')
,("blank",' ')
,("ccedil",'ç')
,("deg",'°')
,("dollar",'$')
,("eacute",'é')
,("egr",'ε')
,("egrave",'è')
,("frac12",'½')
,("frac14",'¼')
,("ggr",'γ')
,("hellip",'…')
,("hyphen",'-')
,("iuml",'ï')
,("khgr",'χ')
,("ldquo",'“')
,("lgr",'λ')
,("lsquo",'‘')
,("mdash",'—')
,("mgr",'μ')
,("minus",'-')
,("ntilde",'ñ')
,("oelig",'œ')
,("ouml",'ö')
,("para",'¶')
,("pgr",'π')
,("phgr",'φ')
,("prime",'′')
,("Prime",'″')
,("rdquo",'”')
,("rgr",'ρ')
,("rsquo",'’')
,("sect",'§')
,("sol",'/')
,("tggr",'θ')
]
|