summaryrefslogtreecommitdiff
path: root/treebanks/susanne/SusanneFormat.hs
blob: 04c9fbbc84ea88b3208107a40e2151faadbced14 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank,readTag) where

import Data.Char
import qualified Data.Map as Map

type Tag   = String
type Mods  = String
type Fn    = String
type Index = Int
type Id    = String
type Word  = String
type Lemma = String

data ParseTree
 = Phrase Tag Mods Fn Index [ParseTree]
 | Word   Id Tag Word Lemma
 | App String [ParseTree]
 | Lit String
 deriving Eq

data ParseTreePos
 = Root
 | At ParseTreePos ([ParseTree] -> ParseTree) [ParseTree]

instance Show ParseTree where
  show (Phrase tag mods fn idx ts)
    | tag == ""            = "["++fn++show idx++" "++unwords (map show ts)++"]"
    | fn == "" && idx == 0 = "["++tag++mods++" "++unwords (map show ts)++"]"
    | otherwise            = "["++tag++mods++":"++fn++show idx++" "++unwords (map show ts)++"]"
  show (Word _ tag w _)    = "["++tag++" "++w++"]"
  show (App f ts)           
    | null ts              = f
    | otherwise            = "("++f++" "++unwords (map show ts)++")"
  show (Lit s)             = show s

readTreebank ls = readLines Root (map words ls)

readLines p []                        = []
readLines p ([id,_,tag,w,l,parse]:ls) =
  readParse (Word id tag (readWord w) l) p parse ls

readParse w p []       ls = readLines p ls
readParse w p ('[':cs) ls =
  case readTag w cs of
    (fn,cs) -> readParse w (At p fn []) cs ls
readParse w (At p fn ts) ('.':cs) ls =
  readParse w (At p fn (w:ts)) cs ls
readParse w (At p fn ts) cs ls =
  case readTag w cs of
    (_,']':cs) -> let t = fn (reverse ts)
                  in case p of
                       Root       -> t : readLines p ls
                       At p fn ts -> readParse w (At p fn (t:ts)) cs ls
    _          -> readError w

readTag w cs@(c1:c2:_)             -- word tag on phrase level
  | isUpper c1 && isUpper c2 =
      case break (\c -> not (isLetter c || isDigit c)) cs of
        (tag,cs) -> case break (\c -> not (elem c "?*%!\"=+-&@")) cs of
                      (mods,cs) -> case cs of
                                    (':':c:cs) | isLetter c -> case break (not . isDigit) cs of
                                                                 (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
                                               | isDigit  c -> case break (not . isDigit) (c:cs) of
                                                                 (ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
                                    _                       -> (Phrase tag mods "" 0,cs)
readTag w (c:cs)                -- phrase tag
  | isUpper c = let tag = [c]
                in case break (\c -> not (isLetter c || isDigit c || elem c "?*%!\"=+-&@")) cs of
                     (mods,cs) -> case cs of
                                    (':':c:cs) | isLetter c -> case break (not . isDigit) cs of
                                                                 (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
                                               | isDigit  c -> case break (not . isDigit) (c:cs) of
                                                                 (ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
                                    _                       -> (Phrase tag mods "" 0,cs)
  | isLower c = let tag  = []
                    mods = []
                in case break (not . isDigit) cs of
                     (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
  | isDigit c = let tag  = []
                    mods = []
                in case break (not . isDigit) cs of
                     (ds,cs) -> (Phrase tag mods [] (read ds),cs)
readTag w cs = readError w

readError (Word id _ _ _) = error id

readWord w0 = replaceEntities w2
  where
    w1 | head w0 == '+' = tail w0
       | otherwise      = w0
    w2 | last w1 == '+' = init w1
       | otherwise      = w1

    replaceEntities []       = []
    replaceEntities ('<':cs) =
      let (e,'>':cs1) = break (=='>') cs
      in case Map.lookup e entity_names of
           Just c  -> c : replaceEntities cs1
           Nothing -> "<"++e++">"++ replaceEntities cs1
    replaceEntities (c:  cs) = c : replaceEntities cs

entity_names = Map.fromList
  [("agr",'α')
  ,("agrave",'à')
  ,("apos",'\'')
  ,("auml",'ä')
  ,("bgr",'β')
  ,("blank",' ')
  ,("ccedil",'ç')
  ,("deg",'°')
  ,("dollar",'$')
  ,("eacute",'é')
  ,("egr",'ε')
  ,("egrave",'è')
  ,("frac12",'½')
  ,("frac14",'¼')
  ,("ggr",'γ')
  ,("hellip",'…')
  ,("hyphen",'-')
  ,("iuml",'ï')
  ,("khgr",'χ')
  ,("ldquo",'“')
  ,("lgr",'λ')
  ,("lsquo",'‘')
  ,("mdash",'—')
  ,("mgr",'μ')
  ,("minus",'-')
  ,("ntilde",'ñ')
  ,("oelig",'œ')
  ,("ouml",'ö')
  ,("para",'¶')
  ,("pgr",'π')
  ,("phgr",'φ')
  ,("prime",'′')
  ,("Prime",'″')
  ,("rdquo",'”')
  ,("rgr",'ρ')
  ,("rsquo",'’')
  ,("sect",'§')
  ,("sol",'/')
  ,("tggr",'θ')
  ]