summaryrefslogtreecommitdiff
path: root/treebanks/susanne
diff options
context:
space:
mode:
Diffstat (limited to 'treebanks/susanne')
-rw-r--r--treebanks/susanne/SusanneFormat.hs43
-rw-r--r--treebanks/susanne/convert.hs14
2 files changed, 57 insertions, 0 deletions
diff --git a/treebanks/susanne/SusanneFormat.hs b/treebanks/susanne/SusanneFormat.hs
new file mode 100644
index 000000000..052f95978
--- /dev/null
+++ b/treebanks/susanne/SusanneFormat.hs
@@ -0,0 +1,43 @@
+module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank) where
+
+import Data.Char
+
+type Tag = String
+type Id = String
+type Word = String
+type Lemma = String
+
+data ParseTree
+ = Phrase Tag [ParseTree]
+ | Word Id Tag Word Lemma
+
+data ParseTreePos
+ = Root
+ | At ParseTreePos Tag [ParseTree]
+
+instance Show ParseTree where
+ show (Phrase tag ts) = "["++tag++" "++unwords (map show ts)++"]"
+ show (Word _ tag w _) = "["++tag++" "++w++"]"
+
+readTreebank ls = readLines Root (map words ls)
+
+readLines p [] = []
+readLines p ([id,_,tag,w,l,parse]:ls) =
+ readParse p (Word id tag w l) parse ls
+
+readParse p w [] ls = readLines p ls
+readParse p w ('[':cs) ls =
+ case break (not . isTagChar) cs of
+ (tag,cs) -> readParse (At p tag []) w cs ls
+readParse (At p tag ts) w ('.':cs) ls =
+ readParse (At p tag (w:ts)) w cs ls
+readParse (At p tag ts) w cs ls =
+ case break (not . isTagChar) cs of
+ (tag,']':cs) -> let t = Phrase tag (reverse ts)
+ in case p of
+ Root -> t : readLines p ls
+ At p tag ts -> readParse (At p tag (t:ts)) w cs ls
+ _ -> error cs
+
+isTagChar c =
+ isLetter c || isDigit c || elem c ":&+-%@=?\"*!"
diff --git a/treebanks/susanne/convert.hs b/treebanks/susanne/convert.hs
new file mode 100644
index 000000000..91fdc2cf4
--- /dev/null
+++ b/treebanks/susanne/convert.hs
@@ -0,0 +1,14 @@
+import System.Directory
+import System.FilePath
+import Data.List
+
+import SusanneFormat
+
+main = do
+ fs <- getDirectoryContents "data"
+ txts <- (mapM (\f -> readFile ("data" </> f)) . filter ((/= ".") . take 1)) (sort fs)
+ let ts = filter (not . isBreak) (readTreebank (lines (concat txts)))
+ writeFile "text" (unlines (map show ts))
+
+isBreak (Phrase "Oh" [Word _ "YB" "<minbrk>" _]) = True
+isBreak _ = False