diff options
| author | kr.angelov <kr.angelov@gmail.com> | 2013-12-04 12:11:41 +0000 |
|---|---|---|
| committer | kr.angelov <kr.angelov@gmail.com> | 2013-12-04 12:11:41 +0000 |
| commit | 106b41a2cb562b26a477af5372ddf09fe8f79ca7 (patch) | |
| tree | 0604f9154d8768bd648a82b22cca4bedc70573c6 /treebanks | |
| parent | f5cda1e6a385ec6fe847db1ab77b67535ae41e7d (diff) | |
added script for reading the Susanne treebank
Diffstat (limited to 'treebanks')
| -rw-r--r-- | treebanks/susanne/SusanneFormat.hs | 43 | ||||
| -rw-r--r-- | treebanks/susanne/convert.hs | 14 |
2 files changed, 57 insertions, 0 deletions
diff --git a/treebanks/susanne/SusanneFormat.hs b/treebanks/susanne/SusanneFormat.hs new file mode 100644 index 000000000..052f95978 --- /dev/null +++ b/treebanks/susanne/SusanneFormat.hs @@ -0,0 +1,43 @@ +module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank) where + +import Data.Char + +type Tag = String +type Id = String +type Word = String +type Lemma = String + +data ParseTree + = Phrase Tag [ParseTree] + | Word Id Tag Word Lemma + +data ParseTreePos + = Root + | At ParseTreePos Tag [ParseTree] + +instance Show ParseTree where + show (Phrase tag ts) = "["++tag++" "++unwords (map show ts)++"]" + show (Word _ tag w _) = "["++tag++" "++w++"]" + +readTreebank ls = readLines Root (map words ls) + +readLines p [] = [] +readLines p ([id,_,tag,w,l,parse]:ls) = + readParse p (Word id tag w l) parse ls + +readParse p w [] ls = readLines p ls +readParse p w ('[':cs) ls = + case break (not . isTagChar) cs of + (tag,cs) -> readParse (At p tag []) w cs ls +readParse (At p tag ts) w ('.':cs) ls = + readParse (At p tag (w:ts)) w cs ls +readParse (At p tag ts) w cs ls = + case break (not . isTagChar) cs of + (tag,']':cs) -> let t = Phrase tag (reverse ts) + in case p of + Root -> t : readLines p ls + At p tag ts -> readParse (At p tag (t:ts)) w cs ls + _ -> error cs + +isTagChar c = + isLetter c || isDigit c || elem c ":&+-%@=?\"*!" diff --git a/treebanks/susanne/convert.hs b/treebanks/susanne/convert.hs new file mode 100644 index 000000000..91fdc2cf4 --- /dev/null +++ b/treebanks/susanne/convert.hs @@ -0,0 +1,14 @@ +import System.Directory +import System.FilePath +import Data.List + +import SusanneFormat + +main = do + fs <- getDirectoryContents "data" + txts <- (mapM (\f -> readFile ("data" </> f)) . filter ((/= ".") . take 1)) (sort fs) + let ts = filter (not . isBreak) (readTreebank (lines (concat txts))) + writeFile "text" (unlines (map show ts)) + +isBreak (Phrase "Oh" [Word _ "YB" "<minbrk>" _]) = True +isBreak _ = False |
