-- Origin: commit 796dd530eedee8a7ccd605aa956a087c77719ab6
-- Author: kr.angelov
-- Date:   Tue, 10 Jan 2012 19:36:28 +0000
-- Subject: the translation script from the Penn format to GF RGL is now in
--          examples/PennTreebank
-- File:    examples/PennTreebank/PennFormat.hs (new file, 38 lines added)

-- | Reading and printing of bracketed (Penn-format) syntax trees.
--
-- Trees are represented as 'Tree' 'String': an inner node carries its
-- (normalized) tag, a leaf carries a word.
module PennFormat(parseTreebank, showTree) where

import Text.PrettyPrint
import Data.Tree
import Data.Char

-- | Parse every tree found in the input text.
--
-- Each top-level @(@ opens a wrapper group: everything up to its matching
-- @)@ is parsed with 'parseTrees' and the resulting trees are appended to
-- the output, e.g. @( (S (NP x)) )@ yields a single tree rooted at @S@.
--
-- Any other top-level character (whitespace, a stray @)@, loose text) is
-- skipped, so the function is total instead of failing with a pattern-match
-- error on unexpected input.
parseTreebank :: String -> [Tree String]
parseTreebank [] = []
parseTreebank (c:cs)
  | c == '('  = let (ts, rest) = parseTrees cs
                in ts ++ parseTreebank rest
  | otherwise = parseTreebank cs

-- | Parse a sequence of sibling trees up to (and consuming) the closing
-- @)@ of the enclosing group, or up to the end of the input.
--
-- Returns the siblings together with the unconsumed remainder of the input.
parseTrees :: String -> ([Tree String], String)
parseTrees [] = ([], [])
parseTrees (c:cs)
  | isSpace c = parseTrees cs
  -- End of the enclosing group: no more siblings here.
  | c == ')'  = ([], cs)
  -- Bracketed subtree: tag, then its children, then the following siblings.
  | c == '('  = let (tag, cs1)      = parseWord cs
                    (children, cs2) = parseTrees cs1
                    (siblings, cs3) = parseTrees cs2
                in (Node (normalize tag) children : siblings, cs3)
  -- Bare word: a leaf, kept verbatim (not normalized).
  | otherwise = let (w, cs1)        = parseWord (c:cs)
                    (siblings, cs2) = parseTrees cs1
                in (Node w [] : siblings, cs2)

-- | Strip everything from the first @-@ onwards from a tag
-- (@NP-SBJ@ becomes @NP@).
--
-- A tag that starts with @-@ (such as @-NONE-@) would become empty, so it
-- is returned unchanged.
normalize :: String -> String
normalize tag =
  case break (== '-') tag of
    ([], _)   -> tag
    (base, _) -> base

-- | Split off the longest prefix that contains no whitespace and no
-- parenthesis; returns that prefix and the rest of the input.
parseWord :: String -> (String, String)
parseWord = break (\c -> isSpace c || c == '(' || c == ')')

-- | Render a tree in bracketed form: a node without children is printed as
-- its bare label, any other node as @(label child1 child2 ...)@ on one line.
printTree :: Tree String -> Doc
printTree (Node w [])         = text w
printTree (Node l children)   = parens (text l <+> hsep (map printTree children))

-- | Render a tree as a bracketed string, e.g. @(S (NP x))@.
showTree :: Tree String -> String
showTree = render . printTree