summaryrefslogtreecommitdiff
path: root/treebanks/PennTreebank/PennFormat.hs
blob: 2aaf0a6b6868bac5703f0352ace426d740faf22f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
module PennFormat(parseTreebank, showTree) where

import Text.PrettyPrint
import Data.Tree
import Data.Char

parseTreebank :: String -> [Tree String]
parseTreebank []     = []
parseTreebank (c:cs)
  | isSpace c        = parseTreebank cs
  | c == '('         = let (ts,cs1) = parseTrees cs
                       in ts ++ parseTreebank cs1

parseTrees []     = ([],[])
parseTrees (c:cs)
  | isSpace c     = parseTrees cs
  | c == ')'      = ([],cs)
  | c == '('      = let (w,       cs1) = parseWord  cs
                        (children,cs2) = parseTrees cs1
                        (rest,    cs3) = parseTrees cs2
                    in (Node (normalize w) children : rest,cs3)
  | otherwise     = let (w,       cs1) = parseWord  (c:cs)
                        (rest,    cs2) = parseTrees cs1
                    in (Node w [] : rest,cs2)

normalize tag =
  let (tag0,mod) = break (=='-') tag
  in if null tag0
       then tag
       else tag0

parseWord = break (\c -> isSpace c || c == '(' || c == ')')

printTree (Node w [])       = text w
printTree (Node l children) = parens (text l <+> hsep (map printTree children))

showTree :: Tree String -> String
showTree = render . printTree