summaryrefslogtreecommitdiff
path: root/treebanks
diff options
context:
space:
mode:
authorkr.angelov <kr.angelov@gmail.com>2013-12-05 10:05:33 +0000
committerkr.angelov <kr.angelov@gmail.com>2013-12-05 10:05:33 +0000
commit05854280181f5ad30939a4fc533d42560103d23f (patch)
tree124ef170431dbc6455a25824a8752e7ce81fbf20 /treebanks
parent106b41a2cb562b26a477af5372ddf09fe8f79ca7 (diff)
more on the Susanne treebank
Diffstat (limited to 'treebanks')
-rw-r--r--treebanks/susanne/Parser.hs37
-rw-r--r--treebanks/susanne/SusanneFormat.hs85
-rw-r--r--treebanks/susanne/convert.hs46
3 files changed, 142 insertions, 26 deletions
diff --git a/treebanks/susanne/Parser.hs b/treebanks/susanne/Parser.hs
new file mode 100644
index 000000000..4e87c6a00
--- /dev/null
+++ b/treebanks/susanne/Parser.hs
@@ -0,0 +1,37 @@
+module Parser where
+
+import Control.Monad
+
+import SusanneFormat
+
+newtype P a = P {runP :: [ParseTree] -> Maybe ([ParseTree], a)}
+
+instance Monad P where
+ return x = P (\ts -> Just (ts, x))
+ f >>= g = P (\ts -> case runP f ts of
+ Nothing -> Nothing
+ Just (ts,x) -> runP (g x) ts)
+
+instance MonadPlus P where
+ mzero = P (\ts -> Nothing)
+ mplus f g = P (\ts -> mplus (runP f ts) (runP g ts))
+
+match tag_spec = P (\ts ->
+ case ts of
+ (Phrase tag1 mods1 fn1 _ _:ts)
+ | tag == tag1 &&
+ all (flip elem mods1) mods &&
+ (null fn || fn == fn1) -> Just (ts,())
+ (Word _ tag1 _ _:ts)
+ | tag == tag1 -> Just (ts,())
+ _ -> Nothing)
+ where
+ (f,_) = readTag (Word "<match>" undefined undefined undefined) tag_spec
+ Phrase tag mods fn _ _ = f []
+
+many f =
+ do x <- f
+ xs <- many f
+ return (x:xs)
+ `mplus`
+ do return []
diff --git a/treebanks/susanne/SusanneFormat.hs b/treebanks/susanne/SusanneFormat.hs
index 052f95978..3eb3187e2 100644
--- a/treebanks/susanne/SusanneFormat.hs
+++ b/treebanks/susanne/SusanneFormat.hs
@@ -1,43 +1,82 @@
-module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank) where
+module SusanneFormat(Tag,Id,Word,Lemma,ParseTree(..),readTreebank,readTag) where
+import PGF(CId)
import Data.Char
type Tag = String
+type Mods = String
+type Fn = String
+type Index = Int
type Id = String
type Word = String
type Lemma = String
data ParseTree
- = Phrase Tag [ParseTree]
+ = Phrase Tag Mods Fn Index [ParseTree]
| Word Id Tag Word Lemma
+ | App CId [ParseTree]
data ParseTreePos
= Root
- | At ParseTreePos Tag [ParseTree]
+ | At ParseTreePos ([ParseTree] -> ParseTree) [ParseTree]
instance Show ParseTree where
- show (Phrase tag ts) = "["++tag++" "++unwords (map show ts)++"]"
- show (Word _ tag w _) = "["++tag++" "++w++"]"
+ show (Phrase tag mods fn idx ts)
+ | tag == "" = "["++fn++show idx++" "++unwords (map show ts)++"]"
+ | fn == "" && idx == 0 = "["++tag++mods++" "++unwords (map show ts)++"]"
+ | otherwise = "["++tag++mods++":"++fn++show idx++" "++unwords (map show ts)++"]"
+ show (Word _ tag w _) = "["++tag++" "++w++"]"
+ show (App f ts)
+ | null ts = show f
+ | otherwise = "("++show f++" "++unwords (map show ts)++")"
readTreebank ls = readLines Root (map words ls)
readLines p [] = []
readLines p ([id,_,tag,w,l,parse]:ls) =
- readParse p (Word id tag w l) parse ls
-
-readParse p w [] ls = readLines p ls
-readParse p w ('[':cs) ls =
- case break (not . isTagChar) cs of
- (tag,cs) -> readParse (At p tag []) w cs ls
-readParse (At p tag ts) w ('.':cs) ls =
- readParse (At p tag (w:ts)) w cs ls
-readParse (At p tag ts) w cs ls =
- case break (not . isTagChar) cs of
- (tag,']':cs) -> let t = Phrase tag (reverse ts)
- in case p of
- Root -> t : readLines p ls
- At p tag ts -> readParse (At p tag (t:ts)) w cs ls
- _ -> error cs
-
-isTagChar c =
- isLetter c || isDigit c || elem c ":&+-%@=?\"*!"
+ readParse (Word id tag w l) p parse ls
+
+readParse w p [] ls = readLines p ls
+readParse w p ('[':cs) ls =
+ case readTag w cs of
+ (fn,cs) -> readParse w (At p fn []) cs ls
+readParse w (At p fn ts) ('.':cs) ls =
+ readParse w (At p fn (w:ts)) cs ls
+readParse w (At p fn ts) cs ls =
+ case readTag w cs of
+ (_,']':cs) -> let t = fn (reverse ts)
+ in case p of
+ Root -> t : readLines p ls
+ At p fn ts -> readParse w (At p fn (t:ts)) cs ls
+ _ -> readError w
+
+readTag w cs@(c1:c2:_) -- word tag on phrase level
+ | isUpper c1 && isUpper c2 =
+ case break (\c -> not (isLetter c || isDigit c)) cs of
+ (tag,cs) -> case break (\c -> not (elem c "?*%!\"=+-&@")) cs of
+ (mods,cs) -> case cs of
+ (':':c:cs) | isLetter c -> case break (not . isDigit) cs of
+ (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
+ | isDigit c -> case break (not . isDigit) (c:cs) of
+ (ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
+ _ -> (Phrase tag mods "" 0,cs)
+readTag w (c:cs) -- phrase tag
+ | isUpper c = let tag = [c]
+ in case break (\c -> not (isLetter c || isDigit c || elem c "?*%!\"=+-&@")) cs of
+ (mods,cs) -> case cs of
+ (':':c:cs) | isLetter c -> case break (not . isDigit) cs of
+ (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
+ | isDigit c -> case break (not . isDigit) (c:cs) of
+ (ds,cs) -> (Phrase tag mods "" (if null ds then 0 else read ds),cs)
+ _ -> (Phrase tag mods "" 0,cs)
+ | isLower c = let tag = []
+ mods = []
+ in case break (not . isDigit) cs of
+ (ds,cs) -> (Phrase tag mods [c] (if null ds then 0 else read ds),cs)
+ | isDigit c = let tag = []
+ mods = []
+ in case break (not . isDigit) cs of
+ (ds,cs) -> (Phrase tag mods [] (read ds),cs)
+readTag w cs = readError w
+
+readError (Word id _ _ _) = error id
diff --git a/treebanks/susanne/convert.hs b/treebanks/susanne/convert.hs
index 91fdc2cf4..dfd2328ca 100644
--- a/treebanks/susanne/convert.hs
+++ b/treebanks/susanne/convert.hs
@@ -1,14 +1,54 @@
import System.Directory
import System.FilePath
import Data.List
+import Data.Char(toLower)
+import PGF (readPGF, readLanguage, buildMorpho, lookupMorpho, mkCId, functionType, unType)
import SusanneFormat
+Just eng = readLanguage "ParseEng"
+
main = do
+ gr <- readPGF "../../ParseEngAbs.pgf"
+ let morpho = buildMorpho gr eng
fs <- getDirectoryContents "data"
txts <- (mapM (\f -> readFile ("data" </> f)) . filter ((/= ".") . take 1)) (sort fs)
- let ts = filter (not . isBreak) (readTreebank (lines (concat txts)))
+ --let ts = concatMap (convert gr morpho) (readTreebank (lines (concat txts)))
+ let ts = readTreebank (lines (concat txts))
writeFile "text" (unlines (map show ts))
-isBreak (Phrase "Oh" [Word _ "YB" "<minbrk>" _]) = True
-isBreak _ = False
+convert pgf morpho w@(Word _ tag _ lemma)
+ | elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = []
+ | tag == "NN1c" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w
+ | tag == "NN1n" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w
+ | tag == "NN2" = convertLemma pgf morpho (mkCId "N") "s Pl Nom" w
+ | tag == "JJ" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w
+ | tag == "JB" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w
+ | tag == "JBo" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w
+ | tag == "AT" = convertLemma pgf morpho (mkCId "Quant") "s False Sg" w
+ | tag == "VVDi" = convertLemma pgf morpho (mkCId "V") "s VPast" w
+ | tag == "VVDt" = convertLemma pgf morpho (mkCId "V2") "s VPast" w
+ | tag == "VVDv" = convertLemma pgf morpho (mkCId "V") "s VPast" w
+ | tag == "VVZi" = convertLemma pgf morpho (mkCId "V") "s VPres" w
+ | tag == "VVZt" = convertLemma pgf morpho (mkCId "V2") "s VPres" w
+ | tag == "VVZv" = convertLemma pgf morpho (mkCId "V") "s VPres" w
+ | tag == "PPHS2"= convertLemma pgf morpho (mkCId "Pron") "s (NCase Nom)" w
+ | tag == "PPHO2"= convertLemma pgf morpho (mkCId "Pron") "s NPAcc" w
+ | tag == "RR" = convertLemma pgf morpho (mkCId "Adv") "s" w
+ | tag == "II" = convertLemma pgf morpho (mkCId "Prep") "s" w
+ | tag == "IO" = convertLemma pgf morpho (mkCId "Prep") "s" w
+ | otherwise = [w]
+convert pgf morpho (Phrase tag mods fn idx ts)
+ | tag == "O" = concatMap (convert pgf morpho) ts
+ | otherwise = [Phrase tag mods fn idx (concatMap (convert pgf morpho) ts)]
+
+convertLemma pgf morpho cat an0 w@(Word _ tag form _) =
+ case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of
+ [f] -> [App f []]
+ _ -> [w]
+ where
+ hasCat pgf f cat =
+ case functionType pgf f of
+ Just ty -> case unType ty of
+ (_,cat1,_) -> cat1 == cat
+ Nothing -> False