summaryrefslogtreecommitdiff
path: root/treebanks
diff options
context:
space:
mode:
authorkr.angelov <kr.angelov@gmail.com>2013-12-06 15:05:56 +0000
committerkr.angelov <kr.angelov@gmail.com>2013-12-06 15:05:56 +0000
commite2fe50e5859cb6ef359c1a08e3bceb3080cd2159 (patch)
tree39ea57c20489f04c9e6ff57a073a6b19e0c90086 /treebanks
parent71c174104674e7c6f05081c7e8c07f37ca6a1a99 (diff)
partial reconstruction for sentence structures in Susanne
Diffstat (limited to 'treebanks')
-rw-r--r--treebanks/susanne/Parser.hs77
-rw-r--r--treebanks/susanne/SusanneFormat.hs1
-rw-r--r--treebanks/susanne/convert.hs278
3 files changed, 330 insertions, 26 deletions
diff --git a/treebanks/susanne/Parser.hs b/treebanks/susanne/Parser.hs
index 4e87c6a00..62e362a9f 100644
--- a/treebanks/susanne/Parser.hs
+++ b/treebanks/susanne/Parser.hs
@@ -1,37 +1,90 @@
module Parser where
+import Data.Char
import Control.Monad
+import PGF(PGF,Morpho,lookupMorpho,functionType,unType)
import SusanneFormat
-newtype P a = P {runP :: [ParseTree] -> Maybe ([ParseTree], a)}
+newtype P a = P {runP :: PGF -> Morpho -> [ParseTree] -> Maybe ([ParseTree], a)}
instance Monad P where
- return x = P (\ts -> Just (ts, x))
- f >>= g = P (\ts -> case runP f ts of
- Nothing -> Nothing
- Just (ts,x) -> runP (g x) ts)
+ return x = P (\pgf morpho ts -> Just (ts, x))
+ f >>= g = P (\pgf morpho ts -> case runP f pgf morpho ts of
+ Nothing -> Nothing
+ Just (ts,x) -> runP (g x) pgf morpho ts)
instance MonadPlus P where
- mzero = P (\ts -> Nothing)
- mplus f g = P (\ts -> mplus (runP f ts) (runP g ts))
+ mzero = P (\pgf morpho ts -> Nothing)
+ mplus f g = P (\pgf morpho ts -> mplus (runP f pgf morpho ts) (runP g pgf morpho ts))
-match tag_spec = P (\ts ->
+match tag_spec = P (\pgf morpho ts ->
case ts of
- (Phrase tag1 mods1 fn1 _ _:ts)
+ (t@(Phrase tag1 mods1 fn1 _ _):ts)
| tag == tag1 &&
all (flip elem mods1) mods &&
- (null fn || fn == fn1) -> Just (ts,())
- (Word _ tag1 _ _:ts)
- | tag == tag1 -> Just (ts,())
+ (null fn || fn == fn1) -> Just (ts,t)
+ (t@(Word _ tag1 _ _):ts)
+ | tag == tag1 -> Just (ts,t)
_ -> Nothing)
where
(f,_) = readTag (Word "<match>" undefined undefined undefined) tag_spec
Phrase tag mods fn _ _ = f []
+many1 f = do
+ x <- f
+ xs <- many f
+ return (x:xs)
+
many f =
do x <- f
xs <- many f
return (x:xs)
`mplus`
do return []
+
+inside tag_spec p = P (\pgf morpho ts ->
+ case ts of
+ (t@(Phrase tag1 mods1 fn1 _ ts'):ts)
+ | tag == tag1 &&
+ all (flip elem mods1) mods &&
+ (null fn || fn == fn1) -> case runP p pgf morpho ts' of
+ Just ([],x) -> Just (ts,x)
+ _ -> Nothing
+ _ -> Nothing)
+ where
+ (f,_) = readTag (Word "<match>" undefined undefined undefined) tag_spec
+ Phrase tag mods fn _ _ = f []
+
+insideOpt tag_spec p = P (\pgf morpho ts ->
+ case ts of
+ (t@(Phrase tag1 mods1 fn1 _ ts'):ts)
+ | tag == tag1 &&
+ all (flip elem mods1) mods &&
+ (null fn || fn == fn1) -> case runP p pgf morpho ts' of
+ Just ([],x) -> Just (ts,x)
+ _ -> Just (ts,t)
+ _ -> Nothing)
+ where
+ (f,_) = readTag (Word "<match>" undefined undefined undefined) tag_spec
+ Phrase tag mods fn _ _ = f []
+
+lemma tag cat an0 = P (\pgf morpho ts ->
+ case ts of
+ (t@(Word _ tag1 form _):ts) | tag == tag1 ->
+ case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of
+ [f] -> Just (ts,App f [])
+ _ -> Just (ts,t)
+ _ -> Nothing)
+ where
+ hasCat pgf f cat =
+ case functionType pgf f of
+ Just ty -> case unType ty of
+ (_,cat1,_) -> cat1 == cat
+ Nothing -> False
+
+opt f =
+ do x <- f
+ return (Just x)
+ `mplus`
+ do return Nothing
diff --git a/treebanks/susanne/SusanneFormat.hs b/treebanks/susanne/SusanneFormat.hs
index 3eb3187e2..43a685a0f 100644
--- a/treebanks/susanne/SusanneFormat.hs
+++ b/treebanks/susanne/SusanneFormat.hs
@@ -15,6 +15,7 @@ data ParseTree
= Phrase Tag Mods Fn Index [ParseTree]
| Word Id Tag Word Lemma
| App CId [ParseTree]
+ deriving Eq
data ParseTreePos
= Root
diff --git a/treebanks/susanne/convert.hs b/treebanks/susanne/convert.hs
index dfd2328ca..d25e7296d 100644
--- a/treebanks/susanne/convert.hs
+++ b/treebanks/susanne/convert.hs
@@ -2,24 +2,34 @@ import System.Directory
import System.FilePath
import Data.List
import Data.Char(toLower)
+import Control.Monad
+import qualified Data.Map as Map
import PGF (readPGF, readLanguage, buildMorpho, lookupMorpho, mkCId, functionType, unType)
import SusanneFormat
+import Parser
+import Idents
-Just eng = readLanguage "ParseEng"
+Just eng = readLanguage "DictEng"
main = do
- gr <- readPGF "../../ParseEngAbs.pgf"
+ gr <- readPGF "DictEngAbs.pgf"
let morpho = buildMorpho gr eng
fs <- getDirectoryContents "data"
txts <- (mapM (\f -> readFile ("data" </> f)) . filter ((/= ".") . take 1)) (sort fs)
- --let ts = concatMap (convert gr morpho) (readTreebank (lines (concat txts)))
- let ts = readTreebank (lines (concat txts))
- writeFile "text" (unlines (map show ts))
+ --let ts' = readTreebank (lines (concat txts))
+ --writeFile "text" (unlines (map show ts'))
+ let (ts',rs') = combineRes (convert gr morpho) (readTreebank (lines (concat txts)))
+ let rm = Map.fromListWith (++) rs'
+ writeFile "susanne.gft" (unlines (map show ts'))
+ writeFile "rules" (unlines (concat [unwords ("-":cat:"->":cats) : map (\t -> " "++show t) rs'' | (cat :-> cats,rs'') <- Map.toList rm]))
+
+data Rule = Tag :-> [Tag]
+ deriving (Eq,Ord)
convert pgf morpho w@(Word _ tag _ lemma)
- | elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = []
- | tag == "NN1c" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w
+ | elem tag ["YB","YBL","YBR","YF","YIL","YIR","YTL","YTR", "YO"] = ([],[])
+{- | tag == "NN1c" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w
| tag == "NN1n" = convertLemma pgf morpho (mkCId "N") "s Sg Nom" w
| tag == "NN2" = convertLemma pgf morpho (mkCId "N") "s Pl Nom" w
| tag == "JJ" = convertLemma pgf morpho (mkCId "A") "s (AAdj Posit Nom)" w
@@ -36,19 +46,259 @@ convert pgf morpho w@(Word _ tag _ lemma)
| tag == "PPHO2"= convertLemma pgf morpho (mkCId "Pron") "s NPAcc" w
| tag == "RR" = convertLemma pgf morpho (mkCId "Adv") "s" w
| tag == "II" = convertLemma pgf morpho (mkCId "Prep") "s" w
- | tag == "IO" = convertLemma pgf morpho (mkCId "Prep") "s" w
- | otherwise = [w]
-convert pgf morpho (Phrase tag mods fn idx ts)
- | tag == "O" = concatMap (convert pgf morpho) ts
- | otherwise = [Phrase tag mods fn idx (concatMap (convert pgf morpho) ts)]
+ | tag == "IO" = convertLemma pgf morpho (mkCId "Prep") "s" w-}
+ | otherwise = ([w],[])
+convert pgf morpho t@(Phrase tag mods fn idx ts)
+ | tag == "O" = (ts',rs')
+ | tag == "Q" = (ts',rs')
+ | tag == "S" = case runP pS pgf morpho ts' of
+ Just ([],x) -> ([x], rs')
+ _ -> ([Phrase tag mods fn idx ts'], (r,[t]) : rs')
+ | otherwise = ([Phrase tag mods fn idx ts'], (r,[t]) : rs')
+ where
+ (ts',rs') = combineRes (convert pgf morpho) ts
+ r = tag :-> map getTag ts
+
+ isExtra (Word _ "YIL" _ _) = True
+ isExtra (Word _ "YIR" _ _) = True
+ isExtra (Word _ "YTL" _ _) = True
+ isExtra (Word _ "YTR" _ _) = True
+ isExtra _ = False
+
+ getTag (Phrase tag mods fn idx ts) = tag++if null fn then "" else ":"++fn
+ getTag (Word _ tag _ _) = tag
convertLemma pgf morpho cat an0 w@(Word _ tag form _) =
case [f | (f,an) <- lookupMorpho morpho (map toLower form), hasCat pgf f cat, an == an0] of
- [f] -> [App f []]
- _ -> [w]
+ [f] -> ([App f []], [])
+ _ -> ([w],[])
where
hasCat pgf f cat =
case functionType pgf f of
Just ty -> case unType ty of
(_,cat1,_) -> cat1 == cat
Nothing -> False
+
+combineRes f ts = (ts',rs')
+ where
+ (x,y) = unzip (map f ts)
+ ts' = concat x
+ rs' = concat y
+
+pS =
+ do mplus pConj (return ())
+ advs <- many pAdS
+ np <- pSubject
+ (t,p,vp) <- pVP
+ return (foldr ($) (App cidUseCl [t,p,App cidPredVP [np, vp]]) advs)
+ `mplus`
+ do mplus pConj (return ())
+ (t,p,vp) <- pVP
+ return (App cidImpVP [vp])
+ `mplus`
+ do mplus pConj (return ())
+ advs <- many pAdS
+ t1 <- match "EX"
+ (t,p,vp) <- pVP
+ return (foldr ($) (App cidUseCl [t,p,App cidExistNP [t1,vp]]) advs)
+
+pSubject =
+ do insideOpt "N:s" pNP
+ `mplus`
+ do insideOpt "N:S" pNP
+ `mplus`
+ do match "M:s"
+ `mplus`
+ do match "M:S"
+ `mplus`
+ do match "D:s"
+ `mplus`
+ do match "D:S"
+
+pConj =
+ do match "CC"
+ return ()
+ `mplus`
+ do match "CCB"
+ return ()
+
+pAdS =
+ do adv <- pAdv
+ match "YC"
+ return (\t -> App cidExtAdvS [adv,t])
+ `mplus`
+ do adv <- pAdv
+ return (\t -> App cidAdvS [adv,t])
+
+pVP =
+ do adVs <- many pAdV
+ (t,p,vs) <- pV "VS"
+ advs <- many pAdv
+ s <- insideOpt "F:o"
+ (opt (match "CST") >> pS)
+ return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
+ (foldl (\t adv -> App cidAdvVP [t, adv])
+ (App cidComplVS [vs, s])
+ advs)
+ adVs)
+ `mplus`
+ do adVs <- many pAdV
+ (t,p,vv) <- pV "VV"
+ advs <- many pAdv
+ vp <- match "Ti"
+ return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
+ (foldl (\t adv -> App cidAdvVP [t, adv])
+ (App cidComplVV [vv, vp])
+ advs)
+ adVs)
+ `mplus`
+ do adVs <- many pAdV
+ (t,p,v2) <- pV "V2"
+ o <- pObject
+ opt (match "YC") -- what is this?
+ advs <- many pAdv
+ return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
+ (foldl (\t adv -> App cidAdvVP [t, adv])
+ (App cidComplSlash [App cidSlashV2a [v2],o])
+ advs)
+ adVs)
+ `mplus`
+ do adVs <- many pAdV
+ (t,p,v) <- pV "V"
+ advs <- many pAdv
+ return (t,p,foldr (\adv t -> App cidAdVVP [adv,t])
+ (foldl (\t adv -> App cidAdvVP [t, adv])
+ (App cidUseV [v])
+ advs)
+ adVs)
+
+pV cat =
+ do inside "V" $
+ do v <- lemma "VVDv" (mkCId cat) "s VPast"
+ return (App cidTTAnt [App cidTPast [],App cidASimul []],App cidPPos [],v)
+ `mplus`
+ do v <- lemma "VVDt" (mkCId cat) "s VPast"
+ return (App cidTTAnt [App cidTPast [],App cidASimul []],App cidPPos [],v)
+ `mplus`
+ do v <- lemma "VVZv" (mkCId cat) "s VPres"
+ return (App cidTTAnt [App cidTPres [],App cidASimul []],App cidPPos [],v)
+ `mplus`
+ do match "VHD"
+ match "VHD"
+ v <- lemma "VVNv" (mkCId cat) "s VPPart"
+ return (App cidTTAnt [App cidTPres [],App cidAAnter []],App cidPPos [],v)
+ `mplus`
+ do v <- match "V"
+ return (App (mkCId "XXX") [],App (mkCId "XXX") [],v)
+
+pAdV =
+ do insideOpt "R:c" $
+ lemma "RRR" (mkCId "AdV") "s"
+ `mplus`
+ do match "R:m"
+
+pObject =
+ match "P:u"
+ `mplus`
+ insideOpt "N:o" pNP
+ `mplus`
+ match "N:e"
+ `mplus`
+ match "M:e"
+ `mplus`
+ match "D:e"
+ `mplus`
+ match "P:e"
+
+pAdv =
+ do match "N:t"
+ `mplus`
+ do match "N:h"
+ `mplus`
+ do match "P:p"
+ `mplus`
+ do match "P:q"
+ `mplus`
+ do match "P:a"
+ `mplus`
+ do match "P:t"
+ `mplus`
+ do match "P:h"
+ `mplus`
+ do match "P:m"
+ `mplus`
+ do match "P:r"
+ `mplus`
+ do match "R:p"
+ `mplus`
+ do match "R:q"
+ `mplus`
+ do match "R:a"
+ `mplus`
+ do match "R:t"
+ `mplus`
+ do match "R:h"
+ `mplus`
+ do match "R:m"
+ `mplus`
+ do match "R:c"
+ `mplus`
+ do match "R:r"
+ `mplus`
+ do match "F:p"
+ `mplus`
+ do match "F:q"
+ `mplus`
+ do match "F:a"
+ `mplus`
+ do match "F:t"
+ `mplus`
+ do match "F:h"
+ `mplus`
+ do match "F:m"
+ `mplus`
+ do match "F:r"
+ `mplus`
+ do match "W:b"
+ `mplus`
+ do match "L:b"
+
+pNP = do
+ q <- pQuant
+ (n,cn) <- pCN
+ return (App cidDetCN [App cidDetQuant [q,n],cn])
+
+pQuant =
+ do lemma "AT" (mkCId "Quant") "s False Sg"
+ `mplus`
+ do match "AT1"
+ return (App cidIndefArt [])
+
+pCN =
+ do np <- insideOpt "N" pNP
+ (n,cn) <- pCN
+ return (n,App (mkCId "Appos") [np,cn])
+ `mplus`
+ do a <- lemma "JJ" (mkCId "A") "s (AAdj Posit Nom)"
+ (n,cn) <- pCN
+ return (n,App cidAdjCN [App cidPositA [a],cn])
+ `mplus`
+ do (num,n) <- pN
+ advs <- many pPo
+ return (num,
+ foldl (\t adv -> App cidAdvCN [t, adv])
+ (App cidUseN [n])
+ advs)
+
+pN =
+ do n <- lemma "NN1c" (mkCId "N") "s Sg Nom"
+ return (App cidNumSg [], n)
+ `mplus`
+ do n <- lemma "NN1n" (mkCId "N") "s Sg Nom"
+ return (App cidNumSg [], n)
+
+pPo =
+ insideOpt "Po" $ do
+ p <- match "IO"
+ np <- insideOpt "N" pNP
+ return (App cidPrepNP [p,np])