From 6e3503bb7b6c9aac12711477b8a474ce41c1cd7a Mon Sep 17 00:00:00 2001 From: "kr.angelov" Date: Mon, 1 Oct 2012 08:52:54 +0000 Subject: move examples/PennTreebank to /treebanks/PennTreebank --- examples/PennTreebank/Idents.hs | 115 ----- examples/PennTreebank/Monad.hs | 98 ----- examples/PennTreebank/PennFormat.hs | 38 -- examples/PennTreebank/training.hs | 125 ------ examples/PennTreebank/translate.hs | 809 ----------------------------------- treebanks/PennTreebank/Idents.hs | 115 +++++ treebanks/PennTreebank/Monad.hs | 98 +++++ treebanks/PennTreebank/PennFormat.hs | 38 ++ treebanks/PennTreebank/training.hs | 125 ++++++ treebanks/PennTreebank/translate.hs | 809 +++++++++++++++++++++++++++++++++++ 10 files changed, 1185 insertions(+), 1185 deletions(-) delete mode 100644 examples/PennTreebank/Idents.hs delete mode 100644 examples/PennTreebank/Monad.hs delete mode 100644 examples/PennTreebank/PennFormat.hs delete mode 100644 examples/PennTreebank/training.hs delete mode 100644 examples/PennTreebank/translate.hs create mode 100644 treebanks/PennTreebank/Idents.hs create mode 100644 treebanks/PennTreebank/Monad.hs create mode 100644 treebanks/PennTreebank/PennFormat.hs create mode 100644 treebanks/PennTreebank/training.hs create mode 100644 treebanks/PennTreebank/translate.hs diff --git a/examples/PennTreebank/Idents.hs b/examples/PennTreebank/Idents.hs deleted file mode 100644 index b084d9214..000000000 --- a/examples/PennTreebank/Idents.hs +++ /dev/null @@ -1,115 +0,0 @@ -module Idents where - -import PGF - -cidASimul = mkCId "ASimul" -cidAAnter = mkCId "AAnter" -cidPositAdvAdj = mkCId "PositAdvAdj" -cidPositAdVAdj = mkCId "PositAdVAdj" -cidUseCl = mkCId "UseCl" -cidPredVP = mkCId "PredVP" -cidSlashVP = mkCId "SlashVP" -cidComplPredVP = mkCId "ComplPredVP" -cidAdjCN = mkCId "AdjCN" -cidUseN = mkCId "UseN" -cidDetQuant = mkCId "DetQuant" -cidDetQuantOrd = mkCId "DetQuantOrd" -cidNumSg = mkCId "NumSg" -cidNumPl = mkCId "NumPl" -cidDetCN = mkCId "DetCN" -cidIndefArt = mkCId "IndefArt" -cidUsePN = mkCId "UsePN" -cidUseQuantPN = mkCId "UseQuantPN" -cidSymbPN = mkCId "SymbPN" -cidMkSymb = mkCId "MkSymb" -cidUsePron = mkCId "UsePron" -cidConjNP = mkCId "ConjNP" -cidBaseNP = mkCId "BaseNP" -cidConsNP = mkCId "ConsNP" -cidConjCN = mkCId "ConjCN" -cidBaseCN = mkCId "BaseCN" -cidConsCN = mkCId "ConsCN" -cidConjAdv = mkCId "ConjAdv" -cidBaseAdv = mkCId "BaseAdv" -cidConsAdv = mkCId "ConsAdv" -cidBaseS = mkCId "BaseS" -cidConsS = mkCId "ConsS" -cidConjS = mkCId "ConjS" -cidMassNP = mkCId "MassNP" -cidAdvNP = mkCId "AdvNP" -cidTPres = mkCId "TPres" -cidTPast = mkCId "TPast" -cidTFut = mkCId "TFut" -cidTCond = mkCId "TCond" -cidTTAnt = mkCId "TTAnt" -cidPPos = mkCId "PPos" -cidPNeg = mkCId "PNeg" -cidComplSlash = mkCId "ComplSlash" -cidSlashV2a = mkCId "SlashV2a" -cidComplVS = mkCId "ComplVS" -cidComplVV = mkCId "ComplVV" -cidUseV = mkCId "UseV" -cidAdVVP = mkCId "AdVVP" -cidAdvVP = mkCId "AdvVP" -cidAdvVPSlash = mkCId "AdvVPSlash" -cidPrepNP = mkCId "PrepNP" -cidto_Prep = mkCId "to_Prep" -cidsuch_as_Prep= mkCId "such_as_Prep" -cidPastPartAP = mkCId "PastPartAP" -cidPassVPSlash = mkCId "PassVPSlash" -cidAdvS = mkCId "AdvS" -cidPositA = mkCId "PositA" -cidIDig = mkCId "IDig" -cidIIDig = mkCId "IIDig" -cidNumCard = mkCId "NumCard" -cidNumDigits = mkCId "NumDigits" -cidNumNumeral = mkCId "NumNumeral" -cidnum = mkCId "num" -cidpot2as3 = mkCId "pot2as3" -cidpot1as2 = mkCId "pot1as2" -cidpot0as1 = mkCId "pot0as1" -cidpot01 = mkCId "pot01" -cidpot0 = mkCId "pot0" -cidn2 = mkCId "n2" -cidn3 = mkCId "n3" -cidn4 = mkCId "n4" -cidn5 = mkCId "n5" -cidn6 = mkCId "n6" -cidn7 = mkCId "n7" -cidn8 = mkCId "n8" -cidn9 = mkCId "n9" -cidPossPron = mkCId "PossPron" -cidCompAP = mkCId "CompAP" -cidCompNP = mkCId "CompNP" -cidCompAdv = mkCId "CompAdv" -cidCompS = mkCId "CompS" -cidCompVP = mkCId "CompVP" -cidUseComp = mkCId "UseComp" -cidCompoundCN = mkCId "CompoundCN" -cidDashCN = mkCId "DashCN" -cidProgrVP = mkCId "ProgrVP" -cidGerundN = mkCId "GerundN" -cidGerundAP = mkCId "GerundAP" -cidGenNP = mkCId "GenNP" -cidPredetNP = mkCId "PredetNP" -cidDetNP = mkCId "DetNP" -cidAdAP = mkCId "AdAP" -cidAdvAP = mkCId "AdvAP" -cidPositAdAAdj = mkCId "PositAdAAdj" -cideither7or_DConj = mkCId "either7or_DConj" -cidboth7and_DConj = mkCId "both7and_DConj" -cidor_Conj = mkCId "or_Conj" -cidand_Conj = mkCId "and_Conj" -cidamp_Conj = mkCId "amp_Conj" -cidSlashV2V = mkCId "SlashV2V" -cidComplVA = mkCId "ComplVA" -cidAdNum = mkCId "AdNum" -cidi_Pron = mkCId "i_Pron" -cidOrdSuperl = mkCId "OrdSuperl" -cidno_RP = mkCId "no_RP" -cidthat_RP = mkCId "that_RP" -cidUseRCl = mkCId "UseRCl" -cidRelSlash = mkCId "RelSlash" -cidRelNP = mkCId "RelNP" -cidRelVP = mkCId "RelVP" -cidmany_Det = mkCId "many_Det" diff --git a/examples/PennTreebank/Monad.hs b/examples/PennTreebank/Monad.hs deleted file mode 100644 index 30fd1d7a0..000000000 --- a/examples/PennTreebank/Monad.hs +++ /dev/null @@ -1,98 +0,0 @@ -module Monad ( Rule(..), Grammar, grammar - , P, parse - , cat, word, lemma, inside, transform - , many, many1, opt - ) where - -import Data.Tree -import Data.Char -import qualified Data.Map as Map -import Control.Monad -import PGF hiding (Tree,parse) - -infix 1 :-> - - -data Rule t e = t :-> P t e e -type Grammar t e = t -> PGF -> Morpho -> [Tree t] -> e - -grammar :: (Ord t,Show t) => ([e] -> e) -> [Rule t e] -> Grammar t e -grammar def rules = gr - where - gr = \tag -> - case Map.lookup tag pmap of - Just f -> \pgf m ts -> case unP f gr pgf m ts of - Just (e,[]) -> e - _ -> case ts of - [Node w []] -> def [] - ts -> def [gr tag pgf m ts | Node tag ts <- ts] - Nothing -> \pgf m ts -> case ts of - [Node w []] -> def [] - ts -> def [gr tag pgf m ts | Node tag ts <- ts] - - pmap = Map.fromListWith mplus (map (\(t :-> r) -> (t,r)) rules) - - -newtype P t e a = P {unP :: Grammar t e -> PGF -> Morpho -> [Tree t] -> Maybe (a,[Tree t])} - -instance Monad (P t e) where - return x = P (\gr pgf m ts -> Just (x,ts)) - f >>= g = P (\gr pgf m ts -> case unP f gr pgf m ts of - Just (x,ts) -> unP (g x) gr pgf m ts - Nothing -> Nothing) - -instance MonadPlus (P t e) where - mzero = P (\gr pgf m ts -> Nothing) - mplus f g = P (\gr pgf m ts -> unP f gr pgf m ts `mplus` unP g gr pgf m ts) - - -parse :: Grammar t e -> PGF -> Morpho -> Tree t -> e -parse gr pgf morpho (Node tag ts) = gr tag pgf morpho ts - -cat :: Eq t => t -> P t e e -cat tag = P (\gr pgf morpho ts -> - case ts of - (Node tag1 ts1 : ts) | tag == tag1 -> Just (gr tag1 pgf morpho ts1,ts) - _ -> Nothing) - -word :: P t e t -word = P (\gr pgf morpho ts -> - case ts of - (Node w [] : ts) -> Just (w,ts) - _ -> Nothing) - -inside :: Eq t => t -> P t e a -> P t e a -inside tag f = P (\gr pgf morpho ts -> - case ts of - (Node tag1 ts1 : ts) | tag == tag1 -> case unP f gr pgf morpho ts1 of - Just (x,[]) -> Just (x,ts) - _ -> Nothing - _ -> Nothing) - -lemma :: String -> String -> P String e CId -lemma cat0 an0 = P (\gr pgf morpho ts -> - case ts of - (Node w [] : ts) -> case [lemma | (lemma, an1) <- lookupMorpho morpho (map toLower w) - , let cat1 = maybe "" (showType []) (functionType pgf lemma) - , cat0 == cat1 && an0 == an1] of - (id:_) -> Just (id,ts) - _ -> Nothing - _ -> Nothing) - -transform :: ([Tree t] -> [Tree t]) -> P t e () -transform f = P (\gr pgf morpho ts -> Just ((),f ts)) - -many :: P t e a -> P t e [a] -many f = do x <- f - xs <- many f - return (x:xs) - `mplus` - do return [] - -many1 :: P t e a -> P t e [a] -many1 f = do x <- f - xs <- many f - return (x:xs) - -opt :: P t e a -> a -> P t e a -opt f x = mplus f (return x) diff --git a/examples/PennTreebank/PennFormat.hs b/examples/PennTreebank/PennFormat.hs deleted file mode 100644 index 2aaf0a6b6..000000000 --- a/examples/PennTreebank/PennFormat.hs +++ /dev/null @@ -1,38 +0,0 @@ -module PennFormat(parseTreebank, showTree) where - -import Text.PrettyPrint -import Data.Tree -import Data.Char - -parseTreebank :: String -> [Tree String] -parseTreebank [] = [] -parseTreebank (c:cs) - | isSpace c = parseTreebank cs - | c == '(' = let (ts,cs1) = parseTrees cs - in ts ++ parseTreebank cs1 - -parseTrees [] = ([],[]) -parseTrees (c:cs) - | isSpace c = parseTrees cs - | c == ')' = ([],cs) - | c == '(' = let (w, cs1) = parseWord cs - (children,cs2) = parseTrees cs1 - (rest, cs3) = parseTrees cs2 - in (Node (normalize w) children : rest,cs3) - | otherwise = let (w, cs1) = parseWord (c:cs) - (rest, cs2) = parseTrees cs1 - in (Node w [] : rest,cs2) - -normalize tag = - let (tag0,mod) = break (=='-') tag - in if null tag0 - then tag - else tag0 - -parseWord = break (\c -> isSpace c || c == '(' || c == ')') - -printTree (Node w []) = text w -printTree (Node l children) = parens (text l <+> hsep (map printTree children)) - -showTree :: Tree String -> String -showTree = render . printTree diff --git a/examples/PennTreebank/training.hs b/examples/PennTreebank/training.hs deleted file mode 100644 index 433e5852c..000000000 --- a/examples/PennTreebank/training.hs +++ /dev/null @@ -1,125 +0,0 @@ -import PGF -import qualified Data.Map as Map -import Data.Maybe -import Data.List - -main = do - pgf <- readPGF "ParseEngAbs.pgf" - ls <- fmap (filterExprs . zip [1..] . lines) $ readFile "log4.txt" - putStrLn "" - putStrLn ("trees: "++show (length ls)) - let stats = foldl' (collectStats pgf) - (initStats pgf) - [(n,fromMaybe (error l) (readExpr (toQ l)),Just (mkCId "Phr"),Nothing) | (n,l) <- ls] - - putStrLn ("coverage: "++show (coverage stats)) - - putStrLn ("Writing ParseEngAbs.probs...") - writeFile "ParseEngAbs.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- uprobs pgf stats]) - - putStrLn ("Writing ParseEngAbs2.probs...") - writeFile "ParseEngAbs2.probs" (unlines [show cat1 ++ "\t" ++ show cat2 ++ "\t" ++ show p | (cat1,cat2,p) <- mprobs pgf stats]) - - putStrLn ("Writing global.probs...") - writeFile "global.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- gprobs pgf stats]) - - putStrLn ("Writing categories.probs...") - writeFile "categories.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- cprobs pgf stats]) - where - toQ [] = [] - toQ ('[':cs) = let (xs,']':ys) = break (==']') cs - in toQ ('?' : ys) - toQ ('?':cs) = 'Q' : toQ cs - toQ (c:cs) = c : toQ cs - -filterExprs [] = [] -filterExprs ((n,l):ls) - | null l = filterExprs ls - | elem (head l) "+#*" = (n,drop 2 l) : filterExprs ls - | otherwise = filterExprs ls - -initStats pgf = - (Map.fromListWith (+) - ([(f,1) | f <- functions pgf] ++ - [(cat pgf f,1) | f <- functions pgf]) - ,Map.empty - ,0 - ) - -collectStats pgf (ustats,bstats,count) (n,e,mb_cat1,mb_cat2) = - case unApp e of - Just (f,args) -> let fcat2 = cat2 pgf f n e - fcat = fromMaybe (cat2 pgf f n e) mb_cat1 - cf = fromMaybe 0 (Map.lookup f ustats) - cc = fromMaybe 0 (Map.lookup fcat ustats) - in if isJust mb_cat1 && f /= mkCId "Q" && fcat /= fcat2 - then error (show n ++ ": " ++ showExpr [] e) - else - cf `seq` cc `seq` bstats `seq` count `seq` - foldl' (collectStats pgf) - (Map.insert f (cf+1) (Map.insert fcat (cc+1) ustats) - ,(if null args - then Map.insertWith (+) (fcat,wildCId) 1 - else id) - (maybe bstats (\cat2 -> Map.insertWith (+) (cat2,fcat) 1 bstats) mb_cat2) - ,count+1 - ) - (zipWith3 (\e mb_cat1 mb_cat2 -> (n,e,mb_cat1,mb_cat2)) args (argCats f) (repeat (Just fcat))) - Nothing -> case unStr e of - Just _ -> (ustats,bstats,count+1) - Nothing -> error ("collectStats ("++showExpr [] e++")") - where - argCats f = - case fmap unType (functionType pgf f) of - Just (arg_tys,_,_) -> let tyCat (_,_,ty) = let (_,cat,_) = unType ty in Just cat - in map tyCat arg_tys - Nothing -> repeat Nothing - -coverage (ustats,bstats,count) = - let c = fromMaybe 0 (Map.lookup (mkCId "Q") ustats) - in (fromIntegral (count - c) / fromIntegral count) * 100 - -uprobs pgf (ustats,bstats,count) = - [toProb f (cat pgf f) | f <- functions pgf] - where - toProb f cat = - let count = fromMaybe 0 (Map.lookup f ustats) - cat_mass = fromMaybe 0 (Map.lookup cat ustats) - in (f, fromIntegral count / fromIntegral cat_mass :: Double) - -mprobs pgf (ustats,bstats,count) = - concat [toProb cat | cat <- categories pgf] - where - toProb cat = - let mass = sum [count | ((cat1,cat2),count) <- Map.toList bstats, cat1==cat] - cat_count = fromMaybe 0 (Map.lookup cat ustats) - fun_count = sum [fromMaybe 0 (Map.lookup f ustats) | f <- functionsByCat pgf cat] - in (cat,mkCId "*",if cat_count == 0 then 0 else fromIntegral (cat_count - fun_count) / fromIntegral cat_count) : - [(cat1,cat2,fromIntegral count / fromIntegral mass) - | ((cat1,cat2),count) <- Map.toList bstats, cat1==cat] - -gprobs pgf (ustats,bstats,count) = - sortBy (\x y -> compare (snd y) (snd x)) [toProb f | f <- functions pgf] - where - toProb f = - let fcount = fromMaybe 0 (Map.lookup f ustats) - in (f, fromIntegral fcount / fromIntegral count :: Double) - -cprobs pgf (ustats,bstats,count) = - sortBy (\x y -> compare (snd y) (snd x)) [toProb c | c <- categories pgf] - where - mass = sum [fromMaybe 0 (Map.lookup c ustats) | c <- categories pgf] - - toProb c = - let fcount = fromMaybe 0 (Map.lookup c ustats) - in (c, fromIntegral fcount / fromIntegral mass :: Double) - -cat pgf f = - case fmap unType (functionType pgf f) of - Just (_,cat,_) -> cat - Nothing -> error ("Unknown function "++showCId f) - -cat2 pgf f n e = - case fmap unType (functionType pgf f) of - Just (_,cat,_) -> cat - Nothing -> error (show n ++ ": Unknown function "++showCId f++" in "++showExpr [] e) diff --git a/examples/PennTreebank/translate.hs b/examples/PennTreebank/translate.hs deleted file mode 100644 index 35a17e22a..000000000 --- a/examples/PennTreebank/translate.hs +++ /dev/null @@ -1,809 +0,0 @@ --- [1416,4467,4623,4871,4561,4303,3763,3137,2501,1857,1353,952,646,483,332,200,116,89,54,41,20,22,7,2,4,5,0,3,2,1,0,0,0,0,0,1] --- average 5 - -import Monad -import Idents -import PennFormat - -import PGF hiding (Tree,parse) -import Control.Monad -import System.IO -import System.Process -import Data.Maybe -import Data.List -import Data.IORef -import Data.Char -import Data.Tree - -test = False - -main = do - pgf <- readPGF "ParseEngAbs.pgf" - let Just language = readLanguage "ParseEng" - morpho = buildMorpho pgf language - s <- readFile "wsj.02-21" - ref <- newIORef (0,0,0) - mapM_ (process pgf morpho ref) ((if test then take 40 else id) (parseTreebank s)) - where - process pgf morpho ref t = do - (cn,co,l) <- readIORef ref - let e = (flatten . parse penn pgf morpho . prune) t - (cn',co') = count (cn,co) e - l' = l+1 - writeIORef ref (cn',co',l') - hPutStrLn stdout (showExpr [] e) - when test $ do - writeFile ("tmp_tree.dot") (graphvizAbstractTree pgf (True,False) e) - rawSystem "dot" ["-Tpdf", "tmp_tree.dot", "-otrees/tree"++showAlign l'++".pdf"] - return () - hPutStrLn stderr (show ((fromIntegral cn' / fromIntegral co') * 100)) - - count (cn,co) e = cn `seq` co `seq` - case unApp e of - Just (f,es) -> if f == meta - then foldl' count (cn, co+1) es - else foldl' count (cn+1,co+1) es - Nothing -> (cn+1,co+1) - - - showAlign n = - replicate (5 - length s) '0' ++ s - where - s = show n - - prune (Node tag ts) - | tag == "S" - && not (null ts) - && last ts == Node "." [Node "." []] = Node tag (init ts) - | otherwise = Node tag ts - - flatten e = - case unApp e of - Just (f,es) | f == meta -> mkApp f (concatMap grab es) - | otherwise -> mkApp f (map flatten es) - Nothing -> e - - grab e = - case unApp e of - Just (f,es) | f == meta -> concatMap grab es - | otherwise -> [mkApp f (map flatten es)] - Nothing -> [] - - -penn :: Grammar String Expr -penn = - grammar (mkApp meta) - [ "ADVP":-> do adv <- cat "RB" - case unApp adv of - Just (f,[a]) | f == cidPositAdvAdj -> return (mkApp cidPositAdVAdj [a]) - _ -> mzero - `mplus` - do adV <- inside "RB" (lemma "AdV" "s") - return (mkApp adV []) - , "ADJP":-> do adas <- many pAdA - v <- inside "JJ" (lemma "V2" "s VPPart") - pps <- many (cat "PP") - let adj = mkApp cidPastPartAP [mkApp v []] - ap0 = foldr (\ada ap -> mkApp cidAdAP [ada,ap]) adj adas - ap = foldr (\pp ap -> mkApp cidAdvAP [ap,pp]) ap0 pps - return ap - `mplus` - do adas0 <- many pAdA - adjs <- many1 (cat "JJ") - let adj = last adjs - adas = adas0 ++ [mkApp cidPositAdAAdj [adj] | adj <- init adjs] - ap = foldr (\ada ap -> mkApp cidAdAP [ada,ap]) (mkApp cidPositA [adj]) adas - return ap - , "S" :-> do advs <- many $ do pp <- cat "PP" - inside "," word - return pp - `mplus` - do cat "ADVP" - e0 <- do (tmp,pol,sl,e) <- pClSlash - guard (not sl) - return (mkApp cidUseCl [tmp,pol,e]) - `mplus` - do s <- cat "S" - inside "," word - np <- cat "NP" - inside "VP" $ do - (t,v) <- pV "VS" - inside "SBAR" $ do - cat "-NONE-" - inside "S" $ do - cat "-NONE-" - return (mkApp cidUseCl [mkApp cidTTAnt [ mkApp (fromMaybe meta (isVTense t)) [] - , mkApp cidASimul [] - ] - ,mkApp cidPPos [] - ,mkApp cidComplPredVP [np,mkApp cidComplVS [mkApp v [],s]] - ]) - opt (inside "." word) "" - return (foldr (\ad e -> mkApp cidAdvS [ad, e]) e0 advs) - `mplus` - do s1 <- cat "S" - opt (inside "," word) "" - cc <- cat "CC" - s2 <- cat "S" - return (mkApp cidConjS [cc, mkApp cidBaseS [s1,s2]]) - , "SBAR" :-> do (do cat "-NONE-" -- missing preposition - return () - `mplus` - do w <- inside "IN" word - guard (w == "that")) - cat "S" - , "NP" :-> do (m_cc,list_np) <- pBaseNPs - case m_cc of - Just cc -> return (mkApp cidConjNP [cc, mkListNP list_np]) - Nothing -> if length list_np > 1 - then return (mkApp meta list_np) - else return (head list_np) - `mplus` - do np <- cat "NP" - rs <- inside "SBAR" $ - do rp <- cat "WHNP" - inside "S" $ - do (tmp,pol,sl,e) <- pClSlash - guard sl - return (mkApp cidUseRCl [tmp,pol,mkApp cidRelSlash [rp,e]]) - `mplus` - do inside "NP" (cat "-NONE-") - (tmp,pol,sl,vp) <- inside "VP" pVP - guard (not sl) - return (mkApp cidUseRCl [fromMaybe (mkApp meta []) (isVTense tmp) - ,mkApp pol [] - ,mkApp cidRelVP [rp,vp]]) - return (mkApp cidRelNP [np,rs]) - `mplus` - do (m_cc,list_np) <- pNPs - case m_cc of - Just cc -> return (mkApp cidConjNP [cc, mkListNP list_np]) - Nothing -> if length list_np > 1 - then return (mkApp meta list_np) - else return (head list_np) - , "VP" :-> do (_,_,_,e) <- pVP - return e - , "PP" :-> do prep <- do cat "IN" - `mplus` - do inside "TO" word - return (mkApp cidto_Prep []) - `mplus` - do w1 <- inside "JJ" word - w2 <- inside "IN" word - guard (w1 == "such" && w2 == "as") - return (mkApp cidsuch_as_Prep []) - np <- cat "NP" - return (mkApp cidPrepNP [prep,np]) - `mplus` - do pp1 <- cat "PP" - inside "," word - conj <- cat "CC" - pp2 <- cat "PP" - opt (inside "," word) "" - return (mkApp cidConjAdv [conj, mkApp cidBaseAdv [pp1,pp2]]) - , "CC" :-> do cc <- word - case cc of - "and" -> return (mkApp cidand_Conj []) - "&" -> return (mkApp cidamp_Conj []) - "or" -> return (mkApp cidor_Conj []) - _ -> mzero - , "DT" :-> do (dt,b) <- pDT - return dt - , "IN" :-> do prep <- lemma "Prep" "s" - return (mkApp prep []) - , "NN" :-> do transform (concatMap splitDashN) - (do n <- lemma "N" "s Sg Nom" - (do inside "-" word - n2 <- lemma "N" "s Sg Nom" - return (mkApp cidDashCN [mkApp n [], mkApp n2 []]) - `mplus` - do return (mkApp n []))) - `mplus` - do v <- lemma "V" "s VPresPart" - return (mkApp cidGerundN [mkApp v []]) - , "NNS" :-> do transform (concatMap splitDashN) - (do n <- lemma "N" "s Pl Nom" - return (mkApp n []) - `mplus` - do n1 <- lemma "N" "s Sg Nom" - inside "-" word - n2 <- lemma "N" "s Pl Nom" - return (mkApp cidDashCN [mkApp n1 [], mkApp n2 []])) - , "PRP" :-> do p <- (lemma "Pron" "s (NCase Nom)" - `mplus` - lemma "Pron" "s NPAcc" - `mplus` - (do w <- word - guard (w == "I") -- upper case word - return cidi_Pron)) - return (mkApp p []) - , "PRP$":-> do p <- lemma "Pron" "s (NCase Gen)" - return (mkApp cidPossPron [mkApp p []]) - , "RB" :-> do a <- lemma "A" "s AAdv" - return (mkApp cidPositAdvAdj [mkApp a []]) - `mplus` - do adv <- lemma "Adv" "s" - return (mkApp adv []) - , "QP" :-> do adn <- inside "IN" (lemma "AdN" "s") - num <- pCD - return (mkApp cidDetQuant [mkApp cidIndefArt [], mkApp cidNumCard [mkApp cidAdNum [mkApp adn [], num]]]) - , "WHNP":-> cat "WP" - `mplus` - cat "WDT" - `mplus` - cat "WP$" - `mplus` - do cat "-NONE-" - return (mkApp cidno_RP []) - `mplus` - do w <- inside "IN" word - guard (w == "that") - return (mkApp cidthat_RP []) - , "-NONE-" - :-> return (mkApp meta []) - , "JJ" :-> do a <- lemma "A" "s (AAdj Posit Nom)" - return (mkApp a []) - , "JJR" :-> do a <- lemma "A" "s (AAdj Compar Nom)" - return (mkApp a []) - , "JJS" :-> do a <- lemma "A" "s (AAdj Superl Nom)" - return (mkApp cidOrdSuperl [mkApp a []]) - , "VB" :-> do v <- mplus (lemma "V" "s VInf") (lemma "V2" "s VInf") - return (mkApp v []) - , "VBD" :-> do v <- mplus (lemma "V" "s VPast") (lemma "V2" "s VPast") - return (mkApp v []) - , "VBG" :-> do v <- mplus (lemma "V" "s VPresPart") (lemma "V2" "s VPresPart") - return (mkApp v []) - , "VBN" :-> do v <- mplus (lemma "V" "s VPPart") (lemma "V2" "s VPPart") - return (mkApp v []) - , "VBP" :-> do v <- mplus (lemma "V" "s VInf") (lemma "V2" "s VInf") - return (mkApp v []) - , "VBZ" :-> do v <- mplus (lemma "V" "s VPres") (lemma "V2" "s VPres") - return (mkApp v []) - , "PDT" :-> do pdt <- lemma "Predet" "s" - return (mkApp pdt []) - , "WP" :-> do rp <- (lemma "RP" "s (RC Masc (NCase Nom))" - `mplus` - lemma "RP" "s (RC Masc NPAcc)") - return (mkApp rp []) - , "WDT" :-> do rp <- lemma "RP" "s (RC Neutr (NCase Nom))" - return (mkApp rp []) - , "WP$" :-> do rp <- lemma "RP" "s (RC Masc (NCase Gen))" - return (mkApp rp []) - ] - -data VForm a - = VInf | VPart | VGerund | VTense a - -instance Functor VForm where - fmap f VInf = VInf - fmap f VPart = VPart - fmap f VGerund = VGerund - fmap f (VTense t) = VTense (f t) - -isVInf VInf = True -isVInf _ = False - -isVPart VPart = True -isVPart _ = False - -isVGerund VGerund = True -isVGerund _ = False - -isVTense (VTense t) = Just t -isVTense _ = Nothing - - -pVP = do - (t,a,p,sl,e0) <- do t <- pCopula - p <- pPol - inside "VP" $ do - advs <- many (cat "ADVP") - (t',p',sl,e0) <- pVP - guard (isVPart t' && sl && p' == cidPPos) - let e1 = mkApp cidPassVPSlash [e0] - e2 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e1 advs - return (t,cidASimul,p,False,e2) - `mplus` - do t <- pCopula - p <- pPol - advs <- many (cat "ADVP") - e <- do e <- cat "ADJP" - return (mkApp cidCompAP [e]) - `mplus` - do e <- cat "NP" - return (mkApp cidCompNP [e]) - `mplus` - do e <- cat "NP" - return (mkApp cidCompNP [e]) - `mplus` - do e <- cat "PP" - return (mkApp cidCompAdv [e]) - `mplus` - do e <- cat "SBAR" - return (mkApp cidCompS [e]) - `mplus` - do e <- inside "S" $ do - inside "NP" (cat "-NONE-") - (tmp,pol,sl,e) <- inside "VP" pVP - guard (isVInf tmp && not sl && pol == cidPPos) - return e - return (mkApp cidCompVP [e]) - let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) (mkApp cidUseComp [e]) advs - return (t,cidASimul,p,False,e1) - `mplus` - do t <- pCopula - p <- pPol - advs <- many (cat "ADVP") - (tmp,pol,sl,e) <- inside "VP" pVP - guard (isVGerund tmp && not sl && pol == cidPPos) - let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e advs - return (t,cidASimul,p,False,mkApp cidProgrVP [e1]) - `mplus` - do t <- pCopula - p <- pPol - adv <- cat "ADVP" - return (t,cidASimul,p,False,mkApp cidUseComp [mkApp cidCompAdv [adv]]) - `mplus` - do w <- inside "MD" word - t <- case w of - "will" -> return cidTFut - "would" -> return cidTCond - _ -> mzero - p <- pPol - advs <- many (cat "ADVP") - (tmp,pol,sl,e0) <- inside "VP" pVP - guard (isVInf tmp && pol == cidPPos) - let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs - return (VTense t,cidASimul,p,sl,e1) - `mplus` - do t <- pHave - p <- pPol - advs <- many (cat "ADVP") - (tmp,pol,sl,e0) <- inside "VP" pVP - guard (isVPart tmp && pol == cidPPos) - let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs - return (t,cidAAnter,p,sl,e1) - `mplus` - do t <- pDo - p <- pPol - advs <- many (cat "ADVP") - (tmp,p',sl,e0) <- inside "VP" $ pVP - guard (p' == cidPPos) - let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs - return (t,cidASimul,p,sl,e1) - `mplus` - do advs <- many (cat "ADVP") - inside "TO" word -- infinitives - e0 <- cat "VP" - let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs - return (VInf,cidASimul,cidPPos,False,e1) - `mplus` - do advs1 <- many (cat "ADVP") - (t,v) <- pV "V2" - pps <- many (cat "PP") - let e0 = mkApp cidSlashV2a [mkApp v []] - e1 = foldl (\e pp -> mkApp cidAdvVPSlash [e, pp]) e0 pps - (sl,e2) <- (do (inside "NP" (cat "-NONE-") - `mplus` - inside "SBAR" (cat "-NONE-")) - return (True,e1) - `mplus` - do np <- cat "NP" - return (False,mkApp cidComplSlash [e1, np])) - let e3 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e2 advs1 - return (t,cidASimul,cidPPos,sl,e3) - `mplus` - do (t,v) <- inside "MD" $ - (do v <- lemma "VV" "s (VVF VPres)" - return (cidTPres,v) - `mplus` - do v <- lemma "VV" "s (VVF VPast)" - return (cidTPast,v)) - p <- pPol - advs <- many (cat "ADVP") - vp <- cat "VP" - let e0 = mkApp cidComplVV [mkApp v [], vp] - e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs - return (VTense t,cidASimul,p,False,e1) - `mplus` - do advs <- many (cat "ADVP") - (t,v) <- pVV - vp <- inside "S" $ do - inside "NP" (cat "-NONE-") - (tmp,pol,sl,e) <- inside "VP" pVP - guard ((isVInf tmp || isVGerund tmp) && not sl && pol == cidPPos) - return e - let e0 = mkApp cidComplVV [mkApp v [], vp] - e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs - return (t,cidASimul,cidPPos,False,e1) - `mplus` - do advs <- many (cat "ADVP") - (t,v) <- pV "V2V" - inside "S" $ - (do inside "NP" (cat "-NONE-") - (tmp,pol,sl,vp) <- inside "VP" pVP - guard (isVInf tmp && not sl) - let e0 = mkApp cidSlashV2V [mkApp v [], mkApp pol [], vp] - e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs - return (t,cidASimul,cidPPos,True,e1) - `mplus` - do np <- cat "NP" - (tmp,pol,sl,vp) <- inside "VP" pVP - guard (isVInf tmp && not sl) - let e0 = mkApp cidComplSlash [mkApp cidSlashV2V [mkApp v [], mkApp pol [], vp], np] - e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs - return (t,cidASimul,cidPPos,False,e1)) - `mplus` - do advs <- many (cat "ADVP") - (t,v) <- pV "VA" - adjp <- cat "ADJP" - let e0 = mkApp cidComplVA [mkApp v [], adjp] - e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs - return (t,cidASimul,cidPPos,False,e1) - `mplus` - do advs <- many (cat "ADVP") - (t,v) <- pV "VS" - s <- cat "S" `mplus` cat "SBAR" - let e0 = mkApp cidComplVS [mkApp v [], s] - e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs - return (t,cidASimul,cidPPos,False,e1) - `mplus` - do advs <- many (cat "ADVP") - (t,v) <- pV "V" - let e0 = mkApp cidUseV [mkApp v []] - e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs - return (t,cidASimul,cidPPos,False,e1) - pps <- many (cat "PP" - `mplus` - inside "ADVP" (cat "RB")) - let tmp = fmap (\t -> mkApp cidTTAnt [mkApp t [],mkApp a []]) t - e1 = foldl (\e pp -> mkApp (if sl then cidAdvVPSlash else cidAdvVP) [e, pp]) e0 pps - return (tmp, p, sl, e1) - -pClSlash = do np <- cat "NP" - advs <- many (cat "ADVP") - (tmp,pol,sl,vp) <- do (tmp,pol,sl,vp) <- inside "VP" pVP - return (isVTense tmp,pol,sl,vp) - `mplus` - do vp <- cat "VP" - return (Nothing,meta,False,vp) - let vp1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) vp advs - return (fromMaybe (mkApp meta []) tmp - ,mkApp pol [] - ,sl - ,mkApp (if sl then cidSlashVP else cidPredVP) [np,vp1] - ) - -pV cat = - do v <- lookup "VB" "s VInf" - return (VInf,v) - `mplus` - do v <- lookup "VBP" "s VInf" - return (VTense cidTPres,v) - `mplus` - do v <- lookup "VBZ" "s VPres" - return (VTense cidTPres,v) - `mplus` - do v <- lookup "VBD" "s VPast" - return (VTense cidTPast,v) - `mplus` - do v <- lookup "VBN" "s VPPart" - return (VPart,v) - `mplus` - do v <- lookup "VBG" "s VPresPart" - return (VGerund,v) - where - lookup pos fld = - inside pos $ - (do lemma cat fld - `mplus` - do w <- word - return (mkCId ("["++w++"_"++cat++"]"))) - -pVV = - do v <- lookup "VB" "s (VVF VInf)" - return (VInf,v) - `mplus` - do v <- lookup "VBP" "s (VVF VInf)" - return (VTense cidTPres,v) - `mplus` - do v <- lookup "VBZ" "s (VVF VPres)" - return (VTense cidTPres,v) - `mplus` - do v <- lookup "VBD" "s (VVF VPast)" - return (VTense cidTPast,v) - `mplus` - do v <- lookup "VBN" "s (VVF VPPart)" - return (VPart,v) - `mplus` - do v <- lookup "VBG" "s (VVF VPresPart)" - return (VGerund,v) - where - lookup pos fld = - inside pos $ - (do lemma "VV" fld - `mplus` - do w <- word - return (mkCId ("["++w++"_VV]"))) - -pCopula = - do s <- inside "VB" word - guard (s == "be") - return VInf - `mplus` - do s <- inside "VBP" word - guard (s == "am" || s == "'m" || s == "are" || s == "'re") - return (VTense cidTPres) - `mplus` - do s <- inside "VBZ" word - guard (s == "is" || s == "'s") - return (VTense cidTPres) - `mplus` - do s <- inside "VBD" word - guard (s == "were" || s == "was") - return (VTense cidTPast) - `mplus` - do s <- inside "VBN" word - guard (s == "been") - return VPart - `mplus` - do s <- inside "VBG" word - guard (s == "being") - return VGerund - -pDo = - do s <- inside "VB" word - guard (s == "do") - return VInf - `mplus` - do s <- inside "VBP" word - guard (s == "do") - return (VTense cidTPres) - `mplus` - do s <- inside "VBZ" word - guard (s == "does") - return (VTense cidTPres) - `mplus` - do s <- inside "VBD" word - guard (s == "did") - return (VTense cidTPast) - -pHave = - do s <- inside "VB" word - guard (s == "have") - return VInf - `mplus` - do s <- inside "VBP" word - guard (s == "have") - return (VTense cidTPres) - `mplus` - do s <- inside "VBZ" word - guard (s == "has") - return (VTense cidTPres) - `mplus` - do s <- inside "VBD" word - guard (s == "had") - return (VTense cidTPast) - `mplus` - do s <- inside "VBN" word - guard (s == "had") - return VPart - -pPol = - do w <- inside "RB" word - guard (w == "n't" || w == "not") - return cidPNeg - `mplus` - do return cidPPos - -pBaseNP = - do np <- inside "NN" (lemma "NP" "s (NCase Nom)") - return (mkApp np []) - `mplus` - do m_pdt <- opt (liftM Just (cat "PDT")) Nothing - m_q <- opt (liftM Just pQuant) Nothing - m_num <- opt (liftM Just pCD ) Nothing - m_ord <- opt (liftM Just (cat "JJS")) Nothing - adjs <- many pModCN - ns <- many1 (mplus (cat "NN" >>= \n -> return (n,cidNumSg)) - (cat "NNS" >>= \n -> return (n,cidNumPl))) - let (n,s) = last ns - cn0 = foldr (\(n,s) e -> mkApp cidCompoundCN [mkApp s [], n, e]) - (mkApp cidUseN [n]) - (init ns) - cn = foldr (\adj e -> mkApp cidAdjCN [adj, e]) - cn0 - adjs - num = maybe (mkApp s []) (\n -> mkApp cidNumCard [n]) m_num - - mkDetQuant q num = - case m_ord of - Just ord -> mkApp cidDetQuantOrd [q,num,ord] - Nothing -> mkApp cidDetQuant [q,num] - - e0 <- if s == cidNumSg - then case m_q of - Just (q,True) -> return (mkApp cidDetCN [mkDetQuant q num,cn]) - Just (q,False) -> return (mkApp cidDetCN [q,cn]) - Nothing -> do guard (isNothing m_num) - return (mkApp cidMassNP [cn]) - else case m_q of - Just (q,True) -> return (mkApp cidDetCN [mkDetQuant q num,cn]) - Just (q,False) -> return (mkApp cidDetCN [q,cn]) - Nothing -> return (mkApp cidDetCN [mkDetQuant (mkApp cidIndefArt []) num,cn]) - let e1 = case m_pdt of - Just pdt -> mkApp cidPredetNP [pdt,e0] - Nothing -> e0 - return e1 - `mplus` - do dt <- cat "QP" - n <- mplus (cat "NN") (cat "NNS") - return (mkApp cidDetCN [dt,mkApp cidUseN [n]]) - `mplus` - do m_q <- opt (liftM Just pQuant) Nothing - ws2 <- many1 (inside "NNP" word `mplus` inside "NNPS" word) - let e0 = mkApp cidSymbPN - [mkApp cidMkSymb - [mkStr (unwords ws2)]] - case m_q of - Just (q,b) -> do guard b - return (mkApp cidUseQuantPN [q,e0]) - Nothing -> return (mkApp cidUsePN [e0]) - `mplus` - do p <- inside "PRP" (lemma "NP" "s (NCase Nom)") - return (mkApp p []) - `mplus` - do p <- cat "PRP" - return (mkApp cidUsePron [p]) - `mplus` - do np <- cat "NP" - pps <- many1 (cat "PP") - prns <- many (cat "PRN") - let e0 = foldl (\e pp -> mkApp cidAdvNP [e, pp]) np pps - e1 = foldl (\e pn -> mkApp meta [e, pn]) e0 prns - return e1 - `mplus` - do np <- cat "NP" - inside "," word - (t',p',sl,vp) <- inside "VP" pVP - guard (isVPart t' && sl && p' == cidPPos) - inside "," word - return (mkApp meta [np, vp]) - `mplus` - do (q,b) <- pQuant - return (mkApp cidDetNP [if b - then mkApp cidDetQuant [mkApp cidIndefArt [],mkApp cidNumSg []] - else q]) - `mplus` - do n <- pCD - return (mkApp cidDetNP [mkApp cidDetQuant [mkApp cidIndefArt [],mkApp cidNumCard [n]]]) - -pBaseNPs = do - np <- pBaseNP - (do inside "," word - (m_cc,nps) <- pBaseNPs - return (m_cc ,np:nps) - `mplus` - do cc <- cat "CC" - np2 <- pBaseNP - return (Just cc,[np,np2]) - `mplus` - do return (Nothing,[np])) - -pNPs = do - (t1,t2) <- do w <- inside "DT" word - case map toLower w of - "both" -> return (mkApp cidand_Conj [],mkApp cidboth7and_DConj []) - "either" -> return (mkApp cidor_Conj [],mkApp cideither7or_DConj []) - _ -> mzero - `mplus` - do return (mkApp meta [],mkApp meta []) - (m_cc,nps) <- pList - return (fmap (toDConj t1 t2) m_cc,nps) - where - toDConj t1 t2 cc - | cc == t1 = t2 - | otherwise = cc - - pList = do - np <- cat "NP" - (do inside "," word - (m_cc,nps) <- pList - return (m_cc ,np:nps) - `mplus` - do cc <- cat "CC" - np2 <- cat "NP" - return (Just cc,[np,np2]) - `mplus` - do return (Nothing,[np])) - -mkListNP nps0 = - foldr (\np1 np2 -> mkApp cidConsNP [np1,np2]) (mkApp cidBaseNP nps2) nps1 - where - (nps1,nps2) = splitAt (length nps0-2) nps0 - -pModCN = - do v <- inside "VBN" (lemma "V2" "s VPPart") - return (mkApp cidPastPartAP [mkApp v []]) - `mplus` - do v <- inside "JJ" (lemma "V2" "s VPPart") - return (mkApp cidPastPartAP [mkApp v []]) - `mplus` - do v <- inside "JJ" (lemma "V" "s VPresPart") - return (mkApp cidGerundAP [mkApp v []]) - `mplus` - do a <- cat "JJ" - return (mkApp cidPositA [a]) - `mplus` - do a <- cat "ADJP" - return a - -pCD = - do w0 <- inside "CD" word - let w = filter (/=',') w0 - guard (not (null w) && all isDigit w) - let es = [mkApp (mkCId ("D_"++[d])) [] | d <- w] - e0 = foldr (\e1 e2 -> mkApp cidIIDig [e1,e2]) (mkApp cidIDig [last es]) (init es) - e1 = mkApp cidNumDigits [e0] - return e1 - `mplus` - do w <- inside "CD" word - e <- case map toLower w of - "one" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot01 []]]]]) - "two" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn2 []]]]]]) - "three" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn3 []]]]]]) - "four" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn4 []]]]]]) - "five" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn5 []]]]]]) - "six" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn6 []]]]]]) - "seven" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn7 []]]]]]) - "eight" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn8 []]]]]]) - "nine" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn9 []]]]]]) - _ -> mzero - return (mkApp cidNumNumeral [e]) - `mplus` - do cat "CD" - -pQuant = - inside "DT" pDT - `mplus` - do dt <- cat "PRP$" - return (dt,True) - `mplus` - do np <- inside "NP" $ do - np <- pBaseNP - inside "POS" word - return np - return (mkApp cidGenNP [np],True) - `mplus` - do dt <- pMany - return (dt,False) - -pDT = - do dt <- mplus (lemma "Quant" "s False Sg") - (lemma "Quant" "s False Pl") - return (mkApp dt [],True) - `mplus` - do dt <- lemma "Det" "s" - return (mkApp dt [],False) - -pMany = - do w <- inside "JJ" word - guard (map toLower w == "many") - return (mkApp cidmany_Det []) - -pAdA = do adv <- cat "RB" - case unApp adv of - Just (f,[a]) | f == cidPositAdvAdj - -> return (mkApp cidPositAdAAdj [a]) - _ -> mzero - `mplus` - do ada <- inside "RB" (lemma "AdA" "s") - return (mkApp ada []) - -splitDashN (Node w []) = - case break (=='-') w of - (w1,'-':w2) -> Node w1 [] : Node "-" [Node "-" []] : splitDashN (Node w2 []) - _ -> [Node w []] -splitDashN t = [t] - -meta = mkCId "?" diff --git a/treebanks/PennTreebank/Idents.hs b/treebanks/PennTreebank/Idents.hs new file mode 100644 index 000000000..b084d9214 --- /dev/null +++ b/treebanks/PennTreebank/Idents.hs @@ -0,0 +1,115 @@ +module Idents where + +import PGF + +cidASimul = mkCId "ASimul" +cidAAnter = mkCId "AAnter" +cidPositAdvAdj = mkCId "PositAdvAdj" +cidPositAdVAdj = mkCId "PositAdVAdj" +cidUseCl = mkCId "UseCl" +cidPredVP = mkCId "PredVP" +cidSlashVP = mkCId "SlashVP" +cidComplPredVP = mkCId "ComplPredVP" +cidAdjCN = mkCId "AdjCN" +cidUseN = mkCId "UseN" +cidDetQuant = mkCId "DetQuant" +cidDetQuantOrd = mkCId "DetQuantOrd" +cidNumSg = mkCId "NumSg" +cidNumPl = mkCId "NumPl" +cidDetCN = mkCId "DetCN" +cidIndefArt = mkCId "IndefArt" +cidUsePN = mkCId "UsePN" +cidUseQuantPN = mkCId "UseQuantPN" +cidSymbPN = mkCId "SymbPN" +cidMkSymb = mkCId "MkSymb" +cidUsePron = mkCId "UsePron" +cidConjNP = mkCId "ConjNP" +cidBaseNP = mkCId "BaseNP" +cidConsNP = mkCId "ConsNP" +cidConjCN = mkCId "ConjCN" +cidBaseCN = mkCId "BaseCN" +cidConsCN = mkCId "ConsCN" +cidConjAdv = mkCId "ConjAdv" +cidBaseAdv = mkCId "BaseAdv" +cidConsAdv = mkCId "ConsAdv" +cidBaseS = mkCId "BaseS" +cidConsS = mkCId "ConsS" +cidConjS = mkCId "ConjS" +cidMassNP = mkCId "MassNP" +cidAdvNP = mkCId "AdvNP" +cidTPres = mkCId "TPres" +cidTPast = mkCId "TPast" +cidTFut = mkCId "TFut" +cidTCond = mkCId "TCond" +cidTTAnt = mkCId "TTAnt" +cidPPos = mkCId "PPos" +cidPNeg = mkCId "PNeg" +cidComplSlash = mkCId "ComplSlash" +cidSlashV2a = mkCId "SlashV2a" +cidComplVS = mkCId "ComplVS" +cidComplVV = mkCId "ComplVV" +cidUseV = mkCId "UseV" +cidAdVVP = mkCId "AdVVP" +cidAdvVP = mkCId "AdvVP" +cidAdvVPSlash = mkCId "AdvVPSlash" +cidPrepNP = mkCId "PrepNP" +cidto_Prep = mkCId "to_Prep" +cidsuch_as_Prep= mkCId "such_as_Prep" +cidPastPartAP = mkCId "PastPartAP" +cidPassVPSlash = mkCId "PassVPSlash" +cidAdvS = mkCId "AdvS" +cidPositA = mkCId "PositA" +cidIDig = mkCId "IDig" +cidIIDig = mkCId "IIDig" +cidNumCard = mkCId "NumCard" +cidNumDigits = mkCId "NumDigits" +cidNumNumeral = mkCId "NumNumeral" +cidnum = mkCId "num" +cidpot2as3 = mkCId "pot2as3" +cidpot1as2 = mkCId "pot1as2" +cidpot0as1 = mkCId "pot0as1" +cidpot01 = mkCId "pot01" +cidpot0 = mkCId "pot0" +cidn2 = mkCId "n2" +cidn3 = mkCId "n3" +cidn4 = mkCId "n4" +cidn5 = mkCId "n5" +cidn6 = mkCId "n6" +cidn7 = mkCId "n7" +cidn8 = mkCId "n8" +cidn9 = mkCId "n9" +cidPossPron = mkCId "PossPron" +cidCompAP = mkCId "CompAP" +cidCompNP = mkCId "CompNP" +cidCompAdv = mkCId "CompAdv" +cidCompS = mkCId "CompS" +cidCompVP = mkCId "CompVP" +cidUseComp = mkCId "UseComp" +cidCompoundCN = mkCId "CompoundCN" +cidDashCN = mkCId "DashCN" +cidProgrVP = mkCId "ProgrVP" +cidGerundN = mkCId "GerundN" +cidGerundAP = mkCId "GerundAP" +cidGenNP = mkCId "GenNP" +cidPredetNP = mkCId "PredetNP" +cidDetNP = mkCId "DetNP" +cidAdAP = mkCId "AdAP" +cidAdvAP = mkCId "AdvAP" +cidPositAdAAdj = mkCId "PositAdAAdj" +cideither7or_DConj = mkCId "either7or_DConj" +cidboth7and_DConj = mkCId "both7and_DConj" +cidor_Conj = mkCId "or_Conj" +cidand_Conj = mkCId "and_Conj" +cidamp_Conj = mkCId "amp_Conj" +cidSlashV2V = mkCId "SlashV2V" +cidComplVA = mkCId "ComplVA" +cidAdNum = mkCId "AdNum" +cidi_Pron = mkCId "i_Pron" +cidOrdSuperl = mkCId "OrdSuperl" +cidno_RP = mkCId "no_RP" +cidthat_RP = mkCId "that_RP" +cidUseRCl = mkCId "UseRCl" +cidRelSlash = mkCId "RelSlash" +cidRelNP = mkCId "RelNP" +cidRelVP = mkCId "RelVP" +cidmany_Det = mkCId "many_Det" diff --git a/treebanks/PennTreebank/Monad.hs b/treebanks/PennTreebank/Monad.hs new file mode 100644 index 000000000..30fd1d7a0 --- /dev/null +++ b/treebanks/PennTreebank/Monad.hs @@ -0,0 +1,98 @@ +module Monad ( Rule(..), Grammar, grammar + , P, parse + , cat, word, lemma, inside, transform + , many, many1, opt + ) where + +import Data.Tree +import Data.Char +import qualified Data.Map as Map +import Control.Monad +import PGF hiding (Tree,parse) + +infix 1 :-> + + +data Rule t e = t :-> P t e e +type Grammar t e = t -> PGF -> Morpho -> [Tree t] -> e + +grammar :: (Ord t,Show t) => ([e] -> e) -> [Rule t e] -> Grammar t e +grammar def rules = gr + where + gr = \tag -> + case Map.lookup tag pmap of + Just f -> \pgf m ts -> case unP f gr pgf m ts of + Just (e,[]) -> e + _ -> case ts of + [Node w []] -> def [] + ts -> def [gr tag pgf m ts | Node tag ts <- ts] + Nothing -> \pgf m ts -> case ts of + [Node w []] -> def [] + ts -> def [gr tag pgf m ts | Node tag ts <- ts] + + pmap = Map.fromListWith mplus (map (\(t :-> r) -> (t,r)) rules) + + +newtype P t e a = P {unP :: Grammar t e -> PGF -> Morpho -> [Tree t] -> Maybe (a,[Tree t])} + +instance Monad (P t e) where + return x = P (\gr pgf m ts -> Just (x,ts)) + f >>= g = P (\gr pgf m ts -> case unP f gr pgf m ts of + Just (x,ts) -> unP (g x) gr pgf m ts + Nothing -> Nothing) + +instance MonadPlus (P t e) where + mzero = P (\gr pgf m ts -> Nothing) + mplus f g = P (\gr pgf m ts -> unP f gr pgf m ts `mplus` unP g gr pgf m ts) + + +parse :: Grammar t e -> PGF -> Morpho -> Tree t -> e +parse gr pgf morpho (Node tag ts) = gr tag pgf morpho ts + +cat :: Eq t => t -> P t e e +cat tag = P (\gr pgf morpho ts -> + case ts of + (Node tag1 ts1 : ts) | tag == tag1 -> Just (gr tag1 pgf morpho ts1,ts) + _ -> Nothing) + +word :: P t e t +word = P (\gr pgf morpho ts -> + case ts of + (Node w [] : ts) -> Just (w,ts) + _ -> Nothing) + +inside :: Eq t => t -> P t e a -> P t e a +inside tag f = P (\gr pgf morpho ts -> + case ts of + (Node tag1 ts1 : ts) | tag == tag1 -> case unP f gr pgf morpho ts1 of + Just (x,[]) -> Just (x,ts) + _ -> Nothing + _ -> Nothing) + +lemma :: String -> String -> P String e CId +lemma cat0 an0 = P (\gr pgf morpho ts -> + case ts of + (Node w [] : ts) -> case [lemma | (lemma, an1) <- lookupMorpho morpho (map toLower w) + , let cat1 = maybe "" (showType []) (functionType pgf lemma) + , cat0 == cat1 && an0 == an1] of + (id:_) -> Just (id,ts) + _ -> Nothing + _ -> Nothing) + +transform :: ([Tree t] -> [Tree t]) -> P t e () +transform f = P (\gr pgf morpho ts -> Just ((),f ts)) + +many :: P t e a -> P t e [a] +many f = do x <- f + xs <- many f + return (x:xs) + `mplus` + do return [] + +many1 :: P t e a -> P t e [a] +many1 f = do x <- f + xs <- many f + return (x:xs) + +opt :: P t e a -> a -> P t e a +opt f x = mplus f (return x) diff --git a/treebanks/PennTreebank/PennFormat.hs b/treebanks/PennTreebank/PennFormat.hs new file mode 100644 index 000000000..2aaf0a6b6 --- /dev/null +++ b/treebanks/PennTreebank/PennFormat.hs @@ -0,0 +1,38 @@ +module PennFormat(parseTreebank, showTree) where + +import Text.PrettyPrint +import Data.Tree +import Data.Char + +parseTreebank :: String -> [Tree String] +parseTreebank [] = [] +parseTreebank (c:cs) + | isSpace c = parseTreebank cs + | c == '(' = let (ts,cs1) = parseTrees cs + in ts ++ parseTreebank cs1 + +parseTrees [] = ([],[]) +parseTrees (c:cs) + | isSpace c = parseTrees cs + | c == ')' = ([],cs) + | c == '(' = let (w, cs1) = parseWord cs + (children,cs2) = parseTrees cs1 + (rest, cs3) = parseTrees cs2 + in (Node (normalize w) children : rest,cs3) + | otherwise = let (w, cs1) = parseWord (c:cs) + (rest, cs2) = parseTrees cs1 + in (Node w [] : rest,cs2) + +normalize tag = + let (tag0,mod) = break (=='-') tag + in if null tag0 + then tag + else tag0 + +parseWord = break (\c -> isSpace c || c == '(' || c == ')') + +printTree (Node w []) = text w +printTree (Node l children) = parens (text l <+> hsep (map printTree children)) + +showTree :: Tree String -> String +showTree = render . printTree diff --git a/treebanks/PennTreebank/training.hs b/treebanks/PennTreebank/training.hs new file mode 100644 index 000000000..433e5852c --- /dev/null +++ b/treebanks/PennTreebank/training.hs @@ -0,0 +1,125 @@ +import PGF +import qualified Data.Map as Map +import Data.Maybe +import Data.List + +main = do + pgf <- readPGF "ParseEngAbs.pgf" + ls <- fmap (filterExprs . zip [1..] . lines) $ readFile "log4.txt" + putStrLn "" + putStrLn ("trees: "++show (length ls)) + let stats = foldl' (collectStats pgf) + (initStats pgf) + [(n,fromMaybe (error l) (readExpr (toQ l)),Just (mkCId "Phr"),Nothing) | (n,l) <- ls] + + putStrLn ("coverage: "++show (coverage stats)) + + putStrLn ("Writing ParseEngAbs.probs...") + writeFile "ParseEngAbs.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- uprobs pgf stats]) + + putStrLn ("Writing ParseEngAbs2.probs...") + writeFile "ParseEngAbs2.probs" (unlines [show cat1 ++ "\t" ++ show cat2 ++ "\t" ++ show p | (cat1,cat2,p) <- mprobs pgf stats]) + + putStrLn ("Writing global.probs...") + writeFile "global.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- gprobs pgf stats]) + + putStrLn ("Writing categories.probs...") + writeFile "categories.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- cprobs pgf stats]) + where + toQ [] = [] + toQ ('[':cs) = let (xs,']':ys) = break (==']') cs + in toQ ('?' : ys) + toQ ('?':cs) = 'Q' : toQ cs + toQ (c:cs) = c : toQ cs + +filterExprs [] = [] +filterExprs ((n,l):ls) + | null l = filterExprs ls + | elem (head l) "+#*" = (n,drop 2 l) : filterExprs ls + | otherwise = filterExprs ls + +initStats pgf = + (Map.fromListWith (+) + ([(f,1) | f <- functions pgf] ++ + [(cat pgf f,1) | f <- functions pgf]) + ,Map.empty + ,0 + ) + +collectStats pgf (ustats,bstats,count) (n,e,mb_cat1,mb_cat2) = + case unApp e of + Just (f,args) -> let fcat2 = cat2 pgf f n e + fcat = fromMaybe (cat2 pgf f n e) mb_cat1 + cf = fromMaybe 0 (Map.lookup f ustats) + cc = fromMaybe 0 (Map.lookup fcat ustats) + in if isJust mb_cat1 && f /= mkCId "Q" && fcat /= fcat2 + then error (show n ++ ": " ++ showExpr [] e) + else + cf `seq` cc `seq` bstats `seq` count `seq` + foldl' (collectStats pgf) + (Map.insert f (cf+1) (Map.insert fcat (cc+1) ustats) + ,(if null args + then Map.insertWith (+) (fcat,wildCId) 1 + else id) + (maybe bstats (\cat2 -> Map.insertWith (+) (cat2,fcat) 1 bstats) mb_cat2) + ,count+1 + ) + (zipWith3 (\e mb_cat1 mb_cat2 -> (n,e,mb_cat1,mb_cat2)) args (argCats f) (repeat (Just fcat))) + Nothing -> case unStr e of + Just _ -> (ustats,bstats,count+1) + Nothing -> error ("collectStats ("++showExpr [] e++")") + where + argCats f = + case fmap unType (functionType pgf f) of + Just (arg_tys,_,_) -> let tyCat (_,_,ty) = let (_,cat,_) = unType ty in Just cat + in map tyCat arg_tys + Nothing -> repeat Nothing + +coverage (ustats,bstats,count) = + let c = fromMaybe 0 (Map.lookup (mkCId "Q") ustats) + in (fromIntegral (count - c) / fromIntegral count) * 100 + +uprobs pgf (ustats,bstats,count) = + [toProb f (cat pgf f) | f <- functions pgf] + where + toProb f cat = + let count = fromMaybe 0 (Map.lookup f ustats) + cat_mass = fromMaybe 0 (Map.lookup cat ustats) + in (f, fromIntegral count / fromIntegral cat_mass :: Double) + +mprobs pgf (ustats,bstats,count) = + concat [toProb cat | cat <- categories pgf] + where + toProb cat = + let mass = sum [count | ((cat1,cat2),count) <- Map.toList bstats, cat1==cat] + cat_count = fromMaybe 0 (Map.lookup cat ustats) + fun_count = sum [fromMaybe 0 (Map.lookup f ustats) | f <- functionsByCat pgf cat] + in (cat,mkCId "*",if cat_count == 0 then 0 else fromIntegral (cat_count - fun_count) / fromIntegral cat_count) : + [(cat1,cat2,fromIntegral count / fromIntegral mass) + | ((cat1,cat2),count) <- Map.toList bstats, cat1==cat] + +gprobs pgf (ustats,bstats,count) = + sortBy (\x y -> compare (snd y) (snd x)) [toProb f | f <- functions pgf] + where + toProb f = + let fcount = fromMaybe 0 (Map.lookup f ustats) + in (f, fromIntegral fcount / fromIntegral count :: Double) + +cprobs pgf (ustats,bstats,count) = + sortBy (\x y -> compare (snd y) (snd x)) [toProb c | c <- categories pgf] + where + mass = sum [fromMaybe 0 (Map.lookup c ustats) | c <- categories pgf] + + toProb c = + let fcount = fromMaybe 0 (Map.lookup c ustats) + in (c, fromIntegral fcount / fromIntegral mass :: Double) + +cat pgf f = + case fmap unType (functionType pgf f) of + Just (_,cat,_) -> cat + Nothing -> error ("Unknown function "++showCId f) + +cat2 pgf f n e = + case fmap unType (functionType pgf f) of + Just (_,cat,_) -> cat + Nothing -> error (show n ++ ": Unknown function "++showCId f++" in "++showExpr [] e) diff --git a/treebanks/PennTreebank/translate.hs b/treebanks/PennTreebank/translate.hs new file mode 100644 index 000000000..35a17e22a --- /dev/null +++ b/treebanks/PennTreebank/translate.hs @@ -0,0 +1,809 @@ +-- [1416,4467,4623,4871,4561,4303,3763,3137,2501,1857,1353,952,646,483,332,200,116,89,54,41,20,22,7,2,4,5,0,3,2,1,0,0,0,0,0,1] +-- average 5 + +import Monad +import Idents +import PennFormat + +import PGF hiding (Tree,parse) +import Control.Monad +import System.IO +import System.Process +import Data.Maybe +import Data.List +import Data.IORef +import Data.Char +import Data.Tree + +test = False + +main = do + pgf <- readPGF "ParseEngAbs.pgf" + let Just language = readLanguage "ParseEng" + morpho = buildMorpho pgf language + s <- readFile "wsj.02-21" + ref <- newIORef (0,0,0) + mapM_ (process pgf morpho ref) ((if test then take 40 else id) (parseTreebank s)) + where + process pgf morpho ref t = do + (cn,co,l) <- readIORef ref + let e = (flatten . parse penn pgf morpho . prune) t + (cn',co') = count (cn,co) e + l' = l+1 + writeIORef ref (cn',co',l') + hPutStrLn stdout (showExpr [] e) + when test $ do + writeFile ("tmp_tree.dot") (graphvizAbstractTree pgf (True,False) e) + rawSystem "dot" ["-Tpdf", "tmp_tree.dot", "-otrees/tree"++showAlign l'++".pdf"] + return () + hPutStrLn stderr (show ((fromIntegral cn' / fromIntegral co') * 100)) + + count (cn,co) e = cn `seq` co `seq` + case unApp e of + Just (f,es) -> if f == meta + then foldl' count (cn, co+1) es + else foldl' count (cn+1,co+1) es + Nothing -> (cn+1,co+1) + + + showAlign n = + replicate (5 - length s) '0' ++ s + where + s = show n + + prune (Node tag ts) + | tag == "S" + && not (null ts) + && last ts == Node "." [Node "." []] = Node tag (init ts) + | otherwise = Node tag ts + + flatten e = + case unApp e of + Just (f,es) | f == meta -> mkApp f (concatMap grab es) + | otherwise -> mkApp f (map flatten es) + Nothing -> e + + grab e = + case unApp e of + Just (f,es) | f == meta -> concatMap grab es + | otherwise -> [mkApp f (map flatten es)] + Nothing -> [] + + +penn :: Grammar String Expr +penn = + grammar (mkApp meta) + [ "ADVP":-> do adv <- cat "RB" + case unApp adv of + Just (f,[a]) | f == cidPositAdvAdj -> return (mkApp cidPositAdVAdj [a]) + _ -> mzero + `mplus` + do adV <- inside "RB" (lemma "AdV" "s") + return (mkApp adV []) + , "ADJP":-> do adas <- many pAdA + v <- inside "JJ" (lemma "V2" "s VPPart") + pps <- many (cat "PP") + let adj = mkApp cidPastPartAP [mkApp v []] + ap0 = foldr (\ada ap -> mkApp cidAdAP [ada,ap]) adj adas + ap = foldr (\pp ap -> mkApp cidAdvAP [ap,pp]) ap0 pps + return ap + `mplus` + do adas0 <- many pAdA + adjs <- many1 (cat "JJ") + let adj = last adjs + adas = adas0 ++ [mkApp cidPositAdAAdj [adj] | adj <- init adjs] + ap = foldr (\ada ap -> mkApp cidAdAP [ada,ap]) (mkApp cidPositA [adj]) adas + return ap + , "S" :-> do advs <- many $ do pp <- cat "PP" + inside "," word + return pp + `mplus` + do cat "ADVP" + e0 <- do (tmp,pol,sl,e) <- pClSlash + guard (not sl) + return (mkApp cidUseCl [tmp,pol,e]) + `mplus` + do s <- cat "S" + inside "," word + np <- cat "NP" + inside "VP" $ do + (t,v) <- pV "VS" + inside "SBAR" $ do + cat "-NONE-" + inside "S" $ do + cat "-NONE-" + return (mkApp cidUseCl [mkApp cidTTAnt [ mkApp (fromMaybe meta (isVTense t)) [] + , mkApp cidASimul [] + ] + ,mkApp cidPPos [] + ,mkApp cidComplPredVP [np,mkApp cidComplVS [mkApp v [],s]] + ]) + opt (inside "." word) "" + return (foldr (\ad e -> mkApp cidAdvS [ad, e]) e0 advs) + `mplus` + do s1 <- cat "S" + opt (inside "," word) "" + cc <- cat "CC" + s2 <- cat "S" + return (mkApp cidConjS [cc, mkApp cidBaseS [s1,s2]]) + , "SBAR" :-> do (do cat "-NONE-" -- missing preposition + return () + `mplus` + do w <- inside "IN" word + guard (w == "that")) + cat "S" + , "NP" :-> do (m_cc,list_np) <- pBaseNPs + case m_cc of + Just cc -> return (mkApp cidConjNP [cc, mkListNP list_np]) + Nothing -> if length list_np > 1 + then return (mkApp meta list_np) + else return (head list_np) + `mplus` + do np <- cat "NP" + rs <- inside "SBAR" $ + do rp <- cat "WHNP" + inside "S" $ + do (tmp,pol,sl,e) <- pClSlash + guard sl + return (mkApp cidUseRCl [tmp,pol,mkApp cidRelSlash [rp,e]]) + `mplus` + do inside "NP" (cat "-NONE-") + (tmp,pol,sl,vp) <- inside "VP" pVP + guard (not sl) + return (mkApp cidUseRCl [fromMaybe (mkApp meta []) (isVTense tmp) + ,mkApp pol [] + ,mkApp cidRelVP [rp,vp]]) + return (mkApp cidRelNP [np,rs]) + `mplus` + do (m_cc,list_np) <- pNPs + case m_cc of + Just cc -> return (mkApp cidConjNP [cc, mkListNP list_np]) + Nothing -> if length list_np > 1 + then return (mkApp meta list_np) + else return (head list_np) + , "VP" :-> do (_,_,_,e) <- pVP + return e + , "PP" :-> do prep <- do cat "IN" + `mplus` + do inside "TO" word + return (mkApp cidto_Prep []) + `mplus` + do w1 <- inside "JJ" word + w2 <- inside "IN" word + guard (w1 == "such" && w2 == "as") + return (mkApp cidsuch_as_Prep []) + np <- cat "NP" + return (mkApp cidPrepNP [prep,np]) + `mplus` + do pp1 <- cat "PP" + inside "," word + conj <- cat "CC" + pp2 <- cat "PP" + opt (inside "," word) "" + return (mkApp cidConjAdv [conj, mkApp cidBaseAdv [pp1,pp2]]) + , "CC" :-> do cc <- word + case cc of + "and" -> return (mkApp cidand_Conj []) + "&" -> return (mkApp cidamp_Conj []) + "or" -> return (mkApp cidor_Conj []) + _ -> mzero + , "DT" :-> do (dt,b) <- pDT + return dt + , "IN" :-> do prep <- lemma "Prep" "s" + return (mkApp prep []) + , "NN" :-> do transform (concatMap splitDashN) + (do n <- lemma "N" "s Sg Nom" + (do inside "-" word + n2 <- lemma "N" "s Sg Nom" + return (mkApp cidDashCN [mkApp n [], mkApp n2 []]) + `mplus` + do return (mkApp n []))) + `mplus` + do v <- lemma "V" "s VPresPart" + return (mkApp cidGerundN [mkApp v []]) + , "NNS" :-> do transform (concatMap splitDashN) + (do n <- lemma "N" "s Pl Nom" + return (mkApp n []) + `mplus` + do n1 <- lemma "N" "s Sg Nom" + inside "-" word + n2 <- lemma "N" "s Pl Nom" + return (mkApp cidDashCN [mkApp n1 [], mkApp n2 []])) + , "PRP" :-> do p <- (lemma "Pron" "s (NCase Nom)" + `mplus` + lemma "Pron" "s NPAcc" + `mplus` + (do w <- word + guard (w == "I") -- upper case word + return cidi_Pron)) + return (mkApp p []) + , "PRP$":-> do p <- lemma "Pron" "s (NCase Gen)" + return (mkApp cidPossPron [mkApp p []]) + , "RB" :-> do a <- lemma "A" "s AAdv" + return (mkApp cidPositAdvAdj [mkApp a []]) + `mplus` + do adv <- lemma "Adv" "s" + return (mkApp adv []) + , "QP" :-> do adn <- inside "IN" (lemma "AdN" "s") + num <- pCD + return (mkApp cidDetQuant [mkApp cidIndefArt [], mkApp cidNumCard [mkApp cidAdNum [mkApp adn [], num]]]) + , "WHNP":-> cat "WP" + `mplus` + cat "WDT" + `mplus` + cat "WP$" + `mplus` + do cat "-NONE-" + return (mkApp cidno_RP []) + `mplus` + do w <- inside "IN" word + guard (w == "that") + return (mkApp cidthat_RP []) + , "-NONE-" + :-> return (mkApp meta []) + , "JJ" :-> do a <- lemma "A" "s (AAdj Posit Nom)" + return (mkApp a []) + , "JJR" :-> do a <- lemma "A" "s (AAdj Compar Nom)" + return (mkApp a []) + , "JJS" :-> do a <- lemma "A" "s (AAdj Superl Nom)" + return (mkApp cidOrdSuperl [mkApp a []]) + , "VB" :-> do v <- mplus (lemma "V" "s VInf") (lemma "V2" "s VInf") + return (mkApp v []) + , "VBD" :-> do v <- mplus (lemma "V" "s VPast") (lemma "V2" "s VPast") + return (mkApp v []) + , "VBG" :-> do v <- mplus (lemma "V" "s VPresPart") (lemma "V2" "s VPresPart") + return (mkApp v []) + , "VBN" :-> do v <- mplus (lemma "V" "s VPPart") (lemma "V2" "s VPPart") + return (mkApp v []) + , "VBP" :-> do v <- mplus (lemma "V" "s VInf") (lemma "V2" "s VInf") + return (mkApp v []) + , "VBZ" :-> do v <- mplus (lemma "V" "s VPres") (lemma "V2" "s VPres") + return (mkApp v []) + , "PDT" :-> do pdt <- lemma "Predet" "s" + return (mkApp pdt []) + , "WP" :-> do rp <- (lemma "RP" "s (RC Masc (NCase Nom))" + `mplus` + lemma "RP" "s (RC Masc NPAcc)") + return (mkApp rp []) + , "WDT" :-> do rp <- lemma "RP" "s (RC Neutr (NCase Nom))" + return (mkApp rp []) + , "WP$" :-> do rp <- lemma "RP" "s (RC Masc (NCase Gen))" + return (mkApp rp []) + ] + +data VForm a + = VInf | VPart | VGerund | VTense a + +instance Functor VForm where + fmap f VInf = VInf + fmap f VPart = VPart + fmap f VGerund = VGerund + fmap f (VTense t) = VTense (f t) + +isVInf VInf = True +isVInf _ = False + +isVPart VPart = True +isVPart _ = False + +isVGerund VGerund = True +isVGerund _ = False + +isVTense (VTense t) = Just t +isVTense _ = Nothing + + +pVP = do + (t,a,p,sl,e0) <- do t <- pCopula + p <- pPol + inside "VP" $ do + advs <- many (cat "ADVP") + (t',p',sl,e0) <- pVP + guard (isVPart t' && sl && p' == cidPPos) + let e1 = mkApp cidPassVPSlash [e0] + e2 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e1 advs + return (t,cidASimul,p,False,e2) + `mplus` + do t <- pCopula + p <- pPol + advs <- many (cat "ADVP") + e <- do e <- cat "ADJP" + return (mkApp cidCompAP [e]) + `mplus` + do e <- cat "NP" + return (mkApp cidCompNP [e]) + `mplus` + do e <- cat "NP" + return (mkApp cidCompNP [e]) + `mplus` + do e <- cat "PP" + return (mkApp cidCompAdv [e]) + `mplus` + do e <- cat "SBAR" + return (mkApp cidCompS [e]) + `mplus` + do e <- inside "S" $ do + inside "NP" (cat "-NONE-") + (tmp,pol,sl,e) <- inside "VP" pVP + guard (isVInf tmp && not sl && pol == cidPPos) + return e + return (mkApp cidCompVP [e]) + let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) (mkApp cidUseComp [e]) advs + return (t,cidASimul,p,False,e1) + `mplus` + do t <- pCopula + p <- pPol + advs <- many (cat "ADVP") + (tmp,pol,sl,e) <- inside "VP" pVP + guard (isVGerund tmp && not sl && pol == cidPPos) + let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e advs + return (t,cidASimul,p,False,mkApp cidProgrVP [e1]) + `mplus` + do t <- pCopula + p <- pPol + adv <- cat "ADVP" + return (t,cidASimul,p,False,mkApp cidUseComp [mkApp cidCompAdv [adv]]) + `mplus` + do w <- inside "MD" word + t <- case w of + "will" -> return cidTFut + "would" -> return cidTCond + _ -> mzero + p <- pPol + advs <- many (cat "ADVP") + (tmp,pol,sl,e0) <- inside "VP" pVP + guard (isVInf tmp && pol == cidPPos) + let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs + return (VTense t,cidASimul,p,sl,e1) + `mplus` + do t <- pHave + p <- pPol + advs <- many (cat "ADVP") + (tmp,pol,sl,e0) <- inside "VP" pVP + guard (isVPart tmp && pol == cidPPos) + let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs + return (t,cidAAnter,p,sl,e1) + `mplus` + do t <- pDo + p <- pPol + advs <- many (cat "ADVP") + (tmp,p',sl,e0) <- inside "VP" $ pVP + guard (p' == cidPPos) + let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs + return (t,cidASimul,p,sl,e1) + `mplus` + do advs <- many (cat "ADVP") + inside "TO" word -- infinitives + e0 <- cat "VP" + let e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs + return (VInf,cidASimul,cidPPos,False,e1) + `mplus` + do advs1 <- many (cat "ADVP") + (t,v) <- pV "V2" + pps <- many (cat "PP") + let e0 = mkApp cidSlashV2a [mkApp v []] + e1 = foldl (\e pp -> mkApp cidAdvVPSlash [e, pp]) e0 pps + (sl,e2) <- (do (inside "NP" (cat "-NONE-") + `mplus` + inside "SBAR" (cat "-NONE-")) + return (True,e1) + `mplus` + do np <- cat "NP" + return (False,mkApp cidComplSlash [e1, np])) + let e3 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e2 advs1 + return (t,cidASimul,cidPPos,sl,e3) + `mplus` + do (t,v) <- inside "MD" $ + (do v <- lemma "VV" "s (VVF VPres)" + return (cidTPres,v) + `mplus` + do v <- lemma "VV" "s (VVF VPast)" + return (cidTPast,v)) + p <- pPol + advs <- many (cat "ADVP") + vp <- cat "VP" + let e0 = mkApp cidComplVV [mkApp v [], vp] + e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs + return (VTense t,cidASimul,p,False,e1) + `mplus` + do advs <- many (cat "ADVP") + (t,v) <- pVV + vp <- inside "S" $ do + inside "NP" (cat "-NONE-") + (tmp,pol,sl,e) <- inside "VP" pVP + guard ((isVInf tmp || isVGerund tmp) && not sl && pol == cidPPos) + return e + let e0 = mkApp cidComplVV [mkApp v [], vp] + e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs + return (t,cidASimul,cidPPos,False,e1) + `mplus` + do advs <- many (cat "ADVP") + (t,v) <- pV "V2V" + inside "S" $ + (do inside "NP" (cat "-NONE-") + (tmp,pol,sl,vp) <- inside "VP" pVP + guard (isVInf tmp && not sl) + let e0 = mkApp cidSlashV2V [mkApp v [], mkApp pol [], vp] + e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs + return (t,cidASimul,cidPPos,True,e1) + `mplus` + do np <- cat "NP" + (tmp,pol,sl,vp) <- inside "VP" pVP + guard (isVInf tmp && not sl) + let e0 = mkApp cidComplSlash [mkApp cidSlashV2V [mkApp v [], mkApp pol [], vp], np] + e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs + return (t,cidASimul,cidPPos,False,e1)) + `mplus` + do advs <- many (cat "ADVP") + (t,v) <- pV "VA" + adjp <- cat "ADJP" + let e0 = mkApp cidComplVA [mkApp v [], adjp] + e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs + return (t,cidASimul,cidPPos,False,e1) + `mplus` + do advs <- many (cat "ADVP") + (t,v) <- pV "VS" + s <- cat "S" `mplus` cat "SBAR" + let e0 = mkApp cidComplVS [mkApp v [], s] + e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs + return (t,cidASimul,cidPPos,False,e1) + `mplus` + do advs <- many (cat "ADVP") + (t,v) <- pV "V" + let e0 = mkApp cidUseV [mkApp v []] + e1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) e0 advs + return (t,cidASimul,cidPPos,False,e1) + pps <- many (cat "PP" + `mplus` + inside "ADVP" (cat "RB")) + let tmp = fmap (\t -> mkApp cidTTAnt [mkApp t [],mkApp a []]) t + e1 = foldl (\e pp -> mkApp (if sl then cidAdvVPSlash else cidAdvVP) [e, pp]) e0 pps + return (tmp, p, sl, e1) + +pClSlash = do np <- cat "NP" + advs <- many (cat "ADVP") + (tmp,pol,sl,vp) <- do (tmp,pol,sl,vp) <- inside "VP" pVP + return (isVTense tmp,pol,sl,vp) + `mplus` + do vp <- cat "VP" + return (Nothing,meta,False,vp) + let vp1 = foldr (\pp e -> mkApp cidAdVVP [pp, e]) vp advs + return (fromMaybe (mkApp meta []) tmp + ,mkApp pol [] + ,sl + ,mkApp (if sl then cidSlashVP else cidPredVP) [np,vp1] + ) + +pV cat = + do v <- lookup "VB" "s VInf" + return (VInf,v) + `mplus` + do v <- lookup "VBP" "s VInf" + return (VTense cidTPres,v) + `mplus` + do v <- lookup "VBZ" "s VPres" + return (VTense cidTPres,v) + `mplus` + do v <- lookup "VBD" "s VPast" + return (VTense cidTPast,v) + `mplus` + do v <- lookup "VBN" "s VPPart" + return (VPart,v) + `mplus` + do v <- lookup "VBG" "s VPresPart" + return (VGerund,v) + where + lookup pos fld = + inside pos $ + (do lemma cat fld + `mplus` + do w <- word + return (mkCId ("["++w++"_"++cat++"]"))) + +pVV = + do v <- lookup "VB" "s (VVF VInf)" + return (VInf,v) + `mplus` + do v <- lookup "VBP" "s (VVF VInf)" + return (VTense cidTPres,v) + `mplus` + do v <- lookup "VBZ" "s (VVF VPres)" + return (VTense cidTPres,v) + `mplus` + do v <- lookup "VBD" "s (VVF VPast)" + return (VTense cidTPast,v) + `mplus` + do v <- lookup "VBN" "s (VVF VPPart)" + return (VPart,v) + `mplus` + do v <- lookup "VBG" "s (VVF VPresPart)" + return (VGerund,v) + where + lookup pos fld = + inside pos $ + (do lemma "VV" fld + `mplus` + do w <- word + return (mkCId ("["++w++"_VV]"))) + +pCopula = + do s <- inside "VB" word + guard (s == "be") + return VInf + `mplus` + do s <- inside "VBP" word + guard (s == "am" || s == "'m" || s == "are" || s == "'re") + return (VTense cidTPres) + `mplus` + do s <- inside "VBZ" word + guard (s == "is" || s == "'s") + return (VTense cidTPres) + `mplus` + do s <- inside "VBD" word + guard (s == "were" || s == "was") + return (VTense cidTPast) + `mplus` + do s <- inside "VBN" word + guard (s == "been") + return VPart + `mplus` + do s <- inside "VBG" word + guard (s == "being") + return VGerund + +pDo = + do s <- inside "VB" word + guard (s == "do") + return VInf + `mplus` + do s <- inside "VBP" word + guard (s == "do") + return (VTense cidTPres) + `mplus` + do s <- inside "VBZ" word + guard (s == "does") + return (VTense cidTPres) + `mplus` + do s <- inside "VBD" word + guard (s == "did") + return (VTense cidTPast) + +pHave = + do s <- inside "VB" word + guard (s == "have") + return VInf + `mplus` + do s <- inside "VBP" word + guard (s == "have") + return (VTense cidTPres) + `mplus` + do s <- inside "VBZ" word + guard (s == "has") + return (VTense cidTPres) + `mplus` + do s <- inside "VBD" word + guard (s == "had") + return (VTense cidTPast) + `mplus` + do s <- inside "VBN" word + guard (s == "had") + return VPart + +pPol = + do w <- inside "RB" word + guard (w == "n't" || w == "not") + return cidPNeg + `mplus` + do return cidPPos + +pBaseNP = + do np <- inside "NN" (lemma "NP" "s (NCase Nom)") + return (mkApp np []) + `mplus` + do m_pdt <- opt (liftM Just (cat "PDT")) Nothing + m_q <- opt (liftM Just pQuant) Nothing + m_num <- opt (liftM Just pCD ) Nothing + m_ord <- opt (liftM Just (cat "JJS")) Nothing + adjs <- many pModCN + ns <- many1 (mplus (cat "NN" >>= \n -> return (n,cidNumSg)) + (cat "NNS" >>= \n -> return (n,cidNumPl))) + let (n,s) = last ns + cn0 = foldr (\(n,s) e -> mkApp cidCompoundCN [mkApp s [], n, e]) + (mkApp cidUseN [n]) + (init ns) + cn = foldr (\adj e -> mkApp cidAdjCN [adj, e]) + cn0 + adjs + num = maybe (mkApp s []) (\n -> mkApp cidNumCard [n]) m_num + + mkDetQuant q num = + case m_ord of + Just ord -> mkApp cidDetQuantOrd [q,num,ord] + Nothing -> mkApp cidDetQuant [q,num] + + e0 <- if s == cidNumSg + then case m_q of + Just (q,True) -> return (mkApp cidDetCN [mkDetQuant q num,cn]) + Just (q,False) -> return (mkApp cidDetCN [q,cn]) + Nothing -> do guard (isNothing m_num) + return (mkApp cidMassNP [cn]) + else case m_q of + Just (q,True) -> return (mkApp cidDetCN [mkDetQuant q num,cn]) + Just (q,False) -> return (mkApp cidDetCN [q,cn]) + Nothing -> return (mkApp cidDetCN [mkDetQuant (mkApp cidIndefArt []) num,cn]) + let e1 = case m_pdt of + Just pdt -> mkApp cidPredetNP [pdt,e0] + Nothing -> e0 + return e1 + `mplus` + do dt <- cat "QP" + n <- mplus (cat "NN") (cat "NNS") + return (mkApp cidDetCN [dt,mkApp cidUseN [n]]) + `mplus` + do m_q <- opt (liftM Just pQuant) Nothing + ws2 <- many1 (inside "NNP" word `mplus` inside "NNPS" word) + let e0 = mkApp cidSymbPN + [mkApp cidMkSymb + [mkStr (unwords ws2)]] + case m_q of + Just (q,b) -> do guard b + return (mkApp cidUseQuantPN [q,e0]) + Nothing -> return (mkApp cidUsePN [e0]) + `mplus` + do p <- inside "PRP" (lemma "NP" "s (NCase Nom)") + return (mkApp p []) + `mplus` + do p <- cat "PRP" + return (mkApp cidUsePron [p]) + `mplus` + do np <- cat "NP" + pps <- many1 (cat "PP") + prns <- many (cat "PRN") + let e0 = foldl (\e pp -> mkApp cidAdvNP [e, pp]) np pps + e1 = foldl (\e pn -> mkApp meta [e, pn]) e0 prns + return e1 + `mplus` + do np <- cat "NP" + inside "," word + (t',p',sl,vp) <- inside "VP" pVP + guard (isVPart t' && sl && p' == cidPPos) + inside "," word + return (mkApp meta [np, vp]) + `mplus` + do (q,b) <- pQuant + return (mkApp cidDetNP [if b + then mkApp cidDetQuant [mkApp cidIndefArt [],mkApp cidNumSg []] + else q]) + `mplus` + do n <- pCD + return (mkApp cidDetNP [mkApp cidDetQuant [mkApp cidIndefArt [],mkApp cidNumCard [n]]]) + +pBaseNPs = do + np <- pBaseNP + (do inside "," word + (m_cc,nps) <- pBaseNPs + return (m_cc ,np:nps) + `mplus` + do cc <- cat "CC" + np2 <- pBaseNP + return (Just cc,[np,np2]) + `mplus` + do return (Nothing,[np])) + +pNPs = do + (t1,t2) <- do w <- inside "DT" word + case map toLower w of + "both" -> return (mkApp cidand_Conj [],mkApp cidboth7and_DConj []) + "either" -> return (mkApp cidor_Conj [],mkApp cideither7or_DConj []) + _ -> mzero + `mplus` + do return (mkApp meta [],mkApp meta []) + (m_cc,nps) <- pList + return (fmap (toDConj t1 t2) m_cc,nps) + where + toDConj t1 t2 cc + | cc == t1 = t2 + | otherwise = cc + + pList = do + np <- cat "NP" + (do inside "," word + (m_cc,nps) <- pList + return (m_cc ,np:nps) + `mplus` + do cc <- cat "CC" + np2 <- cat "NP" + return (Just cc,[np,np2]) + `mplus` + do return (Nothing,[np])) + +mkListNP nps0 = + foldr (\np1 np2 -> mkApp cidConsNP [np1,np2]) (mkApp cidBaseNP nps2) nps1 + where + (nps1,nps2) = splitAt (length nps0-2) nps0 + +pModCN = + do v <- inside "VBN" (lemma "V2" "s VPPart") + return (mkApp cidPastPartAP [mkApp v []]) + `mplus` + do v <- inside "JJ" (lemma "V2" "s VPPart") + return (mkApp cidPastPartAP [mkApp v []]) + `mplus` + do v <- inside "JJ" (lemma "V" "s VPresPart") + return (mkApp cidGerundAP [mkApp v []]) + `mplus` + do a <- cat "JJ" + return (mkApp cidPositA [a]) + `mplus` + do a <- cat "ADJP" + return a + +pCD = + do w0 <- inside "CD" word + let w = filter (/=',') w0 + guard (not (null w) && all isDigit w) + let es = [mkApp (mkCId ("D_"++[d])) [] | d <- w] + e0 = foldr (\e1 e2 -> mkApp cidIIDig [e1,e2]) (mkApp cidIDig [last es]) (init es) + e1 = mkApp cidNumDigits [e0] + return e1 + `mplus` + do w <- inside "CD" word + e <- case map toLower w of + "one" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot01 []]]]]) + "two" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn2 []]]]]]) + "three" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn3 []]]]]]) + "four" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn4 []]]]]]) + "five" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn5 []]]]]]) + "six" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn6 []]]]]]) + "seven" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn7 []]]]]]) + "eight" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn8 []]]]]]) + "nine" -> return (mkApp cidnum [mkApp cidpot2as3 [mkApp cidpot1as2 [mkApp cidpot0as1 [mkApp cidpot0 [mkApp cidn9 []]]]]]) + _ -> mzero + return (mkApp cidNumNumeral [e]) + `mplus` + do cat "CD" + +pQuant = + inside "DT" pDT + `mplus` + do dt <- cat "PRP$" + return (dt,True) + `mplus` + do np <- inside "NP" $ do + np <- pBaseNP + inside "POS" word + return np + return (mkApp cidGenNP [np],True) + `mplus` + do dt <- pMany + return (dt,False) + +pDT = + do dt <- mplus (lemma "Quant" "s False Sg") + (lemma "Quant" "s False Pl") + return (mkApp dt [],True) + `mplus` + do dt <- lemma "Det" "s" + return (mkApp dt [],False) + +pMany = + do w <- inside "JJ" word + guard (map toLower w == "many") + return (mkApp cidmany_Det []) + +pAdA = do adv <- cat "RB" + case unApp adv of + Just (f,[a]) | f == cidPositAdvAdj + -> return (mkApp cidPositAdAAdj [a]) + _ -> mzero + `mplus` + do ada <- inside "RB" (lemma "AdA" "s") + return (mkApp ada []) + +splitDashN (Node w []) = + case break (=='-') w of + (w1,'-':w2) -> Node w1 [] : Node "-" [Node "-" []] : splitDashN (Node w2 []) + _ -> [Node w []] +splitDashN t = [t] + +meta = mkCId "?" -- cgit v1.2.3