summaryrefslogtreecommitdiff
path: root/examples/PennTreebank/training.hs
diff options
context:
space:
mode:
authorkr.angelov <kr.angelov@gmail.com>2012-01-10 19:36:28 +0000
committerkr.angelov <kr.angelov@gmail.com>2012-01-10 19:36:28 +0000
commit796dd530eedee8a7ccd605aa956a087c77719ab6 (patch)
treeecdb1b4fe984887b18fa5c9ffc2411b2a794fed4 /examples/PennTreebank/training.hs
parent1732254a1b4f9a229e638cd4142604acc9003b70 (diff)
the translation script from the Penn format to GF RGL is now in examples/PennTreebank
Diffstat (limited to 'examples/PennTreebank/training.hs')
-rw-r--r--examples/PennTreebank/training.hs35
1 files changed, 35 insertions, 0 deletions
diff --git a/examples/PennTreebank/training.hs b/examples/PennTreebank/training.hs
new file mode 100644
index 000000000..080b11a4e
--- /dev/null
+++ b/examples/PennTreebank/training.hs
@@ -0,0 +1,35 @@
+import PGF
+import qualified Data.Map as Map
+import Data.Maybe
+import Data.List
+
+main = do
+ pgf <- readPGF "PennTreebank.pgf"
+ ls <- fmap lines $ readFile "log.txt"
+ let stats = foldl' collectStats Map.empty [e | l <- ls, Just e <- [readExpr (map toQ l)]]
+ mapM_ putStrLn [show f ++ "\t" ++ show p | (f,p) <- Map.toList (probs pgf stats), f /= mkCId "Q"]
+ where
+ toQ '?' = 'Q'
+ toQ c = c
+
+collectStats stats e =
+ case unApp e of
+ Just (f,args) -> let c = fromMaybe 0 (Map.lookup f stats)
+ in c `seq` foldl' collectStats (Map.insert f (c+1) stats) args
+ Nothing -> stats
+
+probs pgf stats =
+ Map.mapWithKey toProb stats
+ where
+ toProb f c
+ | f == mkCId "Q" = 1.0
+ | otherwise = let (_,cat,_) = case functionType pgf f of
+ Just ty -> unType ty
+ Nothing -> error ("unknown: "++show f)
+ cat_mass = fromMaybe 0 (Map.lookup cat mass)
+ in (fromIntegral c / fromIntegral cat_mass :: Double)
+
+ mass = Map.fromListWith (+)
+ [(cat,c) | f <- functions pgf,
+ let Just (_,cat,_) = fmap unType (functionType pgf f),
+ let c = fromMaybe 0 (Map.lookup f stats)]