summaryrefslogtreecommitdiff
path: root/treebanks
diff options
context:
space:
mode:
author kr.angelov <kr.angelov@gmail.com> 2013-03-28 09:15:38 +0000
committer kr.angelov <kr.angelov@gmail.com> 2013-03-28 09:15:38 +0000
commit 74a16273b99632caf94912a4a4510d17cc88abac (patch)
tree e0cd911cad7e17575986b857a422d320126f166c /treebanks
parent 17fc938c20db58e65b005320db39c91f210fabac (diff)
added simple script for estimating the coverage on the PennTreebank
Diffstat (limited to 'treebanks')
-rw-r--r-- treebanks/PennTreebank/coverage.hs 45
1 files changed, 45 insertions, 0 deletions
diff --git a/treebanks/PennTreebank/coverage.hs b/treebanks/PennTreebank/coverage.hs
new file mode 100644
index 000000000..c9c7a7fd5
--- /dev/null
+++ b/treebanks/PennTreebank/coverage.hs
@@ -0,0 +1,45 @@
+import Data.List (foldl')
+import Data.Maybe
+
+import PGF
+
+-- Entry point: summarise the expressions logged in "log4.txt" and relate
+-- them to the sentences in "wsj.eng".
+--
+-- Prints, in order:
+--   1. the number of expression lines kept by 'filterExprs';
+--   2. the percentage c / (c+m1+m2) of the node tallies from 'counts';
+--   3. the percentage (c+m2) / (c+m1+m2);
+--   4. via 'meta_dist', how many lines contain 0, 1, ..., 27 '?' characters;
+--   5. via 'meta_dist', the same distribution for "(?" character pairs;
+--   6. the average of words-per-'?' over all (sentence, expression) pairs;
+--   7. the average of words-per-"(?" over all (sentence, expression) pairs.
+--
+-- A line that 'readExpr' cannot parse aborts the run with that line as the
+-- error message.
+--
+-- NOTE(review): sentences and expressions are paired by position and the
+-- pairing stops at the shorter input, so the two files are presumably
+-- expected to be line-aligned -- confirm.
+main = do
+  ls <- fmap (filterExprs . lines) $ readFile "log4.txt"
+  let exprs       = [fromMaybe (error l) (readExpr (f l)) | l <- ls]
+      -- strict left fold: the tallies are updated as each expression is
+      -- consumed instead of building a chain of suspended 'counts' calls
+      (c, m1, m2) = foldl' counts (0, 0, 0) exprs
+      total       = c + m1 + m2
+      -- per-line counts, computed once and shared by the histograms and the
+      -- averages below
+      charMetas   = map metaChars ls
+      nodeMetas   = map metaNodes ls
+  print (length ls)
+  print ((c / total) * 100)
+  print (((c + m2) / total) * 100)
+  meta_dist (histogram charMetas)
+  meta_dist (histogram nodeMetas)
+  cs <- fmap (map (length . words) . lines) $ readFile "wsj.eng"
+  print (average (zipWith wordsPerMeta cs charMetas))
+  print (average (zipWith wordsPerMeta cs nodeMetas))
+  where
+    -- number of '?' characters in a raw (not yet rewritten by 'f') line
+    metaChars l = length (filter (== '?') l)
+
+    -- number of positions where '(' is immediately followed by '?'
+    metaNodes l = length (filter (== ('(', '?')) (zip l (drop 1 l)))
+
+    -- histogram ns !! k is the number of lines whose count equals k, for
+    -- k = 0..27; lines with a larger count are not represented
+    histogram ns = [length (filter (== k) ns) | k <- [0 .. 27]]
+
+    -- words per meta; a count of zero is clamped to one to avoid dividing
+    -- by zero
+    wordsPerMeta w n = fromIntegral w / fromIntegral (max n 1)
+
+-- Arithmetic mean of a list of fractional values.
+-- An empty list yields 0/0 (NaN for floating-point types).
+average xs = total / fromIntegral count
+  where
+    total = sum xs
+    count = length xs
+
+-- Keep only the lines whose first character is '+', '#' or '*', and strip
+-- the first two characters (the marker and the character after it) from
+-- each kept line. Empty lines and all other lines are discarded.
+filterExprs ls = [drop 2 l | l@(marker:_) <- ls, marker `elem` "+#*"]
+
+-- Rewrite a logged expression before it is handed to the parser:
+--   * every bracketed group "[...]" collapses to a single 'Q';
+--   * every '?' becomes 'Q';
+--   * all other characters are copied unchanged.
+--
+-- A '[' without a matching ']' is treated as a group that runs to the end
+-- of the line, so malformed input yields a string instead of an opaque
+-- pattern-match failure; the caller can then report the offending line if
+-- the result does not parse.
+f [] = []
+f ('[':cs) = 'Q' : f (drop 1 (dropWhile (/= ']') cs))  -- drop 1 skips the ']' when present
+f ('?':cs) = 'Q' : f cs
+f (c:cs)   = c : f cs
+
+-- Fold step that adds the nodes of one expression to the tallies
+-- (c, m1, m2):
+--   c  - applications of any function other than Q, plus string literals;
+--   m1 - applications of Q that have at least one argument;
+--   m2 - applications of Q without arguments.
+-- Arguments of an application are tallied recursively. Q is the identifier
+-- that 'f' substitutes for '?' and for bracketed groups.
+--
+-- The three accumulators are forced on entry so they never pile up as
+-- unevaluated additions. An expression that is neither an application nor
+-- a string literal aborts with an error showing the expression.
+counts (c,m1,m2) e = c `seq` m1 `seq` m2 `seq`
+  case unApp e of
+    Just (fun,es)
+      | fun == mkCId "Q" -> if null es
+                              then (c,m1,m2+1)
+                              else foldl' counts (c,m1+1,m2) es
+      | otherwise        -> foldl' counts (c+1,m1,m2) es
+    Nothing -> case unStr e of
+                 Just _  -> (c+1,m1,m2)
+                 Nothing -> error ("counts ("++show e++")")
+
+-- Print a histogram and its summary statistics.
+-- The element at index k of cs is read as the number of observations with
+-- value k. The list itself is printed first, followed by the pair
+-- (weighted mean, standard deviation) of the observed values.
+meta_dist cs = do
+  print cs
+  print (mean, deviation)
+  where
+    buckets   = zip [0..] cs
+    total     = fromIntegral (sum cs)
+    mean      = fromIntegral (sum [n * k | (n, k) <- buckets]) / total
+    variance  = sum [(fromIntegral n - mean) ^ 2 * fromIntegral k | (n, k) <- buckets] / total
+    deviation = sqrt variance