diff options
| author | kr.angelov <kr.angelov@gmail.com> | 2013-03-28 09:15:38 +0000 |
|---|---|---|
| committer | kr.angelov <kr.angelov@gmail.com> | 2013-03-28 09:15:38 +0000 |
| commit | 74a16273b99632caf94912a4a4510d17cc88abac (patch) | |
| tree | e0cd911cad7e17575986b857a422d320126f166c /treebanks | |
| parent | 17fc938c20db58e65b005320db39c91f210fabac (diff) | |
added simple script for estimating the coverage on the PennTreebank
Diffstat (limited to 'treebanks')
| -rw-r--r-- | treebanks/PennTreebank/coverage.hs | 45 |
1 files changed, 45 insertions, 0 deletions
-- treebanks/PennTreebank/coverage.hs
--
-- Simple script for estimating the coverage on the PennTreebank.
--
-- Reads the expressions logged in "log4.txt", counts how many nodes are
-- ordinary functions/strings versus metavariables, and prints:
--
--   1. the number of expressions read,
--   2. the percentage of nodes that are not metavariables,
--   3. the same percentage when argument-less metavariables are also
--      counted as covered,
--   4. two histograms (with mean and standard deviation) of the number of
--      metavariables per expression,
--   5. the average number of words per metavariable, using the word counts
--      of the lines of "wsj.eng".
--
-- NOTE(review): lines of "wsj.eng" are paired positionally with the
-- expressions kept from "log4.txt"; this presumably requires the two files
-- to be aligned one sentence per expression -- confirm against the tool
-- that produces the log.
import PGF
import Data.List (foldl')
import Data.Maybe

main :: IO ()
main = do
  ls <- fmap (filterExprs . lines) $ readFile "log4.txt"
  let toExpr l = fromMaybe (error ("cannot parse expression: " ++ l)) (readExpr (f l))
      -- Double is what the original numeric defaulting selected.
      (c,m1,m2) = foldl' counts (0,0,0) (map toExpr ls) :: (Double,Double,Double)
  print (length ls)
  print ((c / (c+m1+m2))*100)
  print (((c+m2) / (c+m1+m2))*100)
  -- Per-expression counts are computed once and shared by the histogram
  -- and by the words-per-metavariable averages below.
  let metaChars = map countMetaChars ls
      openMetas = map countOpenMetas ls
  meta_dist (metaHistogram metaChars)
  meta_dist (metaHistogram openMetas)
  cs <- fmap (map (length . words) . lines) $ readFile "wsj.eng"
  print (average (zipWith wordsPerMeta cs metaChars))
  print (average (zipWith wordsPerMeta cs openMetas))

-- | Largest metavariable count that gets its own histogram bucket.
-- Expressions with more metavariables than this are not represented in the
-- histograms passed to 'meta_dist'.
maxMetaCount :: Int
maxMetaCount = 27

-- | Number of '?' characters in a line.
countMetaChars :: String -> Int
countMetaChars = length . filter (== '?')

-- | Number of positions where '(' is immediately followed by '?'.
countOpenMetas :: String -> Int
countOpenMetas l = length (filter (== ('(','?')) (zip l (drop 1 l)))

-- | For each n in 0..'maxMetaCount', how many of the given counts equal n.
metaHistogram :: [Int] -> [Int]
metaHistogram ns = [length (filter (== n) ns) | n <- [0..maxMetaCount]]

-- | Words per metavariable; a count of zero metavariables is treated as one
-- so that the division is always defined.
wordsPerMeta :: Int -> Int -> Double
wordsPerMeta w n = fromIntegral w / fromIntegral (max n 1)

-- | Arithmetic mean. For an empty list this is 0/0 (NaN for Double).
average :: Fractional a => [a] -> a
average xs = sum xs / fromIntegral (length xs)

-- | Keep only the lines whose first character is '+', '#' or '*', and strip
-- the first two characters from each kept line. Empty lines are skipped.
filterExprs :: [String] -> [String]
filterExprs ls = [drop 2 l | l@(c:_) <- ls, c `elem` "+#*"]

-- | Rewrite a logged expression so that 'readExpr' can parse it:
-- every "[...]" group (up to the first ']') and every '?' becomes the
-- identifier Q; all other characters are copied unchanged.
--
-- Calls 'error' with a descriptive message when a '[' has no closing ']'.
f :: String -> String
f [] = []
f ('[':cs) =
  case break (== ']') cs of
    (_, ']':rest) -> 'Q' : f rest
    _             -> error ("f: unterminated '[' before: " ++ take 40 cs)
f ('?':cs) = 'Q' : f cs
f (c:cs)   = c : f cs

-- | Fold step accumulating (covered, metas with arguments, metas without
-- arguments) over every node of an expression:
--
--   * application of Q with arguments    -> second counter, then recurse,
--   * Q without arguments                -> third counter,
--   * any other function application     -> first counter, then recurse,
--   * string literal                     -> first counter.
--
-- Any other kind of expression is reported with 'error'.
counts :: (Num a, Num b, Num c) => (a, b, c) -> Expr -> (a, b, c)
counts (c,m1,m2) e = c `seq` m1 `seq` m2 `seq`  -- keep the counters evaluated
  case unApp e of
    Just (fn,args)
      | fn == mkCId "Q" -> if null args
                             then (c,m1,m2+1)
                             else foldl' counts (c,m1+1,m2) args
      | otherwise       -> foldl' counts (c+1,m1,m2) args
    Nothing -> case unStr e of
                 Just _  -> (c+1,m1,m2)
                 Nothing -> error ("counts ("++show e++")")

-- | Print a histogram (element i = number of items with value i), followed
-- by the pair (mean, standard deviation) of the distribution it describes.
-- When the histogram sums to zero both statistics are 0/0 (NaN).
meta_dist :: (Show a, Integral a) => [a] -> IO ()
meta_dist cs = do
  print cs
  let cnt = fromIntegral (sum cs) :: Double
      avg = fromIntegral (sum [n*c | (n,c) <- zip [0..] cs]) / cnt
      dev = sqrt (sum [((fromIntegral n-avg) ^ 2)*fromIntegral c | (n,c) <- zip [0..] cs] / cnt)
  print (avg,dev)
