summaryrefslogtreecommitdiff
path: root/src-3.0/GF/Speech/TransformCFG.hs
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
committeraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
commit055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch)
tree0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/Speech/TransformCFG.hs
parent915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff)
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
Diffstat (limited to 'src-3.0/GF/Speech/TransformCFG.hs')
-rw-r--r--src-3.0/GF/Speech/TransformCFG.hs378
1 files changed, 378 insertions, 0 deletions
diff --git a/src-3.0/GF/Speech/TransformCFG.hs b/src-3.0/GF/Speech/TransformCFG.hs
new file mode 100644
index 000000000..3d7ebd809
--- /dev/null
+++ b/src-3.0/GF/Speech/TransformCFG.hs
@@ -0,0 +1,378 @@
+----------------------------------------------------------------------
+-- |
+-- Module : TransformCFG
+-- Maintainer : BB
+-- Stability : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/11/01 20:09:04 $
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.24 $
+--
+-- This module does some useful transformations on CFGs.
+--
+-- peb thinks: most of this module should be moved to GF.Conversion...
+-----------------------------------------------------------------------------
+
+module GF.Speech.TransformCFG where
+
+import GF.Canon.CanonToGFCC (canon2gfcc)
+import qualified GF.GFCC.CId as C
+import GF.GFCC.Macros (lookType,catSkeleton)
+import GF.GFCC.DataGFCC (GFCC)
+import GF.Conversion.Types
+import GF.CF.PPrCF (prCFCat)
+import GF.Data.Utilities
+import GF.Formalism.CFG
+import GF.Formalism.Utilities (Symbol(..), mapSymbol, filterCats, symbol,
+ NameProfile(..), Profile(..), name2fun, forestName)
+import GF.Infra.Ident
+import GF.Infra.Option
+import GF.Infra.Print
+import GF.Speech.Relation
+import GF.Compile.ShellState (StateGrammar, stateCFG, stateGrammarST, startCatStateOpts, stateOptions)
+
+import Control.Monad
+import Control.Monad.State (State, get, put, evalState)
+import Data.Map (Map)
+import qualified Data.Map as Map
+import Data.List
+import Data.Maybe (fromMaybe)
+import Data.Monoid (mconcat)
+import Data.Set (Set)
+import qualified Data.Set as Set
+
+-- not very nice to replace the structured CFCat type with a simple string
+type CFRule_ = CFRule Cat_ CFTerm Token
+
+data CFTerm
+ = CFObj Fun [CFTerm] -- ^ an abstract syntax function with arguments
+ | CFAbs Int CFTerm -- ^ A lambda abstraction. The Int is the variable id.
+ | CFApp CFTerm CFTerm -- ^ Application
+ | CFRes Int -- ^ The result of the n:th (0-based) non-terminal
+ | CFVar Int -- ^ A lambda-bound variable
+ | CFMeta String -- ^ A metavariable
+ deriving (Eq,Ord,Show)
+
+type Cat_ = String
+type CFSymbol_ = Symbol Cat_ Token
+
+type CFRules = Map Cat_ (Set CFRule_)
+
+
+cfgToCFRules :: StateGrammar -> CFRules
+cfgToCFRules s =
+ groupProds [CFRule (catToString c) (map symb r) (nameToTerm n)
+ | CFRule c r n <- cfg]
+ where cfg = stateCFG s
+ symb = mapSymbol catToString id
+ catToString = prt
+ gfcc = stateGFCC s
+ nameToTerm (Name IW [Unify [n]]) = CFRes n
+ nameToTerm (Name f@(IC c) prs) =
+ CFObj f (zipWith profileToTerm args prs)
+ where (args,_) = catSkeleton $ lookType gfcc (C.CId c)
+ nameToTerm n = error $ "cfgToCFRules.nameToTerm" ++ show n
+ profileToTerm (C.CId t) (Unify []) = CFMeta t
+ profileToTerm _ (Unify xs) = CFRes (last xs) -- FIXME: unify
+ profileToTerm (C.CId t) (Constant f) = maybe (CFMeta t) (\x -> CFObj x []) (forestName f)
+
+getStartCat :: Options -> StateGrammar -> String
+getStartCat opts sgr = prCFCat (startCatStateOpts opts' sgr)
+ where opts' = addOptions opts (stateOptions sgr)
+
+getStartCatCF :: Options -> StateGrammar -> String
+getStartCatCF opts sgr = getStartCat opts sgr ++ "{}.s"
+
+stateGFCC :: StateGrammar -> GFCC
+stateGFCC = canon2gfcc noOptions . stateGrammarST
+
+-- * Grammar filtering
+
+-- | Removes all directly and indirectly cyclic productions.
+-- FIXME: this may be too aggressive, only one production
+-- needs to be removed to break a given cycle. But which
+-- one should we pick?
+-- FIXME: Does not (yet) remove productions which are cyclic
+-- because of empty productions.
+removeCycles :: CFRules -> CFRules
+removeCycles = groupProds . f . allRules
+ where f rs = filter (not . isCycle) rs
+ where alias = transitiveClosure $ mkRel [(c,c') | CFRule c [Cat c'] _ <- rs]
+ isCycle (CFRule c [Cat c'] _) = isRelatedTo alias c' c
+ isCycle _ = False
+
+-- | Better bottom-up filter that also removes categories which contain no finite
+-- strings.
+bottomUpFilter :: CFRules -> CFRules
+bottomUpFilter gr = fix grow Map.empty
+ where grow g = g `unionCFRules` filterCFRules (all (okSym g) . ruleRhs) gr
+ okSym g = symbol (`elem` allCats g) (const True)
+
+-- | Removes categories which are not reachable from the start category.
+topDownFilter :: Cat_ -> CFRules -> CFRules
+topDownFilter start rules = filterCFRulesCats (isRelatedTo uses start) rules
+ where
+ rhsCats = [ (lhsCat r, c') | r <- allRules rules, c' <- filterCats (ruleRhs r) ]
+ uses = reflexiveClosure_ (allCats rules) $ transitiveClosure $ mkRel rhsCats
+
+-- | Merges categories with identical right-hand-sides.
+-- FIXME: handle probabilities
+mergeIdentical :: CFRules -> CFRules
+mergeIdentical g = groupProds $ map subst $ allRules g
+ where
+ -- maps categories to their replacement
+ m = Map.fromList [(y,concat (intersperse "+" xs))
+ | (_,xs) <- buildMultiMap [(rulesKey rs,c) | (c,rs) <- Map.toList g], y <- xs]
+ -- build data to compare for each category: a set of name,rhs pairs
+ rulesKey = Set.map (\ (CFRule _ r n) -> (n,r))
+ subst (CFRule c r n) = CFRule (substCat c) (map (mapSymbol substCat id) r) n
+ substCat c = Map.findWithDefault (error $ "mergeIdentical: " ++ c) c m
+
+-- * Removing left recursion
+
+-- The LC_LR algorithm from
+-- http://research.microsoft.com/users/bobmoore/naacl2k-proc-rev.pdf
+removeLeftRecursion :: Cat_ -> CFRules -> CFRules
+removeLeftRecursion start gr
+ = groupProds $ concat [scheme1, scheme2, scheme3, scheme4]
+ where
+ scheme1 = [CFRule a [x,Cat a_x] n' |
+ a <- retainedLeftRecursive,
+ x <- properLeftCornersOf a,
+ not (isLeftRecursive x),
+ let a_x = mkCat (Cat a) x,
+ -- this is an extension of LC_LR to avoid generating
+ -- A-X categories for which there are no productions:
+ a_x `Set.member` newCats,
+ let n' = symbol (\_ -> CFApp (CFRes 1) (CFRes 0))
+ (\_ -> CFRes 0) x]
+ scheme2 = [CFRule a_x (beta++[Cat a_b]) n' |
+ a <- retainedLeftRecursive,
+ b@(Cat b') <- properLeftCornersOf a,
+ isLeftRecursive b,
+ CFRule _ (x:beta) n <- catRules gr b',
+ let a_x = mkCat (Cat a) x,
+ let a_b = mkCat (Cat a) b,
+ let i = length $ filterCats beta,
+ let n' = symbol (\_ -> CFAbs 1 (CFApp (CFRes i) (shiftTerm n)))
+ (\_ -> CFApp (CFRes i) n) x]
+ scheme3 = [CFRule a_x beta n' |
+ a <- retainedLeftRecursive,
+ x <- properLeftCornersOf a,
+ CFRule _ (x':beta) n <- catRules gr a,
+ x == x',
+ let a_x = mkCat (Cat a) x,
+ let n' = symbol (\_ -> CFAbs 1 (shiftTerm n))
+ (\_ -> n) x]
+ scheme4 = catSetRules gr $ Set.fromList $ filter (not . isLeftRecursive . Cat) cats
+
+ newCats = Set.fromList (map lhsCat (scheme2 ++ scheme3))
+
+ shiftTerm :: CFTerm -> CFTerm
+ shiftTerm (CFObj f ts) = CFObj f (map shiftTerm ts)
+ shiftTerm (CFRes 0) = CFVar 1
+ shiftTerm (CFRes n) = CFRes (n-1)
+ shiftTerm t = t
+ -- note: the rest don't occur in the original grammar
+
+ cats = allCats gr
+ rules = allRules gr
+
+ directLeftCorner = mkRel [(Cat c,t) | CFRule c (t:_) _ <- allRules gr]
+ leftCorner = reflexiveClosure_ (map Cat cats) $ transitiveClosure directLeftCorner
+ properLeftCorner = transitiveClosure directLeftCorner
+ properLeftCornersOf = Set.toList . allRelated properLeftCorner . Cat
+ isProperLeftCornerOf = flip (isRelatedTo properLeftCorner)
+
+ leftRecursive = reflexiveElements properLeftCorner
+ isLeftRecursive = (`Set.member` leftRecursive)
+
+ retained = start `Set.insert`
+ Set.fromList [a | r <- allRules (filterCFRulesCats (not . isLeftRecursive . Cat) gr),
+ Cat a <- ruleRhs r]
+ isRetained = (`Set.member` retained)
+
+ retainedLeftRecursive = filter (isLeftRecursive . Cat) $ Set.toList retained
+
+mkCat :: CFSymbol_ -> CFSymbol_ -> Cat_
+mkCat x y = showSymbol x ++ "-" ++ showSymbol y
+ where showSymbol = symbol id show
+
+{-
+
+-- Paull's algorithm, see
+-- http://research.microsoft.com/users/bobmoore/naacl2k-proc-rev.pdf
+removeLeftRecursion :: Cat_ -> CFRules -> CFRules
+removeLeftRecursion start rs = removeDirectLeftRecursions $ map handleProds rs
+ where
+ handleProds (c, r) = (c, concatMap handleProd r)
+ handleProd (CFRule ai (Cat aj:alpha) n) | aj < ai =
+ -- FIXME: for non-recursive categories, this changes
+ -- the grammar unneccessarily, maybe we can use mutRecCats
+ -- to make this less invasive
+ -- FIXME: this will give multiple rules with the same name,
+ -- which may mess up the probabilities.
+ [CFRule ai (beta ++ alpha) n | CFRule _ beta _ <- lookup' aj rs]
+ handleProd r = [r]
+
+removeDirectLeftRecursions :: CFRules -> CFRules
+removeDirectLeftRecursions = concat . flip evalState 0 . mapM removeDirectLeftRecursion
+
+removeDirectLeftRecursion :: (Cat_,[CFRule_]) -- ^ All productions for a category
+ -> State Int CFRules
+removeDirectLeftRecursion (a,rs)
+ | null dr = return [(a,rs)]
+ | otherwise =
+ do
+ a' <- fresh a
+ let as = maybeEndWithA' nr
+ is = [CFRule a' (tail r) n | CFRule _ r n <- dr]
+ a's = maybeEndWithA' is
+ -- the not null constraint here avoids creating new
+ -- left recursive (cyclic) rules.
+ maybeEndWithA' xs = xs ++ [CFRule c (r++[Cat a']) n | CFRule c r n <- xs,
+ not (null r)]
+ return [(a, as), (a', a's)]
+ where
+ (dr,nr) = partition isDirectLeftRecursive rs
+ fresh x = do { n <- get; put (n+1); return $ x ++ "-" ++ show n }
+
+isDirectLeftRecursive :: CFRule_ -> Bool
+isDirectLeftRecursive (CFRule c (Cat c':_) _) = c == c'
+isDirectLeftRecursive _ = False
+
+-}
+
+-- | Get the sets of mutually recursive non-terminals for a grammar.
+mutRecCats :: Bool -- ^ If true, all categories will be in some set.
+ -- If false, only recursive categories will be included.
+ -> CFRules -> [Set Cat_]
+mutRecCats incAll g = equivalenceClasses $ refl $ symmetricSubrelation $ transitiveClosure r
+ where r = mkRel [(c,c') | CFRule c ss _ <- allRules g, Cat c' <- ss]
+ refl = if incAll then reflexiveClosure_ (allCats g) else reflexiveSubrelation
+
+--
+-- * Approximate context-free grammars with regular grammars.
+--
+
+-- Use the transformation algorithm from \"Regular Approximation of Context-free
+-- Grammars through Approximation\", Mohri and Nederhof, 2000
+-- to create an over-generating regular frammar for a context-free
+-- grammar
+makeRegular :: CFRules -> CFRules
+makeRegular g = groupProds $ concatMap trSet (mutRecCats True g)
+ where trSet cs | allXLinear cs rs = rs
+ | otherwise = concatMap handleCat csl
+ where csl = Set.toList cs
+ rs = catSetRules g cs
+ handleCat c = [CFRule c' [] (mkCFTerm (c++"-empty"))] -- introduce A' -> e
+ ++ concatMap (makeRightLinearRules c) (catRules g c)
+ where c' = newCat c
+ makeRightLinearRules b' (CFRule c ss n) =
+ case ys of
+ [] -> newRule b' (xs ++ [Cat (newCat c)]) n -- no non-terminals left
+ (Cat b:zs) -> newRule b' (xs ++ [Cat b]) n
+ ++ makeRightLinearRules (newCat b) (CFRule c zs n)
+ where (xs,ys) = break (`catElem` cs) ss
+ -- don't add rules on the form A -> A
+ newRule c rhs n | rhs == [Cat c] = []
+ | otherwise = [CFRule c rhs n]
+ newCat c = c ++ "$"
+
+--
+-- * CFG rule utilities
+--
+
+-- | Group productions by their lhs categories
+groupProds :: [CFRule_] -> CFRules
+groupProds = Map.fromListWith Set.union . map (\r -> (lhsCat r,Set.singleton r))
+
+allRules :: CFRules -> [CFRule_]
+allRules = concat . map Set.toList . Map.elems
+
+allRulesGrouped :: CFRules -> [(Cat_,[CFRule_])]
+allRulesGrouped = Map.toList . Map.map Set.toList
+
+allCats :: CFRules -> [Cat_]
+allCats = Map.keys
+
+catRules :: CFRules -> Cat_ -> [CFRule_]
+catRules rs c = Set.toList $ Map.findWithDefault Set.empty c rs
+
+catSetRules :: CFRules -> Set Cat_ -> [CFRule_]
+catSetRules g cs = allRules $ Map.filterWithKey (\c _ -> c `Set.member` cs) g
+
+cleanCFRules :: CFRules -> CFRules
+cleanCFRules = Map.filter (not . Set.null)
+
+unionCFRules :: CFRules -> CFRules -> CFRules
+unionCFRules = Map.unionWith Set.union
+
+filterCFRules :: (CFRule_ -> Bool) -> CFRules -> CFRules
+filterCFRules p = cleanCFRules . Map.map (Set.filter p)
+
+filterCFRulesCats :: (Cat_ -> Bool) -> CFRules -> CFRules
+filterCFRulesCats p = Map.filterWithKey (\c _ -> p c)
+
+countCats :: CFRules -> Int
+countCats = Map.size . cleanCFRules
+
+countRules :: CFRules -> Int
+countRules = length . allRules
+
+lhsCat :: CFRule c n t -> c
+lhsCat (CFRule c _ _) = c
+
+ruleRhs :: CFRule c n t -> [Symbol c t]
+ruleRhs (CFRule _ ss _) = ss
+
+ruleFun :: CFRule_ -> Fun
+ruleFun (CFRule _ _ t) = f t
+ where f (CFObj n _) = n
+ f (CFApp _ x) = f x
+ f (CFAbs _ x) = f x
+ f _ = IC ""
+
+-- | Checks if a symbol is a non-terminal of one of the given categories.
+catElem :: Ord c => Symbol c t -> Set c -> Bool
+catElem s cs = symbol (`Set.member` cs) (const False) s
+
+-- | Check if any of the categories used on the right-hand side
+-- are in the given list of categories.
+anyUsedBy :: Eq c => [c] -> CFRule c n t -> Bool
+anyUsedBy cs (CFRule _ ss _) = any (`elem` cs) (filterCats ss)
+
+mkCFTerm :: String -> CFTerm
+mkCFTerm n = CFObj (IC n) []
+
+ruleIsNonRecursive :: Ord c => Set c -> CFRule c n t -> Bool
+ruleIsNonRecursive cs = noCatsInSet cs . ruleRhs
+
+noCatsInSet :: Ord c => Set c -> [Symbol c t] -> Bool
+noCatsInSet cs = not . any (`catElem` cs)
+
+-- | Check if all the rules are right-linear, or all the rules are
+-- left-linear, with respect to given categories.
+allXLinear :: Ord c => Set c -> [CFRule c n t] -> Bool
+allXLinear cs rs = all (isRightLinear cs) rs || all (isLeftLinear cs) rs
+
+-- | Checks if a context-free rule is right-linear.
+isRightLinear :: Ord c =>
+ Set c -- ^ The categories to consider
+ -> CFRule c n t -- ^ The rule to check for right-linearity
+ -> Bool
+isRightLinear cs = noCatsInSet cs . safeInit . ruleRhs
+
+-- | Checks if a context-free rule is left-linear.
+isLeftLinear :: Ord c =>
+ Set c -- ^ The categories to consider
+ -> CFRule c n t -- ^ The rule to check for left-linearity
+ -> Bool
+isLeftLinear cs = noCatsInSet cs . drop 1 . ruleRhs
+
+prCFRules :: CFRules -> String
+prCFRules = unlines . map prRule . allRules
+ where
+ prRule r = lhsCat r ++ " --> " ++ unwords (map prSym (ruleRhs r))
+ prSym = symbol id (\t -> "\""++ t ++"\"")