summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorbringert <bringert@cs.chalmers.se>2006-12-15 16:09:58 +0000
committerbringert <bringert@cs.chalmers.se>2006-12-15 16:09:58 +0000
commit215bf61115a1b78c6466830c89574091459bebdb (patch)
tree0c54ecb6ae408f316fea227ca6e202930fab5929 /src
parent1e1401472fdc55ba8f208baa7f07e2a4a3cb906c (diff)
Towards smaller SRGs when lots of variants are used.
Diffstat (limited to 'src')
-rw-r--r--src/GF/Speech/FiniteState.hs35
-rw-r--r--src/GF/Speech/Graph.hs14
-rw-r--r--src/GF/Speech/PrJSGF.hs32
-rw-r--r--src/GF/Speech/PrRegExp.hs21
-rw-r--r--src/GF/Speech/PrSRGS.hs71
-rw-r--r--src/GF/Speech/RegExp.hs84
-rw-r--r--src/GF/Speech/SRG.hs23
-rw-r--r--src/GF/UseGrammar/Custom.hs4
8 files changed, 201 insertions, 83 deletions
diff --git a/src/GF/Speech/FiniteState.hs b/src/GF/Speech/FiniteState.hs
index 832fb81d4..7054784c8 100644
--- a/src/GF/Speech/FiniteState.hs
+++ b/src/GF/Speech/FiniteState.hs
@@ -14,11 +14,17 @@
module GF.Speech.FiniteState (FA(..), State, NFA, DFA,
startState, finalStates,
states, transitions,
+ isInternal,
newFA,
addFinalState,
newState, newStates,
newTransition, newTransitions,
+ insertTransitionWith, insertTransitionsWith,
mapStates, mapTransitions,
+ modifyTransitions,
+ nonLoopTransitionsTo, nonLoopTransitionsFrom,
+ loops,
+ removeState,
oneFinalState,
insertNFA,
onGraph,
@@ -41,6 +47,7 @@ import qualified GF.Visualization.Graphviz as Dot
type State = Int
+-- | Type parameters: node id type, state label type, edge label type
data FA n a b = FA !(Graph n a b) !n ![n]
type NFA a = FA State () (Maybe a)
@@ -82,18 +89,46 @@ newTransition f t l = onGraph (newEdge (f,t,l))
newTransitions :: [(n, n, b)] -> FA n a b -> FA n a b
newTransitions es = onGraph (newEdges es)
+insertTransitionWith :: Eq n =>
+ (b -> b -> b) -> (n, n, b) -> FA n a b -> FA n a b
+insertTransitionWith f t = onGraph (insertEdgeWith f t)
+
+insertTransitionsWith :: Eq n =>
+ (b -> b -> b) -> [(n, n, b)] -> FA n a b -> FA n a b
+insertTransitionsWith f ts fa =
+ foldl' (flip (insertTransitionWith f)) fa ts
+
mapStates :: (a -> c) -> FA n a b -> FA n c b
mapStates f = onGraph (nmap f)
mapTransitions :: (b -> c) -> FA n a b -> FA n a c
mapTransitions f = onGraph (emap f)
+modifyTransitions :: ([(n,n,b)] -> [(n,n,b)]) -> FA n a b -> FA n a b
+modifyTransitions f = onGraph (\ (Graph r ns es) -> Graph r ns (f es))
+
+removeState :: Ord n => n -> FA n a b -> FA n a b
+removeState n = onGraph (removeNode n)
+
minimize :: Ord a => NFA a -> DFA a
minimize = determinize . reverseNFA . dfa2nfa . determinize . reverseNFA
unusedNames :: FA n a b -> [n]
unusedNames (FA (Graph names _ _) _ _) = names
+-- | Gets all incoming transitions to a given state, excluding
+-- transtions from the state itself.
+nonLoopTransitionsTo :: Eq n => n -> FA n a b -> [(n,b)]
+nonLoopTransitionsTo s fa =
+ [(f,l) | (f,t,l) <- transitions fa, t == s && f /= s]
+
+nonLoopTransitionsFrom :: Eq n => n -> FA n a b -> [(n,b)]
+nonLoopTransitionsFrom s fa =
+ [(t,l) | (f,t,l) <- transitions fa, f == s && t /= s]
+
+loops :: Eq n => n -> FA n a b -> [b]
+loops s fa = [l | (f,t,l) <- transitions fa, f == s && t == s]
+
-- | Give new names to all nodes.
renameStates :: Ord x => [y] -- ^ Infinite supply of new names
-> FA x a b
diff --git a/src/GF/Speech/Graph.hs b/src/GF/Speech/Graph.hs
index c23c5e384..1a0ebe0c0 100644
--- a/src/GF/Speech/Graph.hs
+++ b/src/GF/Speech/Graph.hs
@@ -14,7 +14,8 @@
module GF.Speech.Graph ( Graph(..), Node, Edge, NodeInfo
, newGraph, nodes, edges
, nmap, emap, newNode, newNodes, newEdge, newEdges
- , removeNodes
+ , insertEdgeWith
+ , removeNode, removeNodes
, nodeInfo
, getIncoming, getOutgoing, getNodeLabel
, inDegree, outDegree
@@ -82,6 +83,17 @@ newEdges es g = foldl' (flip newEdge) g es
-- lazy version:
-- newEdges es' (Graph c ns es) = Graph c ns (es'++es)
+insertEdgeWith :: Eq n =>
+ (b -> b -> b) -> Edge n b -> Graph n a b -> Graph n a b
+insertEdgeWith f e@(x,y,l) (Graph c ns es) = Graph c ns (h es)
+ where h [] = [e]
+ h (e'@(x',y',l'):es') | x' == x && y' == y = (x',y', f l l'):es'
+ | otherwise = e':h es'
+
+-- | Remove a node and all edges to and from that node.
+removeNode :: Ord n => n -> Graph n a b -> Graph n a b
+removeNode n = removeNodes (Set.singleton n)
+
-- | Remove a set of nodes and all edges to and from those nodes.
removeNodes :: Ord n => Set n -> Graph n a b -> Graph n a b
removeNodes xs (Graph c ns es) = Graph c ns' es'
diff --git a/src/GF/Speech/PrJSGF.hs b/src/GF/Speech/PrJSGF.hs
index 6183b9826..f09d454d9 100644
--- a/src/GF/Speech/PrJSGF.hs
+++ b/src/GF/Speech/PrJSGF.hs
@@ -28,6 +28,7 @@ import GF.Infra.Print
import GF.Infra.Option
import GF.Probabilistic.Probabilistic (Probs)
import GF.Speech.SRG
+import GF.Speech.RegExp
jsgfPrinter :: Ident -- ^ Grammar name
-> String -- ^ Start category
@@ -48,20 +49,27 @@ prJSGF (SRG{grammarName=name,startCat=start,origStartCat=origStart,rules=rs})
. showString "public <MAIN> = " . prCat start . showChar ';' . nl . nl
prRule (SRGRule cat origCat rhs) =
comments [origCat] . nl
- . prCat cat . showString " = " . joinS " | " (map prAlt rhs) . nl
+ . prCat cat . showString " = " . joinS " | " (map prAlt (ebnfSRGAlts rhs)) . nl
-- FIXME: use the probability
- prAlt (SRGAlt mp _ rhs)
- | null rhs' = showString "<NULL>"
- | otherwise = wrap "(" (unwordsS (map prSymbol rhs')) ")"
- where rhs' = rmPunct rhs
- prSymbol (Cat (c,_)) = prCat c
- prSymbol (Tok t) = wrap "\"" (prtS t) "\""
- prCat c = showChar '<' . showString c . showChar '>'
+ prAlt (EBnfSRGAlt mp _ rhs) = prItem rhs
-rmPunct :: [Symbol c Token] -> [Symbol c Token]
-rmPunct [] = []
-rmPunct (Tok t:ss) | all isPunct (prt t) = rmPunct ss
-rmPunct (s:ss) = s : rmPunct ss
+prCat :: SRGCat -> ShowS
+prCat c = showChar '<' . showString c . showChar '>'
+
+prItem :: EBnfSRGItem -> ShowS
+prItem = f
+ where
+ f (REUnion []) = showString "<VOID>"
+ f (REUnion xs) = wrap "(" (joinS " | " (map f xs)) ")"
+ f (REConcat []) = showString "<NULL>"
+ f (REConcat xs) = wrap "(" (unwordsS (map f xs)) ")"
+ f (RERepeat x) = wrap "(" (f x) ")" . showString "*"
+ f (RESymbol s) = prSymbol s
+
+prSymbol :: Symbol SRGNT Token -> ShowS
+prSymbol (Cat (c,_)) = prCat c
+prSymbol (Tok t) | all isPunct (prt t) = id -- removes punctuation
+ | otherwise = wrap "\"" (prtS t) "\""
isPunct :: Char -> Bool
isPunct c = c `elem` "-_.;.,?!"
diff --git a/src/GF/Speech/PrRegExp.hs b/src/GF/Speech/PrRegExp.hs
new file mode 100644
index 000000000..c0aadab00
--- /dev/null
+++ b/src/GF/Speech/PrRegExp.hs
@@ -0,0 +1,21 @@
+----------------------------------------------------------------------
+-- |
+-- Module : PrSLF
+-- Maintainer : BB
+-- Stability : (stable)
+-- Portability : (portable)
+--
+-- This module prints a grammar as a regular expression.
+-----------------------------------------------------------------------------
+
+module GF.Speech.PrRegExp (regexpPrinter) where
+
+import GF.Conversion.Types
+import GF.Infra.Ident
+import GF.Speech.CFGToFiniteState
+import GF.Speech.RegExp
+
+
+regexpPrinter :: Ident -- ^ Grammar name
+ -> String -> CGrammar -> String
+regexpPrinter name start cfg = prRE $ dfa2re $ cfgToFA start cfg
diff --git a/src/GF/Speech/PrSRGS.hs b/src/GF/Speech/PrSRGS.hs
index 9f86c1468..63ca91034 100644
--- a/src/GF/Speech/PrSRGS.hs
+++ b/src/GF/Speech/PrSRGS.hs
@@ -15,6 +15,7 @@ module GF.Speech.PrSRGS (SISRFormat(..), srgsXmlPrinter) where
import GF.Data.Utilities
import GF.Data.XML
+import GF.Speech.RegExp
import GF.Speech.SISR as SISR
import GF.Speech.SRG
import GF.Infra.Ident
@@ -85,9 +86,12 @@ mkProd sisr (EBnfSRGAlt mp n@(Name f prs) rhs) = Tag "item" w (t ++ xs)
argInit (Constant f) = maybe "?" prIdent (forestName f)
mkItem :: Maybe SISRFormat -> EBnfSRGItem -> XML
-mkItem sisr (EBnfOneOf xs) = oneOf (map (mkItem sisr) xs)
-mkItem sisr (EBnfSeq xs) = Tag "item" [] (map (mkItem sisr) xs)
-mkItem sisr (EBnfSymbol s) = symItem sisr s
+mkItem sisr = f
+ where
+ f (REUnion xs) = oneOf (map f xs)
+ f (REConcat xs) = Tag "item" [] (map f xs)
+ f (RERepeat x) = Tag "item" [("repeat","0-")] [f x]
+ f (RESymbol s) = symItem sisr s
symItem :: Maybe SISRFormat -> Symbol SRGNT Token -> XML
symItem sisr (Cat (c,slots)) = Tag "item" [] ([Tag "ruleref" [("uri","#" ++ prCat c)] []]++t)
@@ -107,8 +111,7 @@ showToken :: Token -> String
showToken t = t
oneOf :: [XML] -> XML
-oneOf [x] = x
-oneOf xs = Tag "one-of" [] xs
+oneOf = Tag "one-of" []
grammar :: Maybe SISRFormat
-> String -- ^ root
@@ -130,61 +133,3 @@ optimizeSRGS = bottomUpXML f
where f (Tag "item" [] [x@(Tag "item" [] _)]) = x
f (Tag "one-of" [] [x]) = x
f x = x
-
-{-
-
---
--- * SRGS minimization
---
-
-minimizeRule :: XML -> XML
-minimizeRule (Tag "rule" attrs cs)
- = Tag "rule" attrs (map minimizeOneOf cs)
-
-minimizeOneOf :: XML -> XML
-minimizeOneOf (Tag "one-of" attrs cs)
- = Tag "item" [] (p++[Tag "one-of" attrs cs'])
- where
- (pref,cs') = factor cs
- p = if null pref then [] else [Tag "one-of" [] pref]
-minimizeOneOf x = x
-
-factor :: [XML] -> ([XML],[XML])
-factor xs = case f of
- Just (ps,xs') -> (map it ps, map it xs')
- Nothing -> ([],xs)
- where
- -- FIXME: maybe getting all the longest terminal prefixes
- -- is not optimal?
- f = cartesianFactor $ map (terminalPrefix . unIt) xs
- unIt (Tag "item" [] cs) = cs
- it cs = Tag "item" [] cs
-
-terminalPrefix :: [XML] -> ([XML],[XML])
-terminalPrefix cs = (terms, tags ++ cs'')
- where (tags,cs') = span isTag cs
- (terms,cs'') = span isTerminalItem cs'
-
-isTag :: XML -> Bool
-isTag (Tag t _ _) = t == "tag"
-isTag _ = False
-
-isTerminalItem :: XML -> Bool
-isTerminalItem (Tag "item" [] [Data _]) = True
-isTerminalItem _ = False
-
---
--- * Utilities
---
-
-allEqual :: Eq a => [a] -> Bool
-allEqual [] = True
-allEqual (x:xs) = all (x==) xs
-
-cartesianFactor :: (Ord a, Ord b) => [(a,b)] -> Maybe ([a],[b])
-cartesianFactor xs
- | not (null es) && allEqual es = Just (Map.keys m, Set.elems (head es))
- | otherwise = Nothing
- where m = Map.fromListWith Set.union [(x,Set.singleton y) | (x,y) <- xs]
- es = Map.elems m
--} \ No newline at end of file
diff --git a/src/GF/Speech/RegExp.hs b/src/GF/Speech/RegExp.hs
new file mode 100644
index 000000000..2f4c7cd48
--- /dev/null
+++ b/src/GF/Speech/RegExp.hs
@@ -0,0 +1,84 @@
+module GF.Speech.RegExp (RE(..), dfa2re, prRE) where
+
+import Data.List
+
+import GF.Data.Utilities
+import GF.Speech.FiniteState
+
+data RE a =
+ REUnion [RE a] -- ^ REUnion [] is null
+ | REConcat [RE a] -- ^ REConcat [] is epsilon
+ | RERepeat (RE a)
+ | RESymbol a
+ deriving (Eq,Show)
+
+
+dfa2re :: Show a => DFA a -> RE a
+dfa2re = finalRE . elimStates . modifyTransitions merge . addLoops
+ . oneFinalState () epsilonRE . mapTransitions RESymbol
+ where addLoops fa = newTransitions [(s,s,nullRE) | (s,_) <- states fa] fa
+ merge es = [(f,t,unionRE ls)
+ | ((f,t),ls) <- buildMultiMap [((f,t),l) | (f,t,l) <- es]]
+
+elimStates :: Show a => DFA (RE a) -> DFA (RE a)
+elimStates fa =
+ case [s | (s,_) <- states fa, isInternal fa s] of
+ [] -> fa
+ sE:_ -> elimStates $ insertTransitionsWith (\x y -> unionRE [x,y]) ts $ removeState sE fa
+ where sAs = nonLoopTransitionsTo sE fa
+ sBs = nonLoopTransitionsFrom sE fa
+ r2 = unionRE $ loops sE fa
+ ts = [(sA, sB, r r1 r3) | (sA,r1) <- sAs, (sB,r3) <- sBs]
+ r r1 r3 = concatRE [r1, repeatRE r2, r3]
+
+epsilonRE = REConcat []
+
+nullRE = REUnion []
+
+isNull (REUnion []) = True
+isNull _ = False
+
+isEpsilon (REConcat []) = True
+isEpsilon _ = False
+
+unionRE :: [RE a] -> RE a
+unionRE = unionOrId . concatMap toList
+ where
+ toList (REUnion xs) = xs
+ toList x = [x]
+ unionOrId [r] = r
+ unionOrId rs = REUnion rs
+
+concatRE :: [RE a] -> RE a
+concatRE xs | any isNull xs = nullRE
+ | otherwise = case concatMap toList xs of
+ [r] -> r
+ rs -> REConcat rs
+ where
+ toList (REConcat xs) = xs
+ toList x = [x]
+
+repeatRE :: RE a -> RE a
+repeatRE x | isNull x || isEpsilon x = epsilonRE
+ | otherwise = RERepeat x
+
+finalRE :: DFA (RE a) -> RE a
+finalRE fa = concatRE [repeatRE r1, r2,
+ repeatRE (unionRE [r3, concatRE [r4, repeatRE r1, r2]])]
+ where
+ s0 = startState fa
+ [sF] = finalStates fa
+ r1 = unionRE $ loops s0 fa
+ r2 = unionRE $ map snd $ nonLoopTransitionsTo sF fa
+ r3 = unionRE $ loops sF fa
+ r4 = unionRE $ map snd $ nonLoopTransitionsFrom sF fa
+
+-- Debugging
+
+prRE :: Show a => RE a -> String
+prRE (REUnion []) = "<NULL>"
+prRE (REUnion xs) = "(" ++ concat (intersperse " | " (map prRE xs)) ++ ")"
+prRE (REConcat xs) = "(" ++ unwords (map prRE xs) ++ ")"
+prRE (RERepeat x) = "(" ++ prRE x ++ ")*"
+prRE (RESymbol s) = show s
+
diff --git a/src/GF/Speech/SRG.hs b/src/GF/Speech/SRG.hs
index 9082fa1f4..b27c5ad56 100644
--- a/src/GF/Speech/SRG.hs
+++ b/src/GF/Speech/SRG.hs
@@ -23,7 +23,7 @@ module GF.Speech.SRG (SRG(..), SRGRule(..), SRGAlt(..),
makeSimpleSRG, makeSRG
, lookupFM_, prtS
, topDownFilter
- , EBnfSRGAlt(..), EBnfSRGItem(..)
+ , EBnfSRGAlt(..), EBnfSRGItem
, ebnfSRGAlts
) where
@@ -38,6 +38,8 @@ import GF.Conversion.Types
import GF.Infra.Print
import GF.Speech.TransformCFG
import GF.Speech.Relation
+import GF.Speech.FiniteState
+import GF.Speech.RegExp
import GF.Infra.Option
import GF.Probabilistic.Probabilistic (Probs)
@@ -173,18 +175,25 @@ allSRGCats SRG { rules = rs } = [c | SRGRule c _ _ <- rs]
data EBnfSRGAlt = EBnfSRGAlt (Maybe Double) Name EBnfSRGItem
deriving (Eq,Show)
-data EBnfSRGItem =
- EBnfOneOf [EBnfSRGItem]
- | EBnfSeq [EBnfSRGItem]
- | EBnfSymbol (Symbol SRGNT Token)
- deriving (Eq,Show)
+type EBnfSRGItem = RE (Symbol SRGNT Token)
+
ebnfSRGAlts :: [SRGAlt] -> [EBnfSRGAlt]
ebnfSRGAlts alts = [EBnfSRGAlt p n (ebnfSRGItem sss)
| ((p,n),sss) <- buildMultiMap [((p,n),ss) | SRGAlt p n ss <- alts]]
ebnfSRGItem :: [[Symbol SRGNT Token]] -> EBnfSRGItem
-ebnfSRGItem sss = EBnfOneOf (map (EBnfSeq . map EBnfSymbol) sss)
+ebnfSRGItem = dfa2re . mkSRGFA
+
+mkSRGFA :: [[Symbol SRGNT Token]] -> DFA (Symbol SRGNT Token)
+mkSRGFA = minimize . dfa2nfa . foldr addString (newFA ())
+
+addString :: [a] -> DFA a -> DFA a
+addString xs fa = addFinalState (last sts0) $ newTransitions ts fa'
+ where (fa',ss) = newStates (replicate (length xs) ()) fa
+ sts0 = startState fa : sts1
+ sts1 = map fst ss
+ ts = zip3 sts0 sts1 xs
--
-- * Utilities for building and printing SRGs
diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs
index 7e76688d9..92b95756a 100644
--- a/src/GF/UseGrammar/Custom.hs
+++ b/src/GF/UseGrammar/Custom.hs
@@ -64,6 +64,7 @@ import GF.Speech.PrJSGF (jsgfPrinter)
import qualified GF.Speech.PrSRGS as SRGS
import GF.Speech.PrSLF
import GF.Speech.PrFA (faGraphvizPrinter,regularPrinter,faCPrinter)
+import GF.Speech.PrRegExp (regexpPrinter)
import GF.Speech.GrammarToVoiceXML (grammar2vxml)
import GF.Data.Zipper
@@ -284,6 +285,9 @@ customGrammarPrinter =
,(strCI "fa_c", \opts s -> let start = getStartCatCF opts s
name = cncId s
in faCPrinter name start $ stateCFG s)
+ ,(strCI "regexp", \opts s -> let start = getStartCatCF opts s
+ name = cncId s
+ in regexpPrinter name start $ stateCFG s)
,(strCI "regular", \_ -> regularPrinter . stateCFG)
,(strCI "plbnf", \_ -> prLBNF True)
,(strCI "lbnf", \_ -> prLBNF False)