summaryrefslogtreecommitdiff
path: root/src/GF/Speech/SRG.hs
blob: 107d81e1067937b15f25faf11acb43fe25117614 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
----------------------------------------------------------------------
-- |
-- Module      : SRG
--
-- Representation of, conversion to, and utilities for 
-- printing of a general Speech Recognition Grammar. 
--
-- FIXME: remove \/ warn \/ fail if there are int \/ string literal
-- categories in the grammar
----------------------------------------------------------------------
module GF.Speech.SRG (SRG(..), SRGRule(..), SRGAlt(..), SRGItem, SRGSymbol
                     , SRGNT, CFTerm
                     , ebnfPrinter
                     , nonLeftRecursivePrinter
                     , regularPrinter
                     , makeSimpleSRG
                     , makeNonRecursiveSRG
                     , getSpeechLanguage
                     , isExternalCat
                     , lookupFM_, prtS
                     ) where

import GF.Data.Operations
import GF.Data.Utilities
import GF.Infra.Ident
import GF.Infra.PrintClass
import GF.Speech.CFG
import GF.Speech.PGFToCFG
import GF.Speech.Relation
import GF.Speech.FiniteState
import GF.Speech.RegExp
import GF.Speech.CFGToFA
import GF.Infra.Option
import PGF.CId
import PGF.Data
import PGF.Macros

import Data.List
import Data.Maybe (fromMaybe, maybeToList)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Set (Set)
import qualified Data.Set as Set

import Debug.Trace

data SRG = SRG { srgName :: String    -- ^ grammar name
		 , srgStartCat :: Cat     -- ^ start category name
                 , srgExternalCats :: Set Cat
                 , srgLanguage :: Maybe String -- ^ The language for which the grammar 
                                                   --   is intended, e.g. en-UK
	         , srgRules :: [SRGRule] 
	       }
	 deriving (Eq,Show)

data SRGRule = SRGRule Cat [SRGAlt]
	     deriving (Eq,Show)

-- | maybe a probability, a rule name and an EBNF right-hand side
data SRGAlt = SRGAlt (Maybe Double) CFTerm SRGItem
	      deriving (Eq,Show)

type SRGItem = RE SRGSymbol

type SRGSymbol = Symbol SRGNT Token

-- | An SRG non-terminal. Category name and its number in the profile.
type SRGNT = (Cat, Int)


ebnfPrinter :: Maybe SISRFormat -> PGF -> CId -> String
ebnfPrinter sisr pgf cnc = prSRG sisr $ makeSRG id pgf cnc

nonLeftRecursivePrinter :: Maybe SISRFormat -> PGF -> CId -> String
nonLeftRecursivePrinter sisr pgf cnc = prSRG sisr $ makeSRG removeLeftRecursion pgf cnc

regularPrinter :: PGF -> CId -> String
regularPrinter pgf cnc = prSRG Nothing $ makeSRG makeRegular pgf cnc

makeSRG :: (CFG -> CFG) -> PGF -> CId -> SRG
makeSRG = mkSRG cfgToSRG
    where
      cfgToSRG cfg = [cfRulesToSRGRule rs | (_,rs) <- allRulesGrouped cfg]

-- | Create a compact filtered non-left-recursive SRG. 
makeSimpleSRG :: PGF -> CId -> SRG
makeSimpleSRG  = makeSRG preprocess
    where
      preprocess =   traceStats "After mergeIdentical"
                   . mergeIdentical
                   . traceStats "After removeLeftRecursion"
                   . removeLeftRecursion 
                   . traceStats "After topDownFilter" 
                   . topDownFilter
                   . traceStats "After bottomUpFilter"
                   . bottomUpFilter
                   . traceStats "After removeCycles"
                   . removeCycles 
                   . traceStats "Inital CFG"

traceStats s g = trace ("---- " ++ s ++ ": " ++ stats g {- ++ "\n" ++ prCFRules g ++ "----" -}) g

stats g = "Categories: " ++ show (countCats g)
          ++ ", External categories: " ++ show (Set.size (cfgExternalCats g))
          ++ ", Rules: " ++ show (countRules g)

makeNonRecursiveSRG :: PGF
                    -> CId -- ^ Concrete syntax name.
                    -> SRG
makeNonRecursiveSRG = mkSRG cfgToSRG id
    where
      cfgToSRG cfg = [SRGRule l [SRGAlt Nothing dummyCFTerm (dfaToSRGItem dfa)] | (l,dfa) <- dfas]
          where
            MFA _ dfas = cfgToMFA cfg
            dfaToSRGItem = mapRE dummySRGNT . minimizeRE . dfa2re
            dummyCFTerm = CFMeta (mkCId "dummy")
            dummySRGNT = mapSymbol (\c -> (c,0)) id

mkSRG :: (CFG -> [SRGRule]) -> (CFG -> CFG) -> PGF -> CId -> SRG
mkSRG mkRules preprocess pgf cnc =
    SRG { srgName = prCId cnc,
	  srgStartCat = cfgStartCat cfg,
          srgExternalCats = cfgExternalCats cfg,
          srgLanguage = getSpeechLanguage pgf cnc,
	  srgRules = mkRules cfg }
    where cfg = renameCats (prCId cnc) $ preprocess $ pgfToCFG pgf cnc

-- | Renames all external cats C to C_cat, and all internal cats C_X (where X is any string), 
--   to C_N where N is an integer.
renameCats :: String -> CFG -> CFG
renameCats prefix cfg = mapCFGCats renameCat cfg
  where renameCat c | isExternal c = c ++ "_cat"
                    | otherwise = Map.findWithDefault (error ("renameCats: " ++ c)) c names
        isExternal c = c `Set.member` cfgExternalCats cfg        
        catsByPrefix = buildMultiMap [(takeWhile (/='_') cat, cat) | cat <- allCats cfg, not (isExternal cat)]
        names = Map.fromList [(c,pref++"_"++show i) | (pref,cs) <- catsByPrefix, (c,i) <- zip cs [1..]]

getSpeechLanguage :: PGF -> CId -> Maybe String
getSpeechLanguage pgf cnc = fmap (replace '_' '-') $ lookConcrFlag pgf cnc (mkCId "language")

cfRulesToSRGRule :: [CFRule] -> SRGRule
cfRulesToSRGRule rs@(r:_) = SRGRule (lhsCat r) rhs
    where 
      alts = [((n,Nothing),mkSRGSymbols 0 ss) | CFRule c ss n <- rs]
      rhs = [SRGAlt p n (srgItem sss) | ((n,p),sss) <- buildMultiMap alts ]

      mkSRGSymbols _ [] = []
      mkSRGSymbols i (NonTerminal c:ss) = NonTerminal (c,i) : mkSRGSymbols (i+1) ss
      mkSRGSymbols i (Terminal t:ss)    = Terminal t : mkSRGSymbols i ss

srgLHSCat :: SRGRule -> Cat
srgLHSCat (SRGRule c _) = c

isExternalCat :: SRG -> Cat -> Bool
isExternalCat srg c = c `Set.member` srgExternalCats srg

--
-- * Size-optimized EBNF SRGs
--

srgItem :: [[SRGSymbol]] -> SRGItem
srgItem = unionRE . map mergeItems . sortGroupBy (compareBy filterCats)
-- non-optimizing version:
--srgItem = unionRE . map seqRE

-- | Merges a list of right-hand sides which all have the same 
-- sequence of non-terminals.
mergeItems :: [[SRGSymbol]] -> SRGItem
mergeItems = minimizeRE . ungroupTokens . minimizeRE . unionRE . map seqRE . map groupTokens

groupTokens :: [SRGSymbol] -> [Symbol SRGNT [Token]]
groupTokens [] = []
groupTokens (Terminal t:ss) = case groupTokens ss of
                                Terminal ts:ss' -> Terminal (t:ts):ss'
                                ss'             -> Terminal [t]:ss'
groupTokens (NonTerminal c:ss) = NonTerminal c : groupTokens ss

ungroupTokens :: RE (Symbol SRGNT [Token]) -> RE SRGSymbol
ungroupTokens = joinRE . mapRE (symbol (RESymbol . NonTerminal) (REConcat . map (RESymbol . Terminal)))

--
-- * Utilities for building and printing SRGs
--

prSRG :: Maybe SISRFormat -> SRG -> String
prSRG sisr srg = prProductions $ map prRule $ ext ++ int
    where 
      (ext,int) = partition (isExternalCat srg . srgLHSCat) (srgRules srg)
      prRule (SRGRule c alts) = (c,unwords (intersperse "|" (concatMap prAlt alts)))
      prAlt (SRGAlt _ t rhs) = 
          -- FIXME: hack: we high-jack the --sisr flag to add 
          -- a simple lambda calculus format for semantic interpretation
          -- Maybe the --sisr flag should be renamed.
          case sisr of
            Just _  -> 
                -- copy tags to each part of a top-level union,
                -- to get simpler output
                case rhs of
                  REUnion xs -> map prOneAlt xs
                  _          -> [prOneAlt rhs]
                where prOneAlt a = prRE prSym a ++ " { " ++ prCFTerm t ++ " }"
            Nothing -> [prRE prSym rhs]
      prSym = symbol fst (\t -> "\""++ t ++"\"")

lookupFM_ :: (Ord key, Show key) => Map key elt -> key -> elt
lookupFM_ fm k = Map.findWithDefault err k fm
  where err = error $ "Key not found: " ++ show k
                      ++ "\namong " ++ show (Map.keys fm)

prtS :: Print a => a -> ShowS
prtS = showString . prt