5 files changed, 480 insertions, 363 deletions
diff --git a/src/compiler/GF/Grammar/CF.hs b/src/compiler/GF/Grammar/CF.hs
deleted file mode 100644
index a48238e42..000000000
--- a/src/compiler/GF/Grammar/CF.hs
+++ /dev/null
@@ -1,143 +0,0 @@
-----------------------------------------------------------------------
--- |
--- Module      : CF
--- Maintainer  : AR
--- Stability   : (stable)
--- Portability : (portable)
---
--- > CVS $Date: 2005/11/15 17:56:13 $ 
--- > CVS $Author: aarne $
--- > CVS $Revision: 1.13 $
---
--- parsing CF grammars and converting them to GF
------------------------------------------------------------------------------
-
-module GF.Grammar.CF (getCF,CFItem,CFCat,CFFun,cf2gf,CFRule) where
-
-import GF.Grammar.Grammar
-import GF.Grammar.Macros
-import GF.Infra.Ident(Ident,identS)
-import GF.Infra.Option
-import GF.Infra.UseIO
-
-import GF.Data.Operations
-import GF.Data.Utilities (nub')
-
-import qualified Data.Set as S
-import Data.Char
-import Data.List
---import System.FilePath
-
-getCF :: ErrorMonad m => FilePath -> String -> m SourceGrammar
-getCF fpath = fmap (cf2gf fpath . uniqueFuns) . pCF
-
----------------------
--- the parser -------
----------------------
-
-pCF :: ErrorMonad m => String -> m CF
-pCF s = do
-  rules <- mapM getCFRule $ filter isRule $ lines s
-  return $ concat rules
- where
-   isRule line = case dropWhile isSpace line of
-     '-':'-':_ -> False
-     _ -> not $ all isSpace line
-
--- rules have an amazingly easy parser, if we use the format
--- fun. C -> item1 item2 ... where unquoted items are treated as cats
--- Actually would be nice to add profiles to this.
-
-getCFRule :: ErrorMonad m => String -> m [CFRule]
-getCFRule s = getcf (wrds s) where
-  getcf ws = case ws of
-    fun : cat : a : its | isArrow a -> 
-      return [L NoLoc (init fun, (cat, map mkIt its))]
-    cat : a : its | isArrow a -> 
-      return [L NoLoc (mkFun cat it, (cat, map mkIt it)) | it <- chunk its]
-    _ -> raise (" invalid rule:" +++ s)
-  isArrow a = elem a ["->", "::="] 
-  mkIt w = case w of
-    ('"':w@(_:_)) -> Right (init w)
-    _             -> Left w
-  chunk its = case its of
-    [] -> [[]]
-    _ -> chunks "|" its
-  mkFun cat its = case its of
-    [] -> cat ++ "_"
-    _ -> concat $ intersperse "_" (cat : map clean its) -- CLE style
-  clean = filter isAlphaNum -- to form valid identifiers
-  wrds = takeWhile (/= ";") . words -- to permit semicolon in the end
-
-type CF = [CFRule]
-
-type CFRule = L (CFFun, (CFCat, [CFItem]))
-
-type CFItem = Either CFCat String
-
-type CFCat = String
-type CFFun = String
-
-
---------------------------------
--- make function names unique -- 
---------------------------------
-
-uniqueFuns :: CF -> CF
-uniqueFuns = snd . mapAccumL uniqueFun S.empty
-  where
-    uniqueFun funs (L l (fun,rule)) = (S.insert fun' funs,L l (fun',rule))
-      where
-        fun' = head [fun'|suffix<-"":map show ([2..]::[Int]),
-                          let fun'=fun++suffix,
-                          not (fun' `S.member` funs)]
-
-
---------------------------
--- the compiler ----------
---------------------------
-
-cf2gf :: FilePath -> CF -> SourceGrammar
-cf2gf fpath cf = mGrammar [
-  (aname, ModInfo MTAbstract MSComplete (modifyFlags (\fs -> fs{optStartCat = Just cat})) [] Nothing [] [] fpath Nothing abs),
-  (cname, ModInfo (MTConcrete aname) MSComplete noOptions [] Nothing [] [] fpath Nothing cnc)
-  ]
- where
-   name = justModuleName fpath
-   (abs,cnc,cat) = cf2grammar cf
-   aname = identS $ name ++ "Abs"
-   cname = identS name
-
-
-cf2grammar :: CF -> (BinTree Ident Info, BinTree Ident Info, String)
-cf2grammar rules = (buildTree abs, buildTree conc, cat) where
-  abs   = cats ++ funs
-  conc  = lincats ++ lins
-  cat   = case rules of
-            (L _ (_,(c,_))):_ -> c  -- the value category of the first rule
-            _ -> error "empty CF" 
-  cats  = [(cat, AbsCat (Just (L NoLoc []))) | 
-             cat <- nub' (concat (map cf2cat rules))] ----notPredef cat
-  lincats = [(cat, CncCat (Just (L loc defLinType)) Nothing Nothing Nothing Nothing) | (cat,AbsCat (Just (L loc _))) <- cats]
-  (funs,lins) = unzip (map cf2rule rules)
-
-cf2cat :: CFRule -> [Ident]
-cf2cat (L loc (_,(cat, items))) = map identS $ cat : [c | Left c <- items]
-
-cf2rule :: CFRule -> ((Ident,Info),(Ident,Info))
-cf2rule (L loc (fun, (cat, items))) = (def,ldef) where
-  f     = identS fun
-  def   = (f, AbsFun (Just (L loc (mkProd args' (Cn (identS cat)) []))) Nothing Nothing (Just True))
-  args0 = zip (map (identS . ("x" ++) . show) [0..]) items
-  args  = [((Explicit,v), Cn (identS c)) | (v, Left c) <- args0]
-  args' = [(Explicit,identS "_", Cn (identS c)) | (_, Left c) <- args0]
-  ldef  = (f, CncFun 
-               Nothing 
-               (Just (L loc (mkAbs (map fst args) 
-                      (mkRecord (const theLinLabel) [foldconcat (map mkIt args0)]))))
-               Nothing
-               Nothing)
-  mkIt (v, Left _) = P (Vr v) theLinLabel
-  mkIt (_, Right a) = K a
-  foldconcat [] = K ""
-  foldconcat tt = foldr1 C tt
diff --git a/src/compiler/GF/Grammar/CFG.hs b/src/compiler/GF/Grammar/CFG.hs
new file mode 100644
index 000000000..93bce2aad
--- /dev/null
+++ b/src/compiler/GF/Grammar/CFG.hs
@@ -0,0 +1,386 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : GF.Speech.CFG
+--
+-- Context-free grammar representation and manipulation.
+----------------------------------------------------------------------
+module GF.Grammar.CFG where
+
+import GF.Data.Utilities
+import PGF
+--import GF.Infra.Option
+import GF.Data.Relation
+
+--import Control.Monad
+--import Control.Monad.State (State, get, put, evalState)
+import Data.Map (Map)
+import qualified Data.Map as Map
+import Data.List
+--import Data.Maybe (fromMaybe)
+--import Data.Monoid (mconcat)
+import Data.Set (Set)
+import qualified Data.Set as Set
+
+--
+-- * Types
+--
+
+type Cat = String
+
+data Symbol c t = NonTerminal c | Terminal t
+  deriving (Eq, Ord, Show)
+
+type CFSymbol = Symbol Cat Token
+
+data CFRule = CFRule { 
+      lhsCat :: Cat,
+      ruleRhs :: [CFSymbol],
+      ruleName :: CFTerm 
+    }
+  deriving (Eq, Ord, Show)
+
+data CFTerm
+    = CFObj CId [CFTerm] -- ^ an abstract syntax function with arguments
+    | CFAbs Int CFTerm -- ^ A lambda abstraction. The Int is the variable id.
+    | CFApp CFTerm CFTerm -- ^ Application
+    | CFRes Int -- ^ The result of the n:th (0-based) non-terminal
+    | CFVar Int -- ^ A lambda-bound variable
+    | CFMeta CId -- ^ A metavariable
+  deriving (Eq, Ord, Show)
+
+data CFG = CFG { cfgStartCat :: Cat,
+                 cfgExternalCats :: Set Cat,
+                 cfgRules :: Map Cat (Set CFRule) }
+  deriving (Eq, Ord, Show)
+
+
+--
+-- * Grammar filtering
+--
+
+-- | Removes all directly and indirectly cyclic productions.
+--   FIXME: this may be too aggressive, only one production
+--   needs to be removed to break a given cycle. But which
+--   one should we pick?
+--   FIXME: Does not (yet) remove productions which are cyclic
+--   because of empty productions.
+removeCycles :: CFG -> CFG
+removeCycles = onRules f
+  where f rs = filter (not . isCycle) rs
+          where alias = transitiveClosure $ mkRel [(c,c') | CFRule c [NonTerminal c'] _ <- rs]
+                isCycle (CFRule c [NonTerminal c'] _) = isRelatedTo alias c' c
+                isCycle _ = False
+
+-- | Better bottom-up filter that also removes categories which contain no finite
+-- strings.
+bottomUpFilter :: CFG -> CFG
+bottomUpFilter gr = fix grow (gr { cfgRules = Map.empty })
+  where grow g = g `unionCFG` filterCFG (all (okSym g) . ruleRhs) gr
+        okSym g = symbol (`elem` allCats g) (const True)
+
+-- | Removes categories which are not reachable from any external category.
+topDownFilter :: CFG -> CFG
+topDownFilter cfg = filterCFGCats (`Set.member` keep) cfg
+  where
+    rhsCats = [ (lhsCat r, c') | r <- allRules cfg, c' <- filterCats (ruleRhs r) ]
+    uses = reflexiveClosure_ (allCats cfg) $ transitiveClosure $ mkRel rhsCats
+    keep = Set.unions $ map (allRelated uses) $ Set.toList $ cfgExternalCats cfg
+
+-- | Merges categories with identical right-hand-sides.
+-- FIXME: handle probabilities
+mergeIdentical :: CFG -> CFG
+mergeIdentical g = onRules (map subst) g
+  where
+    -- maps categories to their replacement
+    m = Map.fromList [(y,concat (intersperse "+" xs)) 
+                          | (_,xs) <- buildMultiMap [(rulesKey rs,c) | (c,rs) <- Map.toList (cfgRules g)], y <- xs]
+    -- build data to compare for each category: a set of name,rhs pairs
+    rulesKey = Set.map (\ (CFRule _ r n) -> (n,r))
+    subst (CFRule c r n) = CFRule (substCat c) (map (mapSymbol substCat id) r) n
+    substCat c = Map.findWithDefault (error $ "mergeIdentical: " ++ c) c m
+
+-- | Keeps only the start category as an external category.
+purgeExternalCats :: CFG -> CFG
+purgeExternalCats cfg = cfg { cfgExternalCats = Set.singleton (cfgStartCat cfg) }
+
+--
+-- * Removing left recursion
+--
+
+-- The LC_LR algorithm from
+-- http://research.microsoft.com/users/bobmoore/naacl2k-proc-rev.pdf
+removeLeftRecursion :: CFG -> CFG
+removeLeftRecursion gr 
+    = gr { cfgRules = groupProds $ concat [scheme1, scheme2, scheme3, scheme4] }
+  where
+    scheme1 = [CFRule a [x,NonTerminal a_x] n' | 
+               a <- retainedLeftRecursive, 
+               x <- properLeftCornersOf a,
+               not (isLeftRecursive x),
+               let a_x = mkCat (NonTerminal a) x,
+               -- this is an extension of LC_LR to avoid generating
+               -- A-X categories for which there are no productions:
+               a_x `Set.member` newCats,
+               let n' = symbol (\_ -> CFApp (CFRes 1) (CFRes 0))
+                               (\_ -> CFRes 0) x] 
+    scheme2 = [CFRule a_x (beta++[NonTerminal a_b]) n' | 
+               a <- retainedLeftRecursive, 
+               b@(NonTerminal b') <- properLeftCornersOf a,
+               isLeftRecursive b,
+               CFRule _ (x:beta) n <- catRules gr b', 
+               let a_x = mkCat (NonTerminal a) x,
+               let a_b = mkCat (NonTerminal a) b,
+               let i = length $ filterCats beta,
+               let n' = symbol (\_ -> CFAbs 1 (CFApp (CFRes i) (shiftTerm n)))
+                               (\_ -> CFApp (CFRes i) n) x]
+    scheme3 = [CFRule a_x beta n' |
+               a <- retainedLeftRecursive, 
+               x <- properLeftCornersOf a,
+               CFRule _ (x':beta) n <- catRules gr a,
+               x == x',
+               let a_x = mkCat (NonTerminal a) x,
+               let n' = symbol (\_ -> CFAbs 1 (shiftTerm n)) 
+                               (\_ -> n) x]
+    scheme4 = catSetRules gr $ Set.fromList $ filter (not . isLeftRecursive . NonTerminal) cats
+
+    newCats = Set.fromList (map lhsCat (scheme2 ++ scheme3))
+
+    shiftTerm :: CFTerm -> CFTerm
+    shiftTerm (CFObj f ts) = CFObj f (map shiftTerm ts)
+    shiftTerm (CFRes 0) = CFVar 1
+    shiftTerm (CFRes n) = CFRes (n-1)
+    shiftTerm t = t
+    -- note: the rest don't occur in the original grammar
+
+    cats = allCats gr
+    rules = allRules gr
+
+    directLeftCorner = mkRel [(NonTerminal c,t) | CFRule c (t:_) _ <- allRules gr]
+    leftCorner = reflexiveClosure_ (map NonTerminal cats) $ transitiveClosure directLeftCorner
+    properLeftCorner = transitiveClosure directLeftCorner
+    properLeftCornersOf = Set.toList . allRelated properLeftCorner . NonTerminal
+    isProperLeftCornerOf = flip (isRelatedTo properLeftCorner)
+
+    leftRecursive = reflexiveElements properLeftCorner
+    isLeftRecursive = (`Set.member` leftRecursive)
+
+    retained = cfgStartCat gr `Set.insert`
+                Set.fromList [a | r <- allRules (filterCFGCats (not . isLeftRecursive . NonTerminal) gr),
+                                  NonTerminal a <- ruleRhs r]
+    isRetained = (`Set.member` retained)
+
+    retainedLeftRecursive = filter (isLeftRecursive . NonTerminal) $ Set.toList retained
+
+    mkCat :: CFSymbol -> CFSymbol -> Cat
+    mkCat x y = showSymbol x ++ "-" ++ showSymbol y
+        where showSymbol = symbol id show
+
+-- | Get the sets of mutually recursive non-terminals for a grammar.
+mutRecCats :: Bool    -- ^ If true, all categories will be in some set.
+                      --   If false, only recursive categories will be included.
+           -> CFG -> [Set Cat]
+mutRecCats incAll g = equivalenceClasses $ refl $ symmetricSubrelation $ transitiveClosure r
+  where r = mkRel [(c,c') | CFRule c ss _ <- allRules g, NonTerminal c' <- ss]
+        refl = if incAll then reflexiveClosure_ (allCats g) else reflexiveSubrelation
+
+--
+-- * Approximate context-free grammars with regular grammars.
+--
+
+makeSimpleRegular :: CFG -> CFG
+makeSimpleRegular = makeRegular . topDownFilter . bottomUpFilter . removeCycles
+
+-- Use the transformation algorithm from \"Regular Approximation of Context-free
+-- Grammars through Approximation\", Mohri and Nederhof, 2000
+-- to create an over-generating regular grammar for a context-free 
+-- grammar
+makeRegular :: CFG -> CFG
+makeRegular g = g { cfgRules = groupProds $ concatMap trSet (mutRecCats True g) }
+  where trSet cs | allXLinear cs rs = rs
+                 | otherwise = concatMap handleCat (Set.toList cs)
+            where rs = catSetRules g cs
+                  handleCat c = [CFRule c' [] (mkCFTerm (c++"-empty"))] -- introduce A' -> e
+                                ++ concatMap (makeRightLinearRules c) (catRules g c)
+                      where c' = newCat c
+                  makeRightLinearRules b' (CFRule c ss n) = 
+                      case ys of
+                              [] -> newRule b' (xs ++ [NonTerminal (newCat c)]) n -- no non-terminals left
+                              (NonTerminal b:zs) -> newRule b' (xs ++ [NonTerminal b]) n 
+                                        ++ makeRightLinearRules (newCat b) (CFRule c zs n)
+                      where (xs,ys) = break (`catElem` cs) ss
+                            -- don't add rules on the form A -> A
+                            newRule c rhs n | rhs == [NonTerminal c] = []
+                                            | otherwise = [CFRule c rhs n]
+        newCat c = c ++ "$"
+
+--
+-- * CFG Utilities
+--
+
+mkCFG :: Cat -> Set Cat -> [CFRule] -> CFG
+mkCFG start ext rs = CFG { cfgStartCat = start, cfgExternalCats = ext, cfgRules = groupProds rs }
+
+groupProds :: [CFRule] -> Map Cat (Set CFRule)
+groupProds = Map.fromListWith Set.union . map (\r -> (lhsCat r,Set.singleton r))
+
+uniqueFuns :: CFG -> CFG
+uniqueFuns cfg = CFG {cfgStartCat     = cfgStartCat cfg
+                     ,cfgExternalCats = cfgExternalCats cfg
+                     ,cfgRules        = Map.fromList (snd (mapAccumL uniqueFunSet Set.empty (Map.toList (cfgRules cfg))))
+                     }
+  where
+    uniqueFunSet funs (cat,rules) =
+      let (funs',rules') = mapAccumL uniqueFun funs (Set.toList rules)
+      in (funs',(cat,Set.fromList rules'))
+    uniqueFun funs (CFRule cat items (CFObj fun args)) = (Set.insert fun' funs,CFRule cat items (CFObj fun' args))
+      where
+        fun' = head [fun'|suffix<-"":map show ([2..]::[Int]),
+                          let fun'=mkCId (showCId fun++suffix),
+                          not (fun' `Set.member` funs)]
+
+-- | Gets all rules in a CFG.
+allRules :: CFG -> [CFRule]
+allRules = concat . map Set.toList . Map.elems . cfgRules
+
+-- | Gets all rules in a CFG, grouped by their LHS categories.
+allRulesGrouped :: CFG -> [(Cat,[CFRule])]
+allRulesGrouped = Map.toList . Map.map Set.toList . cfgRules
+
+-- | Gets all categories which have rules.
+allCats :: CFG -> [Cat]
+allCats = Map.keys . cfgRules
+
+-- | Gets all categories which have rules or occur in a RHS.
+allCats' :: CFG -> [Cat]
+allCats' cfg = Set.toList (Map.keysSet (cfgRules cfg) `Set.union` 
+                           Set.fromList [c | rs <- Map.elems (cfgRules cfg), 
+                                             r  <- Set.toList rs, 
+                                             NonTerminal c <- ruleRhs r])
+
+-- | Gets all rules for the given category.
+catRules :: CFG -> Cat -> [CFRule]
+catRules gr c = Set.toList $ Map.findWithDefault Set.empty c (cfgRules gr)
+
+-- | Gets all rules for categories in the given set.
+catSetRules :: CFG -> Set Cat -> [CFRule]
+catSetRules gr cs = allRules $ filterCFGCats (`Set.member` cs) gr
+
+mapCFGCats :: (Cat -> Cat) -> CFG -> CFG
+mapCFGCats f cfg = mkCFG (f (cfgStartCat cfg)) 
+                         (Set.map f (cfgExternalCats cfg))
+                         [CFRule (f lhs) (map (mapSymbol f id) rhs) t | CFRule lhs rhs t <- allRules cfg]
+
+onCFG :: (Map Cat (Set CFRule) -> Map Cat (Set CFRule)) -> CFG -> CFG
+onCFG f cfg = cfg { cfgRules = f (cfgRules cfg) }
+
+onRules :: ([CFRule] -> [CFRule]) -> CFG -> CFG
+onRules f cfg = cfg { cfgRules = groupProds $ f $ allRules cfg }
+
+-- | Clean up CFG after rules have been removed.
+cleanCFG :: CFG -> CFG
+cleanCFG = onCFG (Map.filter (not . Set.null))
+
+-- | Combine two CFGs.
+unionCFG :: CFG -> CFG -> CFG
+unionCFG x y = onCFG (\rs -> Map.unionWith Set.union rs (cfgRules y)) x
+
+filterCFG :: (CFRule -> Bool) -> CFG -> CFG
+filterCFG p = cleanCFG . onCFG (Map.map (Set.filter p))
+
+filterCFGCats :: (Cat -> Bool) -> CFG -> CFG
+filterCFGCats p = onCFG (Map.filterWithKey (\c _ -> p c))
+
+countCats :: CFG -> Int
+countCats = Map.size . cfgRules . cleanCFG
+
+countRules :: CFG -> Int
+countRules = length . allRules
+
+prCFG :: CFG -> String
+prCFG = prProductions . map prRule . allRules
+    where 
+      prRule r = (lhsCat r, unwords (map prSym (ruleRhs r)))
+      prSym = symbol id (\t -> "\""++ t ++"\"")
+
+prProductions :: [(Cat,String)] -> String
+prProductions prods = 
+    unlines [rpad maxLHSWidth lhs ++ " ::= " ++ rhs | (lhs,rhs) <- prods]
+    where
+      maxLHSWidth = maximum $ 0:(map (length . fst) prods)
+      rpad n s = s ++ replicate (n - length s) ' '
+
+prCFTerm :: CFTerm -> String
+prCFTerm = pr 0
+  where
+    pr p (CFObj f args) = paren p (showCId f ++ " (" ++ concat (intersperse "," (map (pr 0) args)) ++ ")")
+    pr p (CFAbs i t) = paren p ("\\x" ++ show i ++ ". " ++ pr 0 t)
+    pr p (CFApp t1 t2) = paren p (pr 1 t1 ++ "(" ++ pr 0 t2 ++ ")")
+    pr _ (CFRes i) = "$" ++ show i
+    pr _ (CFVar i) = "x" ++ show i
+    pr _ (CFMeta c) = "?" ++ showCId c
+    paren 0 x = x
+    paren 1 x = "(" ++ x ++ ")"
+
+--
+-- * CFRule Utilities
+--
+
+ruleFun :: CFRule -> CId
+ruleFun (CFRule _ _ t) = f t
+  where f (CFObj n _) = n
+        f (CFApp _ x) = f x
+        f (CFAbs _ x) = f x
+        f _ = mkCId ""
+
+-- | Check if any of the categories used on the right-hand side
+--   are in the given list of categories.
+anyUsedBy :: [Cat] -> CFRule -> Bool
+anyUsedBy cs (CFRule _ ss _) = any (`elem` cs) (filterCats ss)
+
+mkCFTerm :: String -> CFTerm
+mkCFTerm n = CFObj (mkCId n) []
+
+ruleIsNonRecursive :: Set Cat -> CFRule -> Bool
+ruleIsNonRecursive cs = noCatsInSet cs . ruleRhs
+
+-- | Check if all the rules are right-linear, or all the rules are
+--   left-linear, with respect to given categories.
+allXLinear :: Set Cat -> [CFRule] -> Bool
+allXLinear cs rs = all (isRightLinear cs) rs || all (isLeftLinear cs) rs
+
+-- | Checks if a context-free rule is right-linear.
+isRightLinear :: Set Cat  -- ^ The categories to consider
+              -> CFRule   -- ^ The rule to check for right-linearity
+              -> Bool
+isRightLinear cs = noCatsInSet cs . safeInit . ruleRhs
+
+-- | Checks if a context-free rule is left-linear.
+isLeftLinear :: Set Cat  -- ^ The categories to consider
+             -> CFRule   -- ^ The rule to check for left-linearity
+             -> Bool
+isLeftLinear cs = noCatsInSet cs . drop 1 . ruleRhs
+
+
+--
+-- * Symbol utilities
+--
+
+symbol :: (c -> a) -> (t -> a) -> Symbol c t -> a
+symbol fc ft (NonTerminal cat) = fc cat
+symbol fc ft (Terminal tok) = ft tok
+
+mapSymbol :: (c -> c') -> (t -> t') -> Symbol c t -> Symbol c' t'
+mapSymbol fc ft = symbol (NonTerminal . fc) (Terminal . ft)
+
+filterCats :: [Symbol c t] -> [c]
+filterCats syms = [ cat | NonTerminal cat <- syms ]
+
+filterToks :: [Symbol c t] -> [t]
+filterToks syms = [ tok | Terminal tok <- syms ]
+
+-- | Checks if a symbol is a non-terminal of one of the given categories.
+catElem :: Ord c => Symbol c t -> Set c -> Bool
+catElem s cs = symbol (`Set.member` cs) (const False) s
+
+noCatsInSet :: Ord c => Set c -> [Symbol c t] -> Bool
+noCatsInSet cs = not . any (`catElem` cs)
diff --git a/src/compiler/GF/Grammar/EBNF.hs b/src/compiler/GF/Grammar/EBNF.hs
index b1854da54..50a5ff90a 100644
--- a/src/compiler/GF/Grammar/EBNF.hs
+++ b/src/compiler/GF/Grammar/EBNF.hs
@@ -12,34 +12,19 @@
 -- (Description of the module)
 -----------------------------------------------------------------------------
 
-module GF.Grammar.EBNF (getEBNF) where
+module GF.Grammar.EBNF (EBNF, ERule, ERHS(..), ebnf2cf) where
 
 import GF.Data.Operations
---import GF.Infra.Comments
-import GF.Grammar.CF
---import GF.CF.CFIdent
-import GF.Grammar.Grammar
---import GF.Grammar.PrGrammar
---import qualified GF.Source.AbsGF as A
+import GF.Grammar.CFG
+import PGF (mkCId)
 
-import Data.Char
 import Data.List
---import System.FilePath
-
-
-
--- AR 18/4/2000 - 31/3/2004
-
-getEBNF :: FilePath -> String -> Err SourceGrammar
-getEBNF fpath = fmap (cf2gf fpath . ebnf2cf) . pEBNF
 
 type EBNF = [ERule]
 type ERule = (ECat, ERHS)
 type ECat = (String,[Int])
 type ETok = String
 
-ebnfID = "EBNF" ---- make this parametric!
-
 data ERHS =
    ETerm ETok
  | ENonTerm ECat
@@ -50,13 +35,14 @@ data ERHS =
  | EOpt  ERHS
  | EEmpty
 
-type CFRHS = [CFItem]
-type CFJustRule = (CFCat, CFRHS)
+type CFRHS = [CFSymbol]
+type CFJustRule = (Cat, CFRHS)
 
 ebnf2cf :: EBNF -> [CFRule]
 ebnf2cf ebnf = 
-  [L NoLoc (mkCFF i rule,rule) | (i,rule) <- zip [0..] (normEBNF ebnf)] where
-    mkCFF i (c, _) = ("Mk" ++ c ++ "_" ++ show i)
+  [CFRule cat items (mkCFF i cat) | (i,(cat,items)) <- zip [0..] (normEBNF ebnf)]
+  where
+    mkCFF i c = CFObj (mkCId ("Mk" ++ c ++ "_" ++ show i)) []
 
 normEBNF :: EBNF -> [CFJustRule]
 normEBNF erules = let
@@ -115,13 +101,13 @@ substERules g (cat,itss) = (cat, map sub itss) where
   sub (EIPlus r : ii) = EIPlus (substERules g r) : ii
   sub (EIOpt r : ii)  = EIOpt  (substERules g r) : ii
 
-eitem2cfitem :: EItem -> CFItem 
+eitem2cfitem :: EItem -> CFSymbol
 eitem2cfitem it = case it of
-  EITerm a       -> Right a
-  EINonTerm cat  -> Left (mkCFCatE cat)
-  EIStar (cat,_) -> Left (mkCFCatE (mkNewECat cat "Star"))
-  EIPlus (cat,_) -> Left (mkCFCatE (mkNewECat cat "Plus"))
-  EIOpt  (cat,_) -> Left (mkCFCatE (mkNewECat cat "Opt"))
+  EITerm a       -> Terminal a
+  EINonTerm cat  -> NonTerminal (mkCFCatE cat)
+  EIStar (cat,_) -> NonTerminal (mkCFCatE (mkNewECat cat "Star"))
+  EIPlus (cat,_) -> NonTerminal (mkCFCatE (mkNewECat cat "Plus"))
+  EIOpt  (cat,_) -> NonTerminal (mkCFCatE (mkNewECat cat "Opt"))
 
 type NormERule = (ECat,[[EItem]]) -- disjunction of sequences of items
 
@@ -157,198 +143,10 @@ mkECat ints = ("C", ints)
 prECat (c,[]) = c
 prECat (c,ints) = c ++ "_" ++ prTList "_" (map show ints)
 
-mkCFCatE :: ECat -> CFCat
+mkCFCatE :: ECat -> Cat
 mkCFCatE = prECat
 
 updECat _ (c,[]) = (c,[])
 updECat ii (c,_) = (c,ii)
 
 mkNewECat (c,ii) str = (c ++ str,ii)
-
------- parser for EBNF grammars
-
-pEBNF :: String -> Err EBNF
-pEBNF = parseResultErr (longestOfMany (pJ pERule))
-
-pERule :: Parser Char ERule
-pERule = pECat ... pJ (literals ":=" ||| literals "::=") +.. pERHS 0 ..+ jL ";"
-
-pERHS :: Int -> Parser Char ERHS
-pERHS 0 = pTList "|" (pERHS 1) *** foldr1 EAlt
-pERHS 1 = longestOfMany (pJ (pERHS 2)) *** foldr ESeq EEmpty
-pERHS 2 = pERHS 3 ... pJ pUnaryEOp *** (\ (a,f) -> f a)
-pERHS 3 = pQuotedString *** ETerm
-          ||| pECat *** ENonTerm ||| pParenth (pERHS 0)
-
-pUnaryEOp :: Parser Char (ERHS -> ERHS)
-pUnaryEOp = 
- lits "*" <<< EStar ||| lits "+" <<< EPlus ||| lits "?" <<< EOpt ||| succeed id
-
-pECat = pIdent *** (\c -> (c,[]))
-
-
-
-----------------------------------------------------------------------
--- Module      : Parsers
--- some parser combinators a la Wadler and Hutton.
--- (only used in module "EBNF")
------------------------------------------------------------------------------
-
-infixr 2 |||, +||
-infixr 3 ***
-infixr 5 .>.
-infixr 5 ...
-infixr 5 ....
-infixr 5 +..
-infixr 5 ..+
-infixr 6 |>
-infixr 3 <<<
-
-
-type Parser a b = [a] -> [(b,[a])]
-
-parseResults :: Parser a b -> [a] -> [b]
-parseResults p s = [x | (x,r) <- p s, null r]
-
-parseResultErr :: Show a => Parser a b -> [a] -> Err b
-parseResultErr p s = case parseResults p s of
-  [x] -> return x
-  []  -> case 
-       maximumBy (\x y -> compare (length y) (length x)) (s:[r | (_,r) <- p s]) of
-    r -> Bad $ "\nno parse; reached" ++++ take 300 (show r)
-  _   -> Bad "ambiguous"
-
-(...) :: Parser a b -> Parser a c -> Parser a (b,c)
-(p ... q) s = [((x,y),r) | (x,t) <- p s, (y,r) <- q t]
-
-(.>.) :: Parser a b -> (b -> Parser a c) -> Parser a c
-(p .>. f) s = [(c,r) | (x,t) <- p s, (c,r) <- f x t]
-
-(|||) :: Parser a b -> Parser a b -> Parser a b
-(p ||| q) s = p s ++ q s
-
-(+||) :: Parser a b -> Parser a b -> Parser a b
-p1 +|| p2 = take 1 . (p1 ||| p2)
-
-literal :: (Eq a) => a -> Parser a a
-literal x (c:cs) = [(x,cs) | x == c]
-literal _ _ = []
-
-(***) :: Parser a b -> (b -> c) -> Parser a c
-(p *** f) s = [(f x,r) | (x,r) <- p s] 
-
-succeed :: b -> Parser a b
-succeed v s = [(v,s)]
-
-fails :: Parser a b
-fails s = []
-
-(+..) :: Parser a b -> Parser a c -> Parser a c
-p1 +.. p2 = p1 ... p2 *** snd
-
-(..+) :: Parser a b -> Parser a c -> Parser a b
-p1 ..+ p2 = p1 ... p2 *** fst
-
-(<<<) :: Parser a b -> c -> Parser a c  -- return
-p <<< v = p *** (\x -> v)
-
-(|>) :: Parser a b -> (b -> Bool) -> Parser a b
-p |> b = p .>. (\x -> if b x then succeed x else fails)
-
-many :: Parser a b -> Parser a [b]
-many p = (p ... many p *** uncurry (:)) +|| succeed []
-
-some :: Parser a b -> Parser a [b]
-some p = (p ... many p) *** uncurry (:)
-
-longestOfMany :: Parser a b -> Parser a [b]
-longestOfMany p = p .>. (\x -> longestOfMany p *** (x:)) +|| succeed []
-
-closure :: (b -> Parser a b) -> (b -> Parser a b)
-closure p v = p v .>. closure p ||| succeed v
-
-pJunk   :: Parser Char String
-pJunk = longestOfMany (satisfy (\x -> elem x "\n\t "))
-
-pJ :: Parser Char a -> Parser Char a
-pJ p = pJunk +.. p ..+ pJunk
-
-pTList  :: String -> Parser Char a -> Parser Char [a]
-pTList t p = p .... many (jL t +.. p) *** (\ (x,y) -> x:y) -- mod. AR 5/1/1999
-
-pTJList  :: String -> String -> Parser Char a -> Parser Char [a]
-pTJList t1 t2 p = p .... many (literals t1 +.. jL t2 +.. p) *** (uncurry (:))
-
-pElem   :: [String] -> Parser Char String
-pElem l = foldr (+||) fails (map literals l)
-
-(....) :: Parser Char b -> Parser Char c -> Parser Char (b,c)
-p1 .... p2 = p1 ... pJunk +.. p2
-
-item :: Parser a a
-item (c:cs) = [(c,cs)]
-item [] = []
-
-satisfy :: (a -> Bool) -> Parser a a
-satisfy b = item |> b
-
-literals :: (Eq a,Show a) => [a] -> Parser a [a]
-literals l = case l of 
-  []  -> succeed [] 
-  a:l -> literal a ... literals l *** (\ (x,y) -> x:y)
-
-lits :: (Eq a,Show a) => [a] -> Parser a [a]
-lits ts = literals ts
-
-jL :: String -> Parser Char String
-jL = pJ . lits
-
-pParenth :: Parser Char a -> Parser Char a
-pParenth p = literal '(' +.. pJunk +.. p ..+ pJunk ..+ literal ')'
-
--- | p,...,p
-pCommaList :: Parser Char a -> Parser Char [a]
-pCommaList p = pTList "," (pJ p)                      
-
--- | the same or nothing
-pOptCommaList :: Parser Char a -> Parser Char [a]
-pOptCommaList p = pCommaList p ||| succeed []            
-
--- | (p,...,p), poss. empty
-pArgList :: Parser Char a -> Parser Char [a]
-pArgList p = pParenth (pCommaList p) ||| succeed [] 
-
--- | min. 2 args
-pArgList2 :: Parser Char a -> Parser Char [a]
-pArgList2 p = pParenth (p ... jL "," +.. pCommaList p) *** uncurry (:) 
-
-longestOfSome :: Parser a b -> Parser a [b]
-longestOfSome p = (p ... longestOfMany p) *** (\ (x,y) -> x:y)
-
-pIdent :: Parser Char String
-pIdent = pLetter ... longestOfMany pAlphaPlusChar *** uncurry (:)
-  where alphaPlusChar c = isAlphaNum c || c=='_' || c=='\''
-
-pLetter, pDigit :: Parser Char Char
-pLetter = satisfy (`elem` (['A'..'Z'] ++ ['a'..'z'] ++ 
-                           ['\192' .. '\255'])) -- no such in Char
-pDigit  = satisfy isDigit
-
-pLetters :: Parser Char String
-pLetters = longestOfSome pLetter
-
-pAlphanum, pAlphaPlusChar :: Parser Char Char
-pAlphanum      = pDigit ||| pLetter
-pAlphaPlusChar = pAlphanum ||| satisfy (`elem` "_'")
-
-pQuotedString :: Parser Char String
-pQuotedString = literal '"' +.. pEndQuoted where
- pEndQuoted =
-       literal '"' *** (const [])
-   +|| (literal '\\' +.. item .>. \ c -> pEndQuoted *** (c:))
-   +|| item .>. \ c -> pEndQuoted *** (c:)
-
-pIntc :: Parser Char Int
-pIntc = some (satisfy numb) *** read
-         where numb x = elem x ['0'..'9']
-
diff --git a/src/compiler/GF/Grammar/Lexer.x b/src/compiler/GF/Grammar/Lexer.x
index c4f7159a2..0293d3915 100644
--- a/src/compiler/GF/Grammar/Lexer.x
+++ b/src/compiler/GF/Grammar/Lexer.x
@@ -26,7 +26,7 @@ $i = [$l $d _ ']          -- identifier character
 $u = [.\n]                -- universal: any character
 
 @rsyms =    -- symbols and non-identifier-like reserved words
-   \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/
+   \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \=
 
 :-
 "--" [.]* ; -- Toss single line comments
@@ -83,6 +83,7 @@ data Token
  | T_ccurly
  | T_underscore
  | T_at
+ | T_cfarrow
  | T_PType
  | T_Str
  | T_Strs
@@ -169,6 +170,8 @@ resWords = Map.fromList
  , b "|"  T_bar
  , b "_"  T_underscore
  , b "@"  T_at
+ , b "::=" T_cfarrow
+ , b ":="  T_cfarrow
  , b "PType"      T_PType
  , b "Str"        T_Str
  , b "Strs"       T_Strs
diff --git a/src/compiler/GF/Grammar/Parser.y b/src/compiler/GF/Grammar/Parser.y
index 6f7f5854e..387b69dd3 100644
--- a/src/compiler/GF/Grammar/Parser.y
+++ b/src/compiler/GF/Grammar/Parser.y
@@ -7,6 +7,8 @@ module GF.Grammar.Parser
          , pModHeader
          , pExp
          , pTopDef
+         , pCFRules
+         , pEBNFRules
          ) where
 
 import GF.Infra.Ident
@@ -14,17 +16,23 @@ import GF.Infra.Option
 import GF.Data.Operations
 import GF.Grammar.Predef
 import GF.Grammar.Grammar
+import GF.Grammar.CFG
+import GF.Grammar.EBNF
 import GF.Grammar.Macros
 import GF.Grammar.Lexer
 import GF.Compile.Update (buildAnyTree)
---import Codec.Binary.UTF8.String(decodeString)
---import Data.Char(toLower)
+import Data.List(intersperse)
+import Data.Char(isAlphaNum)
+import PGF(mkCId)
+
 }
 
 %name pModDef ModDef
 %name pTopDef TopDef
 %partial pModHeader ModHeader
 %name pExp Exp
+%name pCFRules ListCFRule
+%name pEBNFRules ListEBNFRule
 
 -- no lexer declaration
 %monad { P } { >>= } { return }
@@ -64,6 +72,7 @@ import GF.Compile.Update (buildAnyTree)
  '\\\\'       { T_lamlam    }
  '_'          { T_underscore}
  '|'          { T_bar       }
+ '::='        { T_cfarrow   }
  'PType'      { T_PType     }
  'Str'        { T_Str       }
  'Strs'       { T_Strs      }
@@ -602,6 +611,70 @@ ListDDecl
   : {- empty -}     { []       } 
   | DDecl ListDDecl { $1 ++ $2 }
 
+ListCFRule :: { [CFRule] }
+ListCFRule
+  : CFRule            { $1       }
+  | CFRule ListCFRule { $1 ++ $2 }
+
+CFRule :: { [CFRule] }
+CFRule
+  : Ident '.' Ident '::=' ListCFSymbol ';' { [CFRule (showIdent $3) $5 (CFObj (mkCId (showIdent $1)) [])]
+                                           }
+  | Ident '::=' ListCFRHS ';'              { let { cat = showIdent $1;
+                                                   mkFun cat its =
+                                                     case its of {
+                                                       [] -> cat ++ "_";
+                                                       _  -> concat $ intersperse "_" (cat : filter (not . null) (map clean its)) -- CLE style
+                                                     };
+                                                   clean sym = 
+                                                     case sym of {
+                                                       Terminal    c -> filter isAlphaNum c;
+                                                       NonTerminal t -> t
+                                                     }
+                                             } in map (\rhs -> CFRule cat rhs (CFObj (mkCId (mkFun cat rhs)) [])) $3
+                                           }
+
+ListCFRHS :: { [[CFSymbol]] }
+ListCFRHS
+  : ListCFSymbol                { [$1]    }
+  | ListCFSymbol '|' ListCFRHS  { $1 : $3 }
+
+ListCFSymbol :: { [CFSymbol] }
+ListCFSymbol
+  : {- empty -}           { []      } 
+  | CFSymbol ListCFSymbol { $1 : $2 }
+
+CFSymbol :: { CFSymbol }
+  : String                { Terminal $1 }
+  | Ident                 { NonTerminal (showIdent $1) }
+
+ListEBNFRule :: { [ERule] }
+ListEBNFRule
+  : EBNFRule              { [$1]    }
+  | EBNFRule ListEBNFRule { $1 : $2 }
+
+EBNFRule :: { ERule }
+  : Ident '::=' ERHS0 ';' { ((showIdent $1,[]),$3) }
+
+ERHS0 :: { ERHS }
+  : ERHS1                 { $1         }
+  | ERHS1 '|' ERHS0       { EAlt $1 $3 }
+
+ERHS1 :: { ERHS }
+  : ERHS2                 { $1         }
+  | ERHS2 ERHS1           { ESeq $1 $2 }
+
+ERHS2 :: { ERHS }
+  : ERHS3 '*'             { EStar $1 }
+  | ERHS3 '+'             { EPlus $1 }
+  | ERHS3 '?'             { EOpt $1  }
+  | ERHS3                 { $1       }
+
+ERHS3 :: { ERHS }
+  : String                { ETerm $1 }
+  | Ident                 { ENonTerm (showIdent $1,[]) }
+  | '(' ERHS0 ')'         { $2         }
+
 Posn :: { Posn }
 Posn
   : {- empty -}     {% getPosn }