diff options
| author | aarne <aarne@cs.chalmers.se> | 2008-05-21 09:26:44 +0000 |
|---|---|---|
| committer | aarne <aarne@cs.chalmers.se> | 2008-05-21 09:26:44 +0000 |
| commit | 055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch) | |
| tree | 0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/UseGrammar/Parsing.hs | |
| parent | 915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff) | |
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
Diffstat (limited to 'src-3.0/GF/UseGrammar/Parsing.hs')
| -rw-r--r-- | src-3.0/GF/UseGrammar/Parsing.hs | 177 |
1 files changed, 177 insertions, 0 deletions
diff --git a/src-3.0/GF/UseGrammar/Parsing.hs b/src-3.0/GF/UseGrammar/Parsing.hs new file mode 100644 index 000000000..2ca057410 --- /dev/null +++ b/src-3.0/GF/UseGrammar/Parsing.hs @@ -0,0 +1,177 @@ +---------------------------------------------------------------------- +-- | +-- Module : Parsing +-- Maintainer : AR +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/06/02 10:23:52 $ +-- > CVS $Author: aarne $ +-- > CVS $Revision: 1.25 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.UseGrammar.Parsing where + +import GF.Infra.CheckM +import qualified GF.Canon.AbsGFC as C +import GF.Canon.GFC +import GF.Canon.MkGFC (trExp) ---- +import GF.Canon.CMacros +import GF.Grammar.MMacros (refreshMetas) +import GF.UseGrammar.Linear +import GF.Data.Str +import GF.CF.CF +import GF.CF.CFIdent +import GF.Infra.Ident +import GF.Grammar.TypeCheck +import GF.Grammar.Values +--import CFMethod +import GF.UseGrammar.Tokenize +import GF.UseGrammar.Morphology (isKnownWord) +import GF.CF.Profile +import GF.Infra.Option +import GF.UseGrammar.Custom +import GF.Compile.ShellState + +import GF.CF.PPrCF (prCFTree) +-- import qualified GF.OldParsing.ParseGFC as NewOld -- OBSOLETE +import qualified GF.Parsing.GFC as New + +import GF.Data.Operations + +import Data.List (nub,sortBy) +import Data.Char (toLower) +import Control.Monad (liftM) + +-- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002 + +parseString :: Options -> StateGrammar -> CFCat -> String -> Err [Tree] +parseString os sg cat = liftM fst . parseStringMsg os sg cat + +parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String) +parseStringMsg os sg cat s = do + case checkStart $ parseStringC os sg cat s of + Ok (ts,(_,ss)) -> return (ts, unlines $ reverse ss) + Bad s -> return ([],s) + +parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree] +parseStringC opts0 sg cat s + | oElem (iOpt "old") opts0 || + (not (oElem (iOpt "fcfg") opts0) && stateHasHOAS sg) = do + let opts = unionOptions opts0 $ stateOptions sg + cf = stateCF sg + gr = stateGrammarST sg + cn = cncId sg + toks = customOrDefault opts useTokenizer customTokenizer sg s + parser = customOrDefault opts useParser customParser sg cat + if oElem (iOpt "cut") opts + then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks + else mapM (tokens2trms opts sg cn parser) toks >>= return . concat + +---- | or [oElem p opts0 | +---- p <- [newCParser,newMParser,newFParser,newParser,newerParser] = do + + | otherwise = do + let opts = unionOptions opts0 $ stateOptions sg + algorithm | oElem newCParser opts0 = "c" + | oElem newMParser opts0 = "m" + | oElem newFParser opts0 = "f" + | otherwise = "f" -- default algorithm: FCFG + strategy = maybe "bottomup" id $ getOptVal opts useParser + -- -parser=bottomup/topdown + tokenizer = customOrDefault opts useTokenizer customTokenizer sg + toks = case tokenizer s of + t:_ -> t + _ -> [] ---- no support for undet. tok. + unknowns = + [w | TC w <- toks, unk w && unk (uncap w)] ++ [w | TS w <- toks, unk w] + where + unk w = not $ isKnownWord (morpho sg) w + uncap (c:cs) = toLower c : cs + uncap s = s + + case unknowns of + _:_ | oElem (iOpt "trynextlang") opts -> return [] + _:_ -> fail $ "Unknown words:" +++ unwords unknowns + _ -> do + + ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks + ts' <- checkErr $ + allChecks $ map (annotate (stateGrammarST sg) . refreshMetas []) ts + return $ optIntOrAll opts flagNumber ts' + + +tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree] +tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info + where result = parser toks + info = snd result + trees = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2)) + +trees2trms :: + Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree] +trees2trms opts sg cn as ts0 info = do + let s = unwords $ map prCFTok as + ts <- case () of + _ | null ts0 -> checkWarn ("No success in cf parsing" +++ s) >> return [] + _ | raw -> do + ts1 <- return (map cf2trm0 ts0) ----- should not need annot + checks [ + mapM (checkErr . (annotate gr) . trExp) ts1 ---- complicated, often fails + ,checkWarn (unlines ("Raw CF trees:":(map prCFTree ts0))) >> return [] + ] + _ -> do + let num = optIntOrN opts flagRawtrees 999999 + let (ts01,rest) = splitAt num ts0 + if null rest then return () + else raise ("Warning: only" +++ show num +++ "raw parses out of" +++ + show (length ts0) +++ + "considered; use -rawtrees=<Int> to see more" + ) + (ts1,ss) <- checkErr $ mapErrN 1 postParse ts01 + if null ts1 then raise ss else return () + ts2 <- checkErr $ + allChecks $ map (annotate gr . refreshMetas [] . trExp) ts1 ---- + if forgive then return ts2 else do + let tsss = [(t, allLinsOfTree gr cn t) | t <- ts2] + ps = [t | (t,ss) <- tsss, + any (compatToks as) (map str2cftoks ss)] + if null ps + then raise $ "Failure in morphology." ++ + if verb + then "\nPossible corrections: " +++++ + unlines (nub (map sstr (concatMap snd tsss))) + else "" + else return ps + if verb + then checkWarn $ " the token list" +++ show as ++++ unknownWords sg as +++++ info + else return () + + return $ optIntOrAll opts flagNumber $ nub ts + where + gr = stateGrammarST sg + + raw = oElem rawParse opts + verb = oElem beVerbose opts + forgive = oElem forgiveParse opts + +---- Operatins.allChecks :: ErrorMonad m => [m a] -> m [a] + +unknownWords sg ts = case filter noMatch [t | t@(TS _) <- ts] of + [] -> "where all words are known" + us -> "with the unknown tokens" +++ show us --- needs to be fixed for literals + where + terminals = map TS $ stateGrammarWords sg + noMatch t = all (not . compatTok t) terminals + + +--- too much type checking in building term info? return FullTerm to save work? + +-- | raw parsing: so simple it is for a context-free CF grammar +cf2trm0 :: CFTree -> C.Exp +cf2trm0 (CFTree (fun, (_, trees))) = mkAppAtom (cffun2trm fun) (map cf2trm0 trees) + where + cffun2trm (CFFun (fun,_)) = fun + mkApp = foldl C.EApp + mkAppAtom a = mkApp (C.EAtom a) |
