summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2006-06-20 08:38:44 +0000
committeraarne <aarne@cs.chalmers.se>2006-06-20 08:38:44 +0000
commit402a113b567a96eef61946552b68df6ac6eb6712 (patch)
tree6446bbaf195800c7e0ec71d30cba113c9bcf1f74
parentcb168e92e23d0f620b75f6119a4cb298360e6f21 (diff)
made -fcfg default parser; added lexer textvars
-rw-r--r--doc/gf-history.html10
-rw-r--r--src/GF/Compile/ShellState.hs4
-rw-r--r--src/GF/Grammar/LookAbs.hs5
-rw-r--r--src/GF/Grammar/Macros.hs4
-rw-r--r--src/GF/Shell/HelpFile.hs9
-rw-r--r--src/GF/Shell/ShellCommands.hs5
-rw-r--r--src/GF/UseGrammar/Custom.hs1
-rw-r--r--src/GF/UseGrammar/Parsing.hs43
-rw-r--r--src/GF/UseGrammar/Tokenize.hs12
-rw-r--r--src/HelpFile9
10 files changed, 67 insertions, 35 deletions
diff --git a/doc/gf-history.html b/doc/gf-history.html
index 57a425ca2..ca71a3b7c 100644
--- a/doc/gf-history.html
+++ b/doc/gf-history.html
@@ -12,6 +12,16 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2
</center>
+<p>
+
+20/6 (AR) The FCFG parser is know the default, as it even handles literals.
+The old default can be selected by <tt>p -old</tt>. Since
+FCFG does not support variable bindings, <tt>-old</tt> is automatically
+selected if the grammar has bindings - and unless the <tt>-fcfg</tt> flag
+is used.
+
+<p>
+
17/6 (AR) The FCFG parser is now the recommended method for parsing
heavy grammars such as the resource grammars. It does not yet support
literals and variable bindings.
diff --git a/src/GF/Compile/ShellState.hs b/src/GF/Compile/ShellState.hs
index 3a7115b34..88da1283b 100644
--- a/src/GF/Compile/ShellState.hs
+++ b/src/GF/Compile/ShellState.hs
@@ -188,6 +188,10 @@ stateOptions = loptions
stateGrammarWords = allMorphoWords . stateMorpho
stateGrammarLang st = (grammar st, cncId st)
+---- this should be computed at compile time and stored
+stateHasHOAS :: StateGrammar -> Bool
+stateHasHOAS = hasHOAS . stateGrammarST
+
cncModuleIdST :: StateGrammar -> CanonGrammar
cncModuleIdST = stateGrammarST
diff --git a/src/GF/Grammar/LookAbs.hs b/src/GF/Grammar/LookAbs.hs
index 0c86ae3e9..5bd4c1e41 100644
--- a/src/GF/Grammar/LookAbs.hs
+++ b/src/GF/Grammar/LookAbs.hs
@@ -21,6 +21,7 @@ module GF.Grammar.LookAbs (GFCGrammar,
lookupRef,
refsForType,
funRulesOf,
+ hasHOAS,
allCatsOf,
allBindCatsOf,
funsForType,
@@ -130,6 +131,10 @@ funRulesOf gr =
mtype m == MTAbstract,
(f, C.AbsFun typ _) <- tree2list (jments m)]
+-- testing for higher-order abstract syntax
+hasHOAS :: GFCGrammar -> Bool
+hasHOAS gr = any isHigherOrderType [t | (_,t) <- funRulesOf gr] where
+
allCatsOf :: GFCGrammar -> [(Cat,Context)]
allCatsOf gr =
[((i,c),cont) | (i, ModMod m) <- modules gr,
diff --git a/src/GF/Grammar/Macros.hs b/src/GF/Grammar/Macros.hs
index e7d073382..a3cad8bae 100644
--- a/src/GF/Grammar/Macros.hs
+++ b/src/GF/Grammar/Macros.hs
@@ -136,6 +136,10 @@ isRecursiveType t = errVal False $ do
(cc,c) <- catSkeleton t -- thus recursivity on Cat level
return $ any (== c) cc
+isHigherOrderType :: Type -> Bool
+isHigherOrderType t = errVal True $ do -- pessimistic choice
+ co <- contextOfType t
+ return $ not $ null [x | (x,Prod _ _ _) <- co]
contextOfType :: Type -> Err Context
contextOfType typ = case typ of
diff --git a/src/GF/Shell/HelpFile.hs b/src/GF/Shell/HelpFile.hs
index 215a33875..18dcd3579 100644
--- a/src/GF/Shell/HelpFile.hs
+++ b/src/GF/Shell/HelpFile.hs
@@ -212,12 +212,12 @@ txtHelpFile =
"\n -fail show strings whose parse fails prefixed by #FAIL" ++
"\n -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS" ++
"\n options for selecting parsing method:" ++
- "\n (default)parse using an overgenerating CFG" ++
+ "\n -fcfg parse using a fast variant of MCFG (default is no HOAS in grammar)" ++
+ "\n -old parse using an overgenerating CFG (default if HOAS in grammar)" ++
"\n -cfg parse using a much less overgenerating CFG" ++
"\n -mcfg parse using an even less overgenerating MCFG" ++
- "\n -fcfg parse using a faster variant of MCFG" ++
- "\n Note: the first time parsing with -cfg, -mcfg, and -fcfg might take a long time" ++
- "\n options that only work for the default parsing method:" ++
+ "\n Note: the first time parsing with -cfg, -mcfg, and -fcfg may take a long time" ++
+ "\n options that only work for the -old default parsing method:" ++
"\n -n non-strict: tolerates morphological errors" ++
"\n -ign ignore unknown words when parsing" ++
"\n -raw return context-free terms in raw form" ++
@@ -594,6 +594,7 @@ txtHelpFile =
"\n -lexer=chars each character is a token" ++
"\n -lexer=code use Haskell's lex" ++
"\n -lexer=codevars like code, but treat unknown words as variables, ?? as meta " ++
+ "\n -lexer=textvars like text, but treat unknown words as variables, ?? as meta " ++
"\n -lexer=text with conventions on punctuation and capital letters" ++
"\n -lexer=codelit like code, but treat unknown words as string literals" ++
"\n -lexer=textlit like text, but treat unknown words as string literals" ++
diff --git a/src/GF/Shell/ShellCommands.hs b/src/GF/Shell/ShellCommands.hs
index ff3960eef..08b9720bd 100644
--- a/src/GF/Shell/ShellCommands.hs
+++ b/src/GF/Shell/ShellCommands.hs
@@ -184,8 +184,9 @@ optionsOfCommand co = case co of
CTransformGrammar _ -> flags "printer"
CConvertLatex _ -> none
CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer mark"
- CParse -> both "ambiguous fail cut new newer cfg mcfg fcfg n ign raw v lines all prob"
- "cat lang lexer parser number rawtrees"
+ CParse ->
+ both "ambiguous fail cut new newer old cfg mcfg fcfg n ign raw v lines all prob"
+ "cat lang lexer parser number rawtrees"
CTranslate _ _ -> opts "cat lexer parser"
CGenerateRandom -> both "cf prob" "cat lang number depth atoms noexpand doexpand"
CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand doexpand"
diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs
index 4314747bf..07eda0a37 100644
--- a/src/GF/UseGrammar/Custom.hs
+++ b/src/GF/UseGrammar/Custom.hs
@@ -450,6 +450,7 @@ customTokenizer =
,(strCI "chars", const $ sg . map (tS . singleton))
,(strCI "code", const $ sg . lexHaskell)
,(strCI "codevars", \gr -> sg . (lexHaskellVar $ stateIsWord gr))
+ ,(strCI "textvars", \gr -> sg . (lexTextVar $ stateIsWord gr))
,(strCI "text", const $ sg . lexText)
,(strCI "unglue", \gr -> sg . map tS . decomposeWords (stateMorpho gr))
,(strCI "codelit", \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))
diff --git a/src/GF/UseGrammar/Parsing.hs b/src/GF/UseGrammar/Parsing.hs
index 6e8965f08..e979579c9 100644
--- a/src/GF/UseGrammar/Parsing.hs
+++ b/src/GF/UseGrammar/Parsing.hs
@@ -54,25 +54,30 @@ parseStringMsg os sg cat s = do
return (ts, unlines $ reverse ss)
parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
+parseStringC opts0 sg cat s
+ | oElem (iOpt "old") opts0 ||
+ (not (oElem (iOpt "fcfg") opts0) && stateHasHOAS sg) = do
+ let opts = unionOptions opts0 $ stateOptions sg
+ cf = stateCF sg
+ gr = stateGrammarST sg
+ cn = cncId sg
+ toks = customOrDefault opts useTokenizer customTokenizer sg s
+ parser = customOrDefault opts useParser customParser sg cat
+ if oElem (iOpt "cut") opts
+ then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
+ else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
+
+---- | or [oElem p opts0 |
+---- p <- [newCParser,newMParser,newFParser,newParser,newerParser] = do
----- to test peb's new parser 6/10/2003
----- (obsoleted by "newer" below)
--- parseStringC opts0 sg cat s
--- | oElem newParser opts0 = do
--- let pm = maybe "" id $ getOptVal opts0 useParser -- -parser=pm
--- ct = cfCat2Cat cat
--- ts <- checkErr $ NewOld.newParser pm sg ct s
--- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
-
--- to use peb's newer parser 7/4-05
-parseStringC opts0 sg cat s
- | oElem newCParser opts0 || oElem newMParser opts0 || oElem newFParser opts0 || oElem newParser opts0 || oElem newerParser opts0 = do
+ | otherwise = do
let opts = unionOptions opts0 $ stateOptions sg
algorithm | oElem newCParser opts0 = "c"
| oElem newMParser opts0 = "m"
| oElem newFParser opts0 = "f"
- | otherwise = "c" -- default algorithm
- strategy = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
+ | otherwise = "f" -- default algorithm: FCFG
+ strategy = maybe "bottomup" id $ getOptVal opts useParser
+ -- -parser=bottomup/topdown
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
toks = case tokenizer s of
t:_ -> t
@@ -81,16 +86,6 @@ parseStringC opts0 sg cat s
ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
return $ optIntOrAll opts flagNumber ts'
-parseStringC opts0 sg cat s = do
- let opts = unionOptions opts0 $ stateOptions sg
- cf = stateCF sg
- gr = stateGrammarST sg
- cn = cncId sg
- toks = customOrDefault opts useTokenizer customTokenizer sg s
- parser = customOrDefault opts useParser customParser sg cat
- if oElem (iOpt "cut") opts
- then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
- else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
diff --git a/src/GF/UseGrammar/Tokenize.hs b/src/GF/UseGrammar/Tokenize.hs
index d16fdf32f..9f1ab5449 100644
--- a/src/GF/UseGrammar/Tokenize.hs
+++ b/src/GF/UseGrammar/Tokenize.hs
@@ -20,6 +20,7 @@ module GF.UseGrammar.Tokenize ( tokWords,
lexHaskellLiteral,
lexHaskellVar,
lexText,
+ lexTextVar,
lexC2M, lexC2M',
lexTextLiteral,
lexIgnore,
@@ -58,6 +59,10 @@ isFloat s = case s of
'.':cs@(_:_) -> all isDigit cs
_ -> False
+isString s = case s of
+ c:cs@(_:_) -> (c == '\'' && d == '\'') || (c == '"' && d == '"') where d = last cs
+ _ -> False
+
mkCFTok :: String -> CFTok
mkCFTok s = case s of
@@ -86,6 +91,7 @@ mkLit s
| all isDigit s = tI s
| otherwise = tL s
+-- obsolete
mkTL :: String -> CFTok
mkTL s
| isFloat s = tF s
@@ -104,6 +110,7 @@ lexText :: String -> [CFTok]
lexText = uncap . lx where
lx s = case s of
+ '?':'?':cs -> tS "??" : lx cs
p : cs | isMPunct p -> tS [p] : uncap (lx cs)
p : cs | isPunct p -> tS [p] : lx cs
s : cs | isSpace s -> lx cs
@@ -177,7 +184,7 @@ unknown2string isKnown = map mkOne where
| isFloat s = tF s
| all isDigit s = tI s
| otherwise = tL s
- mkOne t@(TC s) = if isKnown s then t else mkTL s
+ mkOne t@(TC s) = if isKnown s then t else mkLit s
mkOne t = t
unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok]
@@ -186,6 +193,7 @@ unknown2var isKnown = map mkOne where
mkOne t@(TS s)
| isKnown s = t
| isFloat s = tF s
+ | isString s = tL (init (tail s))
| all isDigit s = tI s
| otherwise = tV s
mkOne t@(TC s) = if isKnown s then t else tV s
@@ -197,6 +205,8 @@ lexTextLiteral isKnown = unknown2string (eitherUpper isKnown) . lexText
lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell
lexHaskellVar isKnown = unknown2var isKnown . lexHaskell
+lexTextVar isKnown = unknown2var (eitherUpper isKnown) . lexText
+
eitherUpper isKnown w@(c:cs) = isKnown (toLower c : cs) || isKnown (toUpper c : cs)
eitherUpper isKnown w = isKnown w
diff --git a/src/HelpFile b/src/HelpFile
index c3402c383..96afff0ec 100644
--- a/src/HelpFile
+++ b/src/HelpFile
@@ -183,12 +183,12 @@ p, parse: p String
-fail show strings whose parse fails prefixed by #FAIL
-ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS
options for selecting parsing method:
- (default)parse using an overgenerating CFG
+ -fcfg parse using a fast variant of MCFG (default is no HOAS in grammar)
+ -old parse using an overgenerating CFG (default if HOAS in grammar)
-cfg parse using a much less overgenerating CFG
-mcfg parse using an even less overgenerating MCFG
- -fcfg parse using a faster variant of MCFG
- Note: the first time parsing with -cfg, -mcfg, and -fcfg might take a long time
- options that only work for the default parsing method:
+ Note: the first time parsing with -cfg, -mcfg, and -fcfg may take a long time
+ options that only work for the -old default parsing method:
-n non-strict: tolerates morphological errors
-ign ignore unknown words when parsing
-raw return context-free terms in raw form
@@ -565,6 +565,7 @@ q, quit: q
-lexer=chars each character is a token
-lexer=code use Haskell's lex
-lexer=codevars like code, but treat unknown words as variables, ?? as meta
+ -lexer=textvars like text, but treat unknown words as variables, ?? as meta
-lexer=text with conventions on punctuation and capital letters
-lexer=codelit like code, but treat unknown words as string literals
-lexer=textlit like text, but treat unknown words as string literals