summaryrefslogtreecommitdiff
path: root/src/GF
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2005-11-17 23:17:42 +0000
committeraarne <aarne@cs.chalmers.se>2005-11-17 23:17:42 +0000
commit524c4829f9cc5720c18b8d43bd430d0627edcb89 (patch)
treec10cc4dbb4b6f0bb5464369b1ed3d028c29fec18 /src/GF
parente29a1430bf76b00c3714b72b7763190df6716081 (diff)
nondeterministic lexer, e.g. subseqs
Diffstat (limited to 'src/GF')
-rw-r--r--src/GF/Data/Operations.hs19
-rw-r--r--src/GF/Shell/HelpFile.hs7
-rw-r--r--src/GF/Shell/ShellCommands.hs2
-rw-r--r--src/GF/UseGrammar/Custom.hs32
-rw-r--r--src/GF/UseGrammar/Parsing.hs22
5 files changed, 53 insertions, 29 deletions
diff --git a/src/GF/Data/Operations.hs b/src/GF/Data/Operations.hs
index 339a053cf..f5434486f 100644
--- a/src/GF/Data/Operations.hs
+++ b/src/GF/Data/Operations.hs
@@ -65,14 +65,14 @@ module GF.Data.Operations (-- * misc functions
updateAssoc, removeAssoc,
-- * chop into separator-separated parts
- chunks, readIntArg,
+ chunks, readIntArg, subSequences,
-- * state monad with error; from Agda 6\/11\/2001
STM(..), appSTM, stm, stmr, readSTM, updateSTM, writeSTM, done,
-- * error monad class
- ErrorMonad(..), checkAgain, checks, allChecks
-
+ ErrorMonad(..), checkAgain, checks, allChecks, doUntil
+
) where
import Data.Char (isSpace, toUpper, isSpace, isDigit)
@@ -656,3 +656,16 @@ allChecks ms = case ms of
(m: ms) -> let rs = allChecks ms in handle_ (liftM2 (:) m rs) rs
_ -> return []
+doUntil :: ErrorMonad m => (a -> Bool) -> [m a] -> m a
+doUntil cond ms = case ms of
+ a:as -> do
+ v <- a
+ if cond v then return v else doUntil cond as
+ _ -> raise "no result"
+
+-- subsequences sorted from longest to shortest ; their number is 2^n
+subSequences :: [a] -> [[a]]
+subSequences = sortBy (\x y -> compare (length y) (length x)) . subs where
+ subs xs = case xs of
+ [] -> [[]]
+ x:xs -> let xss = subs xs in [x:y | y <- xss] ++ xss
diff --git a/src/GF/Shell/HelpFile.hs b/src/GF/Shell/HelpFile.hs
index 61532737b..8ec2971b6 100644
--- a/src/GF/Shell/HelpFile.hs
+++ b/src/GF/Shell/HelpFile.hs
@@ -5,9 +5,9 @@
-- Stability : (stable)
-- Portability : (portable)
--
--- > CVS $Date: 2005/11/14 16:03:41 $
+-- > CVS $Date: 2005/05/12 10:03:34 $
-- > CVS $Author: aarne $
--- > CVS $Revision: 1.20 $
+-- > CVS $Revision: 1.9 $
--
-- Help on shell commands. Generated from HelpFile by 'make help'.
-- PLEASE DON'T EDIT THIS FILE.
@@ -198,6 +198,7 @@ txtHelpFile =
"\n -lines parse each line of input separately, ignoring empty lines" ++
"\n -all as -lines, but also parse empty lines" ++
"\n -prob rank results by probability" ++
+ "\n -cut stop after first lexing result leading to parser success" ++
"\n options for selecting parsing method:" ++
"\n (default)parse using an overgenerating CFG" ++
"\n -cfg parse using a much less overgenerating CFG" ++
@@ -531,6 +532,8 @@ txtHelpFile =
"\n -lexer=codelit like code, but treat unknown words as string literals" ++
"\n -lexer=textlit like text, but treat unknown words as string literals" ++
"\n -lexer=codeC use a C-like lexer" ++
+ "\n -lexer=ignore like literals, but ignore unknown words" ++
+ "\n -lexer=subseqs like ignore, but then try all subsequences from longest" ++
"\n" ++
"\n-number, the maximum number of generated items in a list. " ++
"\n The default is unlimited." ++
diff --git a/src/GF/Shell/ShellCommands.hs b/src/GF/Shell/ShellCommands.hs
index 56cedc202..121d8cda6 100644
--- a/src/GF/Shell/ShellCommands.hs
+++ b/src/GF/Shell/ShellCommands.hs
@@ -173,7 +173,7 @@ optionsOfCommand co = case co of
CTransformGrammar _ -> flags "printer"
CConvertLatex _ -> none
CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer"
- CParse -> both "new newer cfg mcfg n ign raw v lines all prob"
+ CParse -> both "cut new newer cfg mcfg n ign raw v lines all prob"
"cat lang lexer parser number rawtrees"
CTranslate _ _ -> opts "cat lexer parser"
CGenerateRandom -> both "cf prob" "cat lang number depth"
diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs
index 75294ff4b..26bad1ee9 100644
--- a/src/GF/UseGrammar/Custom.hs
+++ b/src/GF/UseGrammar/Custom.hs
@@ -161,7 +161,7 @@ customStringCommand :: CustomData (StateGrammar -> String -> String)
customParser :: CustomData (StateGrammar -> CFCat -> CFParser)
-- | useTokenizer, \"-lexer=x\"
-customTokenizer :: CustomData (StateGrammar -> String -> [CFTok])
+customTokenizer :: CustomData (StateGrammar -> String -> [[CFTok]])
-- | useUntokenizer, \"-unlexer=x\" --- should be from token list to string
customUntokenizer :: CustomData (StateGrammar -> String -> String)
@@ -416,22 +416,24 @@ customParser =
-- add your own parsers here
]
-customTokenizer =
+customTokenizer =
+ let sg = singleton in
customData "Tokenizers, selected by option -lexer=x" $
[
- (strCI "words", const $ tokWords)
- ,(strCI "literals", const $ tokLits)
- ,(strCI "vars", const $ tokVars)
- ,(strCI "chars", const $ map (tS . singleton))
- ,(strCI "code", const $ lexHaskell)
- ,(strCI "codevars", lexHaskellVar . stateIsWord)
- ,(strCI "text", const $ lexText)
- ,(strCI "unglue", \gr -> map tS . decomposeWords (stateMorpho gr))
- ,(strCI "codelit", lexHaskellLiteral . stateIsWord)
- ,(strCI "textlit", lexTextLiteral . stateIsWord)
- ,(strCI "codeC", const $ lexC2M)
- ,(strCI "ignore", \gr -> lexIgnore (stateIsWord gr) . tokLits)
- ,(strCI "codeCHigh", const $ lexC2M' True)
+ (strCI "words", const $ sg . tokWords)
+ ,(strCI "literals", const $ sg . tokLits)
+ ,(strCI "vars", const $ sg . tokVars)
+ ,(strCI "chars", const $ sg . map (tS . singleton))
+ ,(strCI "code", const $ sg . lexHaskell)
+ ,(strCI "codevars", \gr -> sg . (lexHaskellVar $ stateIsWord gr))
+ ,(strCI "text", const $ sg . lexText)
+ ,(strCI "unglue", \gr -> sg . map tS . decomposeWords (stateMorpho gr))
+ ,(strCI "codelit", \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))
+ ,(strCI "textlit", \gr -> sg . (lexTextLiteral $ stateIsWord gr))
+ ,(strCI "codeC", const $ sg . lexC2M)
+ ,(strCI "ignore", \gr -> sg . lexIgnore (stateIsWord gr) . tokLits)
+ ,(strCI "subseqs", \gr -> subSequences . lexIgnore (stateIsWord gr) . tokLits)
+ ,(strCI "codeCHigh", const $ sg . lexC2M' True)
-- add your own tokenizers here
]
diff --git a/src/GF/UseGrammar/Parsing.hs b/src/GF/UseGrammar/Parsing.hs
index a9da37df5..a4699bcab 100644
--- a/src/GF/UseGrammar/Parsing.hs
+++ b/src/GF/UseGrammar/Parsing.hs
@@ -40,7 +40,7 @@ import qualified GF.Parsing.GFC as New
import GF.Data.Operations
-import Data.List (nub)
+import Data.List (nub,sortBy)
import Control.Monad (liftM)
-- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
@@ -51,7 +51,7 @@ parseString os sg cat = liftM fst . parseStringMsg os sg cat
parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String)
parseStringMsg os sg cat s = do
(ts,(_,ss)) <- checkStart $ parseStringC os sg cat s
- return (ts,unlines ss)
+ return (ts, unlines $ reverse ss)
parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
@@ -73,7 +73,10 @@ parseStringC opts0 sg cat s
| otherwise = "c" -- default algorithm
strategy = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
- ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat (tokenizer s)
+ toks = case tokenizer s of
+ t:_ -> t
+ _ -> [] ---- no support for undet. tok.
+ ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
return $ optIntOrAll opts flagNumber ts'
@@ -82,10 +85,11 @@ parseStringC opts0 sg cat s = do
cf = stateCF sg
gr = stateGrammarST sg
cn = cncId sg
- tok = customOrDefault opts useTokenizer customTokenizer sg
+ toks = customOrDefault opts useTokenizer customTokenizer sg s
parser = customOrDefault opts useParser customParser sg cat
- tokens2trms opts sg cn parser (tok s)
-
+ if oElem (iOpt "cut") opts
+ then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
+ else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
@@ -93,10 +97,12 @@ tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
info = snd result
trees = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2))
-trees2trms :: Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
+trees2trms ::
+ Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
trees2trms opts sg cn as ts0 info = do
+ let s = unwords $ map prCFTok as
ts <- case () of
- _ | null ts0 -> checkWarn "No success in cf parsing" >> return []
+ _ | null ts0 -> checkWarn ("No success in cf parsing" +++ s) >> return []
_ | raw -> do
ts1 <- return (map cf2trm0 ts0) ----- should not need annot
checks [