nondeterministic lexer, e.g. subseqs

author: aarne <aarne@cs.chalmers.se> 2005-11-17 23:17:42 +0000
committer: aarne <aarne@cs.chalmers.se> 2005-11-17 23:17:42 +0000
commit: 524c4829f9cc5720c18b8d43bd430d0627edcb89 (patch)
tree: c10cc4dbb4b6f0bb5464369b1ed3d028c29fec18 /src/GF
parent: e29a1430bf76b00c3714b72b7763190df6716081 (diff)
5 files changed, 53 insertions, 29 deletions
diff --git a/src/GF/Data/Operations.hs b/src/GF/Data/Operations.hs
index 339a053cf..f5434486f 100644
--- a/src/GF/Data/Operations.hs
+++ b/src/GF/Data/Operations.hs
@@ -65,14 +65,14 @@ module GF.Data.Operations (-- * misc functions
 		   updateAssoc, removeAssoc,
 
 		   -- * chop into separator-separated parts
-		   chunks, readIntArg,
+		   chunks, readIntArg, subSequences,
 
 		   -- * state monad with error; from Agda 6\/11\/2001
 		   STM(..), appSTM, stm, stmr, readSTM, updateSTM, writeSTM, done,
 
 		   -- * error monad class
-		   ErrorMonad(..), checkAgain, checks, allChecks
-
+		   ErrorMonad(..), checkAgain, checks, allChecks, doUntil
+                
 		  ) where
 
 import Data.Char (isSpace, toUpper, isSpace, isDigit)
@@ -656,3 +656,16 @@ allChecks ms = case ms of
   (m: ms) -> let rs = allChecks ms in handle_ (liftM2 (:) m rs) rs
   _ -> return []
 
+doUntil :: ErrorMonad m => (a -> Bool) -> [m a] -> m a
+doUntil cond ms = case ms of
+  a:as -> do
+    v <- a
+    if cond v then return v else doUntil cond as
+  _ -> raise "no result"
+
+-- subsequences sorted from longest to shortest ; their number is 2^n
+subSequences :: [a] -> [[a]]
+subSequences = sortBy (\x y -> compare (length y) (length x)) . subs where
+  subs xs = case xs of
+    [] -> [[]]
+    x:xs -> let xss = subs xs in [x:y | y <- xss] ++ xss
diff --git a/src/GF/Shell/HelpFile.hs b/src/GF/Shell/HelpFile.hs
index 61532737b..8ec2971b6 100644
--- a/src/GF/Shell/HelpFile.hs
+++ b/src/GF/Shell/HelpFile.hs
@@ -5,9 +5,9 @@
 -- Stability   : (stable)
 -- Portability : (portable)
 --
--- > CVS $Date: 2005/11/14 16:03:41 $
+-- > CVS $Date: 2005/05/12 10:03:34 $
 -- > CVS $Author: aarne $
--- > CVS $Revision: 1.20 $
+-- > CVS $Revision: 1.9 $
 --
 -- Help on shell commands. Generated from HelpFile by 'make help'.
 -- PLEASE DON'T EDIT THIS FILE.
@@ -198,6 +198,7 @@ txtHelpFile =
   "\n      -lines   parse each line of input separately, ignoring empty lines" ++
   "\n      -all     as -lines, but also parse empty lines" ++
   "\n      -prob    rank results by probability" ++
+  "\n      -cut     stop after first lexing result leading to parser success" ++
   "\n  options for selecting parsing method:" ++
   "\n      (default)parse using an overgenerating CFG" ++
   "\n      -cfg     parse using a much less overgenerating CFG" ++
@@ -531,6 +532,8 @@ txtHelpFile =
   "\n    -lexer=codelit       like code, but treat unknown words as string literals" ++
   "\n    -lexer=textlit       like text, but treat unknown words as string literals" ++
   "\n    -lexer=codeC         use a C-like lexer" ++
+  "\n    -lexer=ignore        like literals, but ignore unknown words" ++
+  "\n    -lexer=subseqs       like ignore, but then try all subsequences from longest" ++
   "\n" ++
   "\n-number, the maximum number of generated items in a list. " ++
   "\n       The default is unlimited." ++
diff --git a/src/GF/Shell/ShellCommands.hs b/src/GF/Shell/ShellCommands.hs
index 56cedc202..121d8cda6 100644
--- a/src/GF/Shell/ShellCommands.hs
+++ b/src/GF/Shell/ShellCommands.hs
@@ -173,7 +173,7 @@ optionsOfCommand co = case co of
   CTransformGrammar _ -> flags "printer"
   CConvertLatex _ -> none
   CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer"
-  CParse -> both "new newer cfg mcfg n ign raw v lines all prob" 
+  CParse -> both "cut new newer cfg mcfg n ign raw v lines all prob" 
                  "cat lang lexer parser number rawtrees"
   CTranslate _ _ -> opts "cat lexer parser"
   CGenerateRandom -> both "cf prob" "cat lang number depth"
diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs
index 75294ff4b..26bad1ee9 100644
--- a/src/GF/UseGrammar/Custom.hs
+++ b/src/GF/UseGrammar/Custom.hs
@@ -161,7 +161,7 @@ customStringCommand  :: CustomData (StateGrammar -> String -> String)
 customParser         :: CustomData (StateGrammar -> CFCat -> CFParser)
 
 -- | useTokenizer, \"-lexer=x\"
-customTokenizer      :: CustomData (StateGrammar -> String -> [CFTok])  
+customTokenizer      :: CustomData (StateGrammar -> String -> [[CFTok]])  
 
 -- | useUntokenizer, \"-unlexer=x\" --- should be from token list to string
 customUntokenizer    :: CustomData (StateGrammar -> String -> String)  
@@ -416,22 +416,24 @@ customParser =
 -- add your own parsers here
   ]
 
-customTokenizer = 
+customTokenizer =
+  let sg = singleton in 
   customData "Tokenizers, selected by option -lexer=x" $
   [
-   (strCI "words",     const $ tokWords)
-  ,(strCI "literals",  const $ tokLits)
-  ,(strCI "vars",      const $ tokVars)
-  ,(strCI "chars",     const $ map (tS . singleton))
-  ,(strCI "code",      const $ lexHaskell)
-  ,(strCI "codevars",  lexHaskellVar . stateIsWord)
-  ,(strCI "text",      const $ lexText)
-  ,(strCI "unglue",    \gr -> map tS . decomposeWords (stateMorpho gr))
-  ,(strCI "codelit",   lexHaskellLiteral . stateIsWord)
-  ,(strCI "textlit",   lexTextLiteral . stateIsWord)
-  ,(strCI "codeC",     const $ lexC2M)
-  ,(strCI "ignore",    \gr -> lexIgnore (stateIsWord gr) . tokLits)
-  ,(strCI "codeCHigh", const $ lexC2M' True)
+   (strCI "words",     const $ sg . tokWords)
+  ,(strCI "literals",  const $ sg . tokLits)
+  ,(strCI "vars",      const $ sg . tokVars)
+  ,(strCI "chars",     const $ sg . map (tS . singleton))
+  ,(strCI "code",      const $ sg . lexHaskell)
+  ,(strCI "codevars",  \gr -> sg . (lexHaskellVar $ stateIsWord gr))
+  ,(strCI "text",      const $ sg . lexText)
+  ,(strCI "unglue",    \gr -> sg . map tS . decomposeWords (stateMorpho gr))
+  ,(strCI "codelit",   \gr -> sg . (lexHaskellLiteral $ stateIsWord gr))
+  ,(strCI "textlit",   \gr -> sg . (lexTextLiteral $ stateIsWord gr))
+  ,(strCI "codeC",     const $ sg . lexC2M)
+  ,(strCI "ignore",    \gr -> sg . lexIgnore (stateIsWord gr) . tokLits)
+  ,(strCI "subseqs",   \gr -> subSequences . lexIgnore (stateIsWord gr) . tokLits)
+  ,(strCI "codeCHigh", const $ sg . lexC2M' True)
 -- add your own tokenizers here
   ]
 
diff --git a/src/GF/UseGrammar/Parsing.hs b/src/GF/UseGrammar/Parsing.hs
index a9da37df5..a4699bcab 100644
--- a/src/GF/UseGrammar/Parsing.hs
+++ b/src/GF/UseGrammar/Parsing.hs
@@ -40,7 +40,7 @@ import qualified GF.Parsing.GFC as New
 
 import GF.Data.Operations
 
-import Data.List (nub)
+import Data.List (nub,sortBy)
 import Control.Monad (liftM)
 
 -- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
@@ -51,7 +51,7 @@ parseString os sg cat = liftM fst . parseStringMsg os sg cat
 parseStringMsg :: Options -> StateGrammar -> CFCat -> String -> Err ([Tree],String)
 parseStringMsg os sg cat s = do
   (ts,(_,ss)) <- checkStart $ parseStringC os sg cat s
-  return (ts,unlines ss)
+  return (ts, unlines $ reverse ss)
 
 parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
 
@@ -73,7 +73,10 @@ parseStringC opts0 sg cat s
 		| otherwise              = "c" -- default algorithm
       strategy  = maybe "bottomup" id $ getOptVal opts useParser -- -parser=bottomup/topdown
       tokenizer = customOrDefault opts useTokenizer customTokenizer sg
-  ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat (tokenizer s)
+      toks = case tokenizer s of
+               t:_ -> t
+               _ -> [] ---- no support for undet. tok.
+  ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
   ts' <- mapM (checkErr . annotate (stateGrammarST sg) . refreshMetas []) ts
   return $ optIntOrAll opts flagNumber ts'
 
@@ -82,10 +85,11 @@ parseStringC opts0 sg cat s = do
       cf  = stateCF sg
       gr  = stateGrammarST sg
       cn  = cncId sg
-      tok = customOrDefault opts useTokenizer customTokenizer sg
+      toks = customOrDefault opts useTokenizer customTokenizer sg s
       parser = customOrDefault opts useParser customParser sg cat
-  tokens2trms opts sg cn parser (tok s)
-
+  if oElem (iOpt "cut") opts 
+    then doUntil (not . null) $ map (tokens2trms opts sg cn parser) toks
+    else mapM (tokens2trms opts sg cn parser) toks >>= return . concat
 
 tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
 tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
@@ -93,10 +97,12 @@ tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
 	  info   = snd result
 	  trees  = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2))
 
-trees2trms :: Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
+trees2trms :: 
+  Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
 trees2trms opts sg cn as ts0 info = do
+  let s = unwords $ map prCFTok as
   ts  <- case () of
-    _ | null ts0 -> checkWarn "No success in cf parsing" >> return []
+    _ | null ts0 -> checkWarn ("No success in cf parsing" +++ s) >> return []
     _ | raw      -> do
       ts1 <- return (map cf2trm0 ts0) ----- should not need annot
       checks [
author	aarne <aarne@cs.chalmers.se>	2005-11-17 23:17:42 +0000
committer	aarne <aarne@cs.chalmers.se>	2005-11-17 23:17:42 +0000
commit	524c4829f9cc5720c18b8d43bd430d0627edcb89 (patch)
tree	c10cc4dbb4b6f0bb5464369b1ed3d028c29fec18 /src/GF
parent	e29a1430bf76b00c3714b72b7763190df6716081 (diff)