summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorkrasimir <krasimir@chalmers.se>2009-10-23 08:35:32 +0000
committerkrasimir <krasimir@chalmers.se>2009-10-23 08:35:32 +0000
commitd5f4669aec26c6a580a28b05a6005425ad663555 (patch)
treea0ea03cadac7cce788e3359ed48434837fcd0fcd /src
parent70b5e2a93037603f9f5b20c46e9ad15a95a7c097 (diff)
experimental robust parser
Diffstat (limited to 'src')
-rw-r--r--src/GF/Command/Commands.hs18
-rw-r--r--src/GF/Data/TrieMap.hs11
-rw-r--r--src/GFI.hs8
-rw-r--r--src/PGF.hs17
-rw-r--r--src/PGF/Parsing/FCFG/Incremental.hs93
5 files changed, 122 insertions, 25 deletions
diff --git a/src/GF/Command/Commands.hs b/src/GF/Command/Commands.hs
index 1b12d82cc..d182b65ba 100644
--- a/src/GF/Command/Commands.hs
+++ b/src/GF/Command/Commands.hs
@@ -389,12 +389,17 @@ allCommands cod env@(pgf, mos) = Map.fromList [
"Shows all trees returned by parsing a string in the grammars in scope.",
"The -lang flag can be used to restrict this to fewer languages.",
"The default start category can be overridden by the -cat flag.",
- "See also the ps command for lexing and character encoding."
+ "See also the ps command for lexing and character encoding.",
+ "",
+ "The -openclass flag is experimental and allows some robustness in ",
+ "the parser. For example if -openclass=\"A,N,V\" is given, the parser",
+ "will accept unknown adjectives, nouns and verbs with the resource grammar."
],
exec = \opts -> returnFromExprs . concatMap (par opts) . toStrings,
flags = [
("cat","target category of parsing"),
- ("lang","the languages of parsing (comma-separated, no spaces)")
+ ("lang","the languages of parsing (comma-separated, no spaces)"),
+ ("openclass","list of open-class categories for robust parsing")
]
}),
("pg", emptyCommandInfo { -----
@@ -742,7 +747,9 @@ allCommands cod env@(pgf, mos) = Map.fromList [
]
where
enc = encodeUnicode cod
- par opts s = concat [parse pgf lang (optType opts) s | lang <- optLangs opts, canParse pgf lang]
+ par opts s = case optOpenTypes opts of
+ [] -> concat [parse pgf lang (optType opts) s | lang <- optLangs opts, canParse pgf lang]
+ open_typs -> concat [parseWithRecovery pgf lang (optType opts) open_typs s | lang <- optLangs opts, canParse pgf lang]
void = ([],[])
@@ -789,6 +796,11 @@ allCommands cod env@(pgf, mos) = Map.fromList [
"" -> languages pgf
lang -> map mkCId (chunks ',' lang)
optLang opts = head $ optLangs opts ++ [wildCId]
+
+ optOpenTypes opts = case valStrOpts "openclass" "" opts of
+ "" -> []
+ cats -> mapMaybe readType (chunks ',' cats)
+
optType opts =
let str = valStrOpts "cat" (showCId $ lookStartCat pgf) opts
in case readType str of
diff --git a/src/GF/Data/TrieMap.hs b/src/GF/Data/TrieMap.hs
index 37c56fc3a..a6749d641 100644
--- a/src/GF/Data/TrieMap.hs
+++ b/src/GF/Data/TrieMap.hs
@@ -12,6 +12,9 @@ module GF.Data.TrieMap
, insertWith
, unionWith
+ , unionsWith
+
+ , elems
) where
import Prelude hiding (lookup, null)
@@ -53,3 +56,11 @@ unionWith f (Tr mb_v1 m1) (Tr mb_v2 m2) =
(Just v1,Just v2) -> Just (f v1 v2)
m = Map.unionWith (unionWith f) m1 m2
in Tr mb_v m
+
+unionsWith :: Ord k => (v -> v -> v) -> [TrieMap k v] -> TrieMap k v
+unionsWith f = foldl (unionWith f) empty
+
+elems :: TrieMap k v -> [v]
+elems tr = collect tr []
+ where
+ collect (Tr mb_v m) xs = maybe id (:) mb_v (Map.fold collect xs m)
diff --git a/src/GFI.hs b/src/GFI.hs
index 7b98f0ff8..727e91209 100644
--- a/src/GFI.hs
+++ b/src/GFI.hs
@@ -262,7 +262,7 @@ wordCompletion gfenv line0 prefix0 p =
-> do mb_state0 <- try (evaluate (initState pgf (optLang opts) (optType opts)))
case mb_state0 of
Right state0 -> let ws = words (take (length s - length prefix) s)
- in case foldM nextState state0 ws of
+ in case loop state0 ws of
Nothing -> ret ' ' []
Just state -> let compls = getCompletions state prefix
in ret ' ' (map (encode gfenv) (Map.keys compls))
@@ -295,7 +295,11 @@ wordCompletion gfenv line0 prefix0 p =
Just ty -> ty
Nothing -> error ("Can't parse '"++str++"' as type")
-
+ loop ps [] = Just ps
+ loop ps (t:ts) = case nextState ps t of
+ Left es -> Nothing
+ Right ps -> loop ps ts
+
ret c [x] = return [x++[c]]
ret _ xs = return xs
diff --git a/src/PGF.hs b/src/PGF.hs
index b9ad357c9..81e6d5024 100644
--- a/src/PGF.hs
+++ b/src/PGF.hs
@@ -29,7 +29,7 @@ module PGF(
-- * Types
Type, Hypo,
showType, readType,
- mkType, mkHypo, mkDepHypo, mkImplHypo,
+ mkType, mkHypo, mkDepHypo, mkImplHypo,
categories, startCat,
-- * Functions
@@ -54,7 +54,7 @@ module PGF(
showPrintName,
-- ** Parsing
- parse, canParse, parseAllLang, parseAll,
+ parse, parseWithRecovery, canParse, parseAllLang, parseAll,
-- ** Evaluation
PGF.compute, paraphrase,
@@ -75,7 +75,7 @@ module PGF(
-- ** Word Completion (Incremental Parsing)
complete,
Incremental.ParseState,
- Incremental.initState, Incremental.nextState, Incremental.getCompletions, Incremental.extractTrees,
+ Incremental.initState, Incremental.nextState, Incremental.getCompletions, Incremental.recoveryStates, Incremental.extractTrees,
-- ** Generation
generateRandom, generateAll, generateAllDepth,
@@ -131,6 +131,8 @@ linearize :: PGF -> Language -> Tree -> String
-- for parsing, see 'canParse'.
parse :: PGF -> Language -> Type -> String -> [Tree]
+parseWithRecovery :: PGF -> Language -> Type -> [Type] -> String -> [Tree]
+
-- | Checks whether the given language can be used for parsing.
canParse :: PGF -> Language -> Bool
@@ -241,6 +243,8 @@ parse pgf lang typ s =
Nothing -> error ("No parser built for language: " ++ showCId lang)
Nothing -> error ("Unknown language: " ++ showCId lang)
+parseWithRecovery pgf lang typ open_typs s = Incremental.parseWithRecovery pgf lang typ open_typs (words s)
+
canParse pgf cnc = isJust (lookParser pgf cnc)
linearizeAll mgr = map snd . linearizeAllLang mgr
@@ -282,7 +286,7 @@ functionType pgf fun =
complete pgf from typ input =
let (ws,prefix) = tokensAndPrefix input
state0 = Incremental.initState pgf from typ
- in case foldM Incremental.nextState state0 ws of
+ in case loop state0 ws of
Nothing -> []
Just state ->
(if null prefix && not (null (Incremental.extractTrees state typ)) then [unwords ws ++ " "] else [])
@@ -294,6 +298,11 @@ complete pgf from typ input =
| otherwise = (init ws, last ws)
where ws = words s
+ loop ps [] = Just ps
+ loop ps (t:ts) = case Incremental.nextState ps t of
+ Left es -> Nothing
+ Right ps -> loop ps ts
+
-- | Converts an expression to normal form
compute :: PGF -> Expr -> Expr
compute pgf = PGF.Data.normalForm (funs (abstract pgf)) 0 []
diff --git a/src/PGF/Parsing/FCFG/Incremental.hs b/src/PGF/Parsing/FCFG/Incremental.hs
index 6ae18e3bf..dbb87556d 100644
--- a/src/PGF/Parsing/FCFG/Incremental.hs
+++ b/src/PGF/Parsing/FCFG/Incremental.hs
@@ -1,11 +1,14 @@
{-# LANGUAGE BangPatterns #-}
module PGF.Parsing.FCFG.Incremental
( ParseState
+ , ErrorState
, initState
, nextState
, getCompletions
+ , recoveryStates
, extractTrees
, parse
+ , parseWithRecovery
) where
import Data.Array.IArray
@@ -26,8 +29,28 @@ import PGF.Macros
import PGF.TypeCheck
import Debug.Trace
-parse :: PGF -> Language -> Type -> [String] -> [Expr]
-parse pgf lang typ toks = maybe [] (\ps -> extractTrees ps typ) (foldM nextState (initState pgf lang typ) toks)
+parse :: PGF -> Language -> Type -> [String] -> [Tree]
+parse pgf lang typ toks = loop (initState pgf lang typ) toks
+ where
+ loop ps [] = extractTrees ps typ
+ loop ps (t:ts) = case nextState ps t of
+ Left es -> []
+ Right ps -> loop ps ts
+
+parseWithRecovery :: PGF -> Language -> Type -> [Type] -> [String] -> [Tree]
+parseWithRecovery pgf lang typ open_typs toks = accept (initState pgf lang typ) toks
+ where
+ accept ps [] = extractTrees ps typ
+ accept ps (t:ts) =
+ case nextState ps t of
+ Right ps -> accept ps ts
+ Left es -> skip (recoveryStates open_typs es) ts
+
+ skip ps_map [] = extractTrees (fst ps_map) typ
+ skip ps_map (t:ts) =
+ case Map.lookup t (snd ps_map) of
+ Just ps -> accept ps ts
+ Nothing -> skip ps_map ts
-- | Creates an initial parsing state for a given language and
-- startup category.
@@ -46,16 +69,18 @@ initState pgf lang (DTyp _ start _) =
Just pinfo -> pinfo
_ -> error ("Unknown language: " ++ showCId lang)
- in State pgf
- pinfo
- (Chart emptyAC [] emptyPC (productions pinfo) (totalCats pinfo) 0)
- (TMap.singleton [] (Set.fromList items))
+ in PState pgf
+ pinfo
+ (Chart emptyAC [] emptyPC (productions pinfo) (totalCats pinfo) 0)
+ (TMap.singleton [] (Set.fromList items))
-- | From the current state and the next token
--- 'nextState' computes a new state where the token
--- is consumed and the current position shifted by one.
-nextState :: ParseState -> String -> Maybe ParseState
-nextState (State pgf pinfo chart items) t =
+-- 'nextState' computes a new state, where the token
+-- is consumed and the current position is shifted by one.
+-- If the new token cannot be accepted then an error state
+-- is returned.
+nextState :: ParseState -> String -> Either ErrorState ParseState
+nextState (PState pgf pinfo chart items) t =
let (mb_agenda,map_items) = TMap.decompose items
agenda = maybe [] Set.toList mb_agenda
acc = fromMaybe TMap.empty (Map.lookup t map_items)
@@ -66,8 +91,8 @@ nextState (State pgf pinfo chart items) t =
, offset =offset chart1+1
}
in if TMap.null acc1
- then Nothing
- else Just (State pgf pinfo chart2 acc1)
+ then Left (EState pgf pinfo chart2)
+ else Right (PState pgf pinfo chart2 acc1)
where
add (tok:toks) item acc
| tok == t = TMap.insertWith Set.union toks (Set.singleton item) acc
@@ -78,7 +103,7 @@ nextState (State pgf pinfo chart items) t =
-- next words and the consequent states. This is used for word completions in
-- the GF interpreter.
getCompletions :: ParseState -> String -> Map.Map String ParseState
-getCompletions (State pgf pinfo chart items) w =
+getCompletions (PState pgf pinfo chart items) w =
let (mb_agenda,map_items) = TMap.decompose items
agenda = maybe [] Set.toList mb_agenda
acc = Map.filterWithKey (\tok _ -> isPrefixOf w tok) map_items
@@ -88,12 +113,34 @@ getCompletions (State pgf pinfo chart items) w =
, passive=emptyPC
, offset =offset chart1+1
}
- in fmap (State pgf pinfo chart2) acc'
+ in fmap (PState pgf pinfo chart2) acc'
where
add (tok:toks) item acc
| isPrefixOf w tok = Map.insertWith (TMap.unionWith Set.union) tok (TMap.singleton toks (Set.singleton item)) acc
add _ item acc = acc
+recoveryStates :: [Type] -> ErrorState -> (ParseState, Map.Map String ParseState)
+recoveryStates open_types (EState pgf pinfo chart) =
+ let open_fcats = concatMap type2fcats open_types
+ agenda = foldl (complete open_fcats) [] (actives chart)
+ (acc,chart1) = process Nothing add (sequences pinfo) (functions pinfo) agenda Map.empty chart
+ chart2 = chart1{ active =emptyAC
+ , actives=active chart1 : actives chart1
+ , passive=emptyPC
+ , offset =offset chart1+1
+ }
+ in (PState pgf pinfo chart (TMap.singleton [] (Set.fromList agenda)), fmap (PState pgf pinfo chart2) acc)
+ where
+ type2fcats (DTyp _ cat _) = fromMaybe [] (Map.lookup cat (startCats pinfo))
+
+ complete open_fcats items ac =
+ foldl (Set.fold (\(Active j' ppos funid seqid args keyc) ->
+ (:) (Active j' (ppos+1) funid seqid args keyc)))
+ items
+ [set | fcat <- open_fcats, set <- lookupACByFCat fcat ac]
+
+ add (tok:toks) item acc = Map.insertWith (TMap.unionWith Set.union) tok (TMap.singleton toks (Set.singleton item)) acc
+
-- | This function extracts the list of all completed parse trees
-- that spans the whole input consumed so far. The trees are also
-- limited by the category specified, which is usually
@@ -189,7 +236,7 @@ process mbt fn !seqs !funs (item@(Active j ppos funid seqid args key0):items) ac
Just set -> Set.fold (\(Active j' ppos funid seqid args keyc) ->
let FSymCat d _ = unsafeAt (unsafeAt seqs seqid) ppos
in (:) (Active j' (ppos+1) funid seqid (updateAt d fid args) keyc)) items set
- in process mbt fn seqs funs items2 acc chart{passive=insertPC (mkPK key0 j) fid (passive chart)
+ in process mbt fn seqs funs items2 acc chart{passive=insertPC (mkPK key0 j) fid (passive chart)
,forest =IntMap.insert fid (Set.singleton (FApply funid args)) (forest chart)
,nextId =nextId chart+1
}
@@ -243,6 +290,12 @@ emptyAC = IntMap.empty
lookupAC :: ActiveKey -> ActiveChart -> Maybe (Set.Set Active)
lookupAC (AK fcat l) chart = IntMap.lookup fcat chart >>= IntMap.lookup l
+lookupACByFCat :: FCat -> ActiveChart -> [Set.Set Active]
+lookupACByFCat fcat chart =
+ case IntMap.lookup fcat chart of
+ Nothing -> []
+ Just map -> IntMap.elems map
+
labelsAC :: FCat -> ActiveChart -> [FIndex]
labelsAC fcat chart =
case IntMap.lookup fcat chart of
@@ -296,7 +349,7 @@ foldForest f g b fcat forest =
-- | An abstract data type whose values represent
-- the current state in an incremental parser.
-data ParseState = State PGF ParserInfo Chart (TMap.TrieMap String (Set.Set Active))
+data ParseState = PState PGF ParserInfo Chart (TMap.TrieMap String (Set.Set Active))
data Chart
= Chart
@@ -308,3 +361,11 @@ data Chart
, offset :: {-# UNPACK #-} !Int
}
deriving Show
+
+----------------------------------------------------------------
+-- Error State
+----------------------------------------------------------------
+
+-- | An abstract data type whose values represent
+-- the state in an incremental parser after an error.
+data ErrorState = EState PGF ParserInfo Chart