diff options
| author | aarne <aarne@cs.chalmers.se> | 2007-06-11 07:49:30 +0000 |
|---|---|---|
| committer | aarne <aarne@cs.chalmers.se> | 2007-06-11 07:49:30 +0000 |
| commit | a22871d07485821dac45a03380f77bdb28240ce6 (patch) | |
| tree | 7109a1e3c41e5466010952ef5d887c88f34d6773 /src/GF/API.hs | |
| parent | 2353e930e3f5ca7f3448860d59255ae7c0cd23b9 (diff) | |
initial check for unknown words in parsing
Diffstat (limited to 'src/GF/API.hs')
| -rw-r--r-- | src/GF/API.hs | 19 |
1 files changed, 15 insertions, 4 deletions
diff --git a/src/GF/API.hs b/src/GF/API.hs
index 762fa372f..3efd81472 100644
--- a/src/GF/API.hs
+++ b/src/GF/API.hs
@@ -75,6 +75,7 @@ import GF.Infra.UseIO
 import GF.Data.Zipper
 
 import Data.List (nub)
+import Data.Char (toLower)
 import Data.Maybe (fromMaybe)
 import Control.Monad (liftM)
 import System (system)
@@ -314,9 +315,16 @@ morphoAnalyse opts gr mo = morpho gr
 
 
 isKnownWord :: GFGrammar -> String -> Bool
-isKnownWord gr s = case morphoAnalyse (options [beShort]) gr s of
-  a@(_:_:_) -> last (init a) /= '*' -- [word *]
-  _ -> False
+isKnownWord gr s = GF.UseGrammar.Morphology.isKnownWord (morpho gr) s
+
+unknownTokens :: GFGrammar -> [CFTok] -> [String]
+unknownTokens gr ts =
+  [w | TC w <- ts, unk w && unk (uncap w)] ++ [w | TS w <- ts, unk w]
+  where
+    unk w = not $ GF.API.isKnownWord gr w
+    uncap (c:cs) = toLower c : cs
+    uncap s = s
+
 
 {-
 prExpXML :: StateGrammar -> Term -> [String]
@@ -397,8 +405,11 @@ optTransfer opts g = case getOptVal opts transferFun of
   _ -> id
 -}
 
+optTokenizerResult :: Options -> GFGrammar -> String -> [[CFTok]]
+optTokenizerResult opts gr = customOrDefault opts useTokenizer customTokenizer gr
+
 optTokenizer :: Options -> GFGrammar -> String -> String
-optTokenizer opts gr = show . customOrDefault opts useTokenizer customTokenizer gr
+optTokenizer opts gr = show . optTokenizerResult opts gr
 
 
 -- performs UTF8 if the language does not have flag coding=utf8; replaces name*U
