summary | refs | log | tree | commit | diff
path: root/src/GF/API.hs
diff options
context:
space:
mode:
author aarne <aarne@cs.chalmers.se> 2007-06-11 07:49:30 +0000
committer aarne <aarne@cs.chalmers.se> 2007-06-11 07:49:30 +0000
commit a22871d07485821dac45a03380f77bdb28240ce6 (patch)
tree 7109a1e3c41e5466010952ef5d887c88f34d6773 /src/GF/API.hs
parent 2353e930e3f5ca7f3448860d59255ae7c0cd23b9 (diff)
initial check for unknown words in parsing
Diffstat (limited to 'src/GF/API.hs')
-rw-r--r-- src/GF/API.hs 19
1 files changed, 15 insertions, 4 deletions
diff --git a/src/GF/API.hs b/src/GF/API.hs
index 762fa372f..3efd81472 100644
--- a/src/GF/API.hs
+++ b/src/GF/API.hs
@@ -75,6 +75,7 @@ import GF.Infra.UseIO
import GF.Data.Zipper
import Data.List (nub)
+import Data.Char (toLower)
import Data.Maybe (fromMaybe)
import Control.Monad (liftM)
import System (system)
@@ -314,9 +315,16 @@ morphoAnalyse opts gr
mo = morpho gr
isKnownWord :: GFGrammar -> String -> Bool
-isKnownWord gr s = case morphoAnalyse (options [beShort]) gr s of
- a@(_:_:_) -> last (init a) /= '*' -- [word *]
- _ -> False
+isKnownWord gr s = GF.UseGrammar.Morphology.isKnownWord (morpho gr) s
+
+unknownTokens :: GFGrammar -> [CFTok] -> [String]
+unknownTokens gr ts =
+ [w | TC w <- ts, unk w && unk (uncap w)] ++ [w | TS w <- ts, unk w]
+ where
+ unk w = not $ GF.API.isKnownWord gr w
+ uncap (c:cs) = toLower c : cs
+ uncap s = s
+
{-
prExpXML :: StateGrammar -> Term -> [String]
@@ -397,8 +405,11 @@ optTransfer opts g = case getOptVal opts transferFun of
_ -> id
-}
+optTokenizerResult :: Options -> GFGrammar -> String -> [[CFTok]]
+optTokenizerResult opts gr = customOrDefault opts useTokenizer customTokenizer gr
+
optTokenizer :: Options -> GFGrammar -> String -> String
-optTokenizer opts gr = show . customOrDefault opts useTokenizer customTokenizer gr
+optTokenizer opts gr = show . optTokenizerResult opts gr
-- performs UTF8 if the language does not have flag coding=utf8; replaces name*U