From a22871d07485821dac45a03380f77bdb28240ce6 Mon Sep 17 00:00:00 2001 From: aarne Date: Mon, 11 Jun 2007 07:49:30 +0000 Subject: initial check for unknown words in parsing --- examples/regulus/Toy0.gf | 18 ------------------ examples/regulus/Toy0Eng.gf | 28 ---------------------------- examples/regulus/Toy0Fin.gf | 5 ----- examples/regulus/Toy0Fre.gf | 30 ------------------------------ examples/regulus/Toy0Ger.gf | 30 ------------------------------ examples/regulus/Toy0I.gf | 20 -------------------- examples/regulus/Toy0Swe.gf | 5 ----- examples/regulus/Toy0_eng.gf | 39 --------------------------------------- examples/regulus/toy0/Toy0.gf | 18 ++++++++++++++++++ examples/regulus/toy0/Toy0Eng.gf | 28 ++++++++++++++++++++++++++++ examples/regulus/toy0/Toy0Fin.gf | 5 +++++ examples/regulus/toy0/Toy0Fre.gf | 30 ++++++++++++++++++++++++++++++ examples/regulus/toy0/Toy0Ger.gf | 30 ++++++++++++++++++++++++++++++ examples/regulus/toy0/Toy0I.gf | 20 ++++++++++++++++++++ examples/regulus/toy0/Toy0Swe.gf | 5 +++++ examples/regulus/toy0/Toy0_eng.gf | 39 +++++++++++++++++++++++++++++++++++++++ src/GF/API.hs | 19 +++++++++++++++---- src/GF/Shell.hs | 2 +- src/GF/UseGrammar/Parsing.hs | 21 +++++++++++++++++---- 19 files changed, 208 insertions(+), 184 deletions(-) delete mode 100644 examples/regulus/Toy0.gf delete mode 100644 examples/regulus/Toy0Eng.gf delete mode 100644 examples/regulus/Toy0Fin.gf delete mode 100644 examples/regulus/Toy0Fre.gf delete mode 100644 examples/regulus/Toy0Ger.gf delete mode 100644 examples/regulus/Toy0I.gf delete mode 100644 examples/regulus/Toy0Swe.gf delete mode 100644 examples/regulus/Toy0_eng.gf create mode 100644 examples/regulus/toy0/Toy0.gf create mode 100644 examples/regulus/toy0/Toy0Eng.gf create mode 100644 examples/regulus/toy0/Toy0Fin.gf create mode 100644 examples/regulus/toy0/Toy0Fre.gf create mode 100644 examples/regulus/toy0/Toy0Ger.gf create mode 100644 examples/regulus/toy0/Toy0I.gf create mode 100644 examples/regulus/toy0/Toy0Swe.gf create mode 100644 examples/regulus/toy0/Toy0_eng.gf diff --git a/examples/regulus/Toy0.gf b/examples/regulus/Toy0.gf deleted file mode 100644 index ed0ba51e8..000000000 --- a/examples/regulus/Toy0.gf +++ /dev/null @@ -1,18 +0,0 @@ -abstract Toy0 = { - --- grammar from Chapter 2 of the Regulus book - -flags startcat=MAIN ; - -cat - MAIN ; NP ; Noun ; Spec ; - -fun - Main : NP -> MAIN ; - SpecNoun : Spec -> Noun -> NP ; - - One, Two : Spec ; - Felis, Canis : Noun ; - -} - diff --git a/examples/regulus/Toy0Eng.gf b/examples/regulus/Toy0Eng.gf deleted file mode 100644 index 68e2e4c07..000000000 --- a/examples/regulus/Toy0Eng.gf +++ /dev/null @@ -1,28 +0,0 @@ -concrete Toy0Eng of Toy0 = { - -param - Number = Sg | Pl ; - -lincat - Spec = {s : Str ; n : Number} ; - Noun = {s : Number => Str} ; - MAIN,NP = {s : Str} ; - -lin - Main np = np ; - SpecNoun spec noun = {s = spec.s ++ noun.s ! spec.n} ; - - One = {s = "one" ; n = Sg} ; - Two = {s = "two" ; n = Pl} ; - - Felis = regNoun "cat" ; - Canis = regNoun "dog" ; - -oper - regNoun : Str -> {s : Number => Str} = \s -> { - s = table { - Sg => s ; - Pl => s + "s" - } - } ; -} diff --git a/examples/regulus/Toy0Fin.gf b/examples/regulus/Toy0Fin.gf deleted file mode 100644 index f550e751e..000000000 --- a/examples/regulus/Toy0Fin.gf +++ /dev/null @@ -1,5 +0,0 @@ ---# -path=.:present:prelude - -concrete Toy0Fin of Toy0 = Toy0I with - (Syntax = SyntaxFin), - (Lexicon = LexiconFin) ; diff --git a/examples/regulus/Toy0Fre.gf b/examples/regulus/Toy0Fre.gf deleted file mode 100644 index 425c85af0..000000000 --- a/examples/regulus/Toy0Fre.gf +++ /dev/null @@ -1,30 +0,0 @@ -concrete Toy0Fre of Toy0 = { - -param - Number = Sg | Pl ; - Gender = Masc | Fem ; - -lincat - Spec = {s : Gender => Str ; n : Number} ; - Noun = {s : Number => Str ; g : Gender} ; - MAIN,NP = {s : Str} ; - -lin - Main np = np ; - SpecNoun spec noun = {s = spec.s ! noun.g ++ noun.s ! spec.n} ; - - One = {s = table {Fem => "une" ; _ => "un"} ; n = Sg} ; - Two = {s = \\_ => "deux" ; n = Pl} ; - - Felis = mkNoun "chat" Masc ; - Canis = mkNoun "chien" Masc ; - -oper - mkNoun : Str -> Gender -> {s : Number => Str ; g : Gender} = \s,g -> { - s = table { - Sg => s ; - Pl => s + "s" - } ; - g = g - } ; -} diff --git a/examples/regulus/Toy0Ger.gf b/examples/regulus/Toy0Ger.gf deleted file mode 100644 index 68b3b7969..000000000 --- a/examples/regulus/Toy0Ger.gf +++ /dev/null @@ -1,30 +0,0 @@ -concrete Toy0Ger of Toy0 = { - -param - Number = Sg | Pl ; - Gender = Masc | Fem | Neutr ; - -lincat - Spec = {s : Gender => Str ; n : Number} ; - Noun = {s : Number => Str ; g : Gender} ; - MAIN,NP = {s : Str} ; - -lin - Main np = np ; - SpecNoun spec noun = {s = spec.s ! noun.g ++ noun.s ! spec.n} ; - - One = {s = table {Fem => "eine" ; _ => "ein"} ; n = Sg} ; - Two = {s = \\_ => "zwei" ; n = Pl} ; - - Felis = mkNoun "Katze" "Katzen" Fem ; - Canis = mkNoun "Hund" "Hünde" Masc ; - -oper - mkNoun : Str -> Str -> Gender -> {s : Number => Str ; g : Gender} = \s,p,g -> { - s = table { - Sg => s ; - Pl => p - } ; - g = g - } ; -} diff --git a/examples/regulus/Toy0I.gf b/examples/regulus/Toy0I.gf deleted file mode 100644 index 3d206d612..000000000 --- a/examples/regulus/Toy0I.gf +++ /dev/null @@ -1,20 +0,0 @@ -incomplete concrete Toy0I of Toy0 = open Syntax, Lexicon in { - -lincat - Spec = Det ; - Noun = N ; - NP = Syntax.NP ; - MAIN = Utt ; - -lin - Main np = mkUtt np ; - SpecNoun spec noun = mkNP spec noun ; - - One = mkDet one_Quant ; - Two = mkDet (mkNum n2_Numeral) ; - - Felis = cat_N ; - Canis = dog_N ; - -} - diff --git a/examples/regulus/Toy0Swe.gf b/examples/regulus/Toy0Swe.gf deleted file mode 100644 index 5de273d00..000000000 --- a/examples/regulus/Toy0Swe.gf +++ /dev/null @@ -1,5 +0,0 @@ ---# -path=.:present:prelude - -concrete Toy0Swe of Toy0 = Toy0I with - (Syntax = SyntaxSwe), - (Lexicon = LexiconSwe) ; diff --git a/examples/regulus/Toy0_eng.gf b/examples/regulus/Toy0_eng.gf deleted file mode 100644 index ed8fe8063..000000000 --- a/examples/regulus/Toy0_eng.gf +++ /dev/null @@ -1,39 +0,0 @@ --- grammar from Chapter 2 of the Regulus book - -flags startcat=MAIN ; - -cat - MAIN ; NP ; Noun ; Spec ; - -fun - Main : NP -> MAIN ; - SpecNoun : Spec -> Noun -> NP ; - - One, Two : Spec ; - Felis, Canis : Noun ; - -param - Number = Sg | Pl ; - -lincat - Spec = {s : Str ; n : Number} ; - Noun = {s : Number => Str} ; - MAIN,NP = {s : Str} ; - -lin - Main np = np ; - SpecNoun spec noun = {s = spec.s ++ noun.s ! spec.n} ; - - One = {s = "one" ; n = Sg} ; - Two = {s = "two" ; n = Pl} ; - - Felis = regNoun "cat" ; - Canis = regNoun "dog" ; - -oper - regNoun : Str -> {s : Number => Str} = \s -> { - s = table { - Sg => s ; - Pl => s + "s" - } - } ; diff --git a/examples/regulus/toy0/Toy0.gf b/examples/regulus/toy0/Toy0.gf new file mode 100644 index 000000000..ed0ba51e8 --- /dev/null +++ b/examples/regulus/toy0/Toy0.gf @@ -0,0 +1,18 @@ +abstract Toy0 = { + +-- grammar from Chapter 2 of the Regulus book + +flags startcat=MAIN ; + +cat + MAIN ; NP ; Noun ; Spec ; + +fun + Main : NP -> MAIN ; + SpecNoun : Spec -> Noun -> NP ; + + One, Two : Spec ; + Felis, Canis : Noun ; + +} + diff --git a/examples/regulus/toy0/Toy0Eng.gf b/examples/regulus/toy0/Toy0Eng.gf new file mode 100644 index 000000000..68e2e4c07 --- /dev/null +++ b/examples/regulus/toy0/Toy0Eng.gf @@ -0,0 +1,28 @@ +concrete Toy0Eng of Toy0 = { + +param + Number = Sg | Pl ; + +lincat + Spec = {s : Str ; n : Number} ; + Noun = {s : Number => Str} ; + MAIN,NP = {s : Str} ; + +lin + Main np = np ; + SpecNoun spec noun = {s = spec.s ++ noun.s ! spec.n} ; + + One = {s = "one" ; n = Sg} ; + Two = {s = "two" ; n = Pl} ; + + Felis = regNoun "cat" ; + Canis = regNoun "dog" ; + +oper + regNoun : Str -> {s : Number => Str} = \s -> { + s = table { + Sg => s ; + Pl => s + "s" + } + } ; +} diff --git a/examples/regulus/toy0/Toy0Fin.gf b/examples/regulus/toy0/Toy0Fin.gf new file mode 100644 index 000000000..f550e751e --- /dev/null +++ b/examples/regulus/toy0/Toy0Fin.gf @@ -0,0 +1,5 @@ +--# -path=.:present:prelude + +concrete Toy0Fin of Toy0 = Toy0I with + (Syntax = SyntaxFin), + (Lexicon = LexiconFin) ; diff --git a/examples/regulus/toy0/Toy0Fre.gf b/examples/regulus/toy0/Toy0Fre.gf new file mode 100644 index 000000000..425c85af0 --- /dev/null +++ b/examples/regulus/toy0/Toy0Fre.gf @@ -0,0 +1,30 @@ +concrete Toy0Fre of Toy0 = { + +param + Number = Sg | Pl ; + Gender = Masc | Fem ; + +lincat + Spec = {s : Gender => Str ; n : Number} ; + Noun = {s : Number => Str ; g : Gender} ; + MAIN,NP = {s : Str} ; + +lin + Main np = np ; + SpecNoun spec noun = {s = spec.s ! noun.g ++ noun.s ! spec.n} ; + + One = {s = table {Fem => "une" ; _ => "un"} ; n = Sg} ; + Two = {s = \\_ => "deux" ; n = Pl} ; + + Felis = mkNoun "chat" Masc ; + Canis = mkNoun "chien" Masc ; + +oper + mkNoun : Str -> Gender -> {s : Number => Str ; g : Gender} = \s,g -> { + s = table { + Sg => s ; + Pl => s + "s" + } ; + g = g + } ; +} diff --git a/examples/regulus/toy0/Toy0Ger.gf b/examples/regulus/toy0/Toy0Ger.gf new file mode 100644 index 000000000..68b3b7969 --- /dev/null +++ b/examples/regulus/toy0/Toy0Ger.gf @@ -0,0 +1,30 @@ +concrete Toy0Ger of Toy0 = { + +param + Number = Sg | Pl ; + Gender = Masc | Fem | Neutr ; + +lincat + Spec = {s : Gender => Str ; n : Number} ; + Noun = {s : Number => Str ; g : Gender} ; + MAIN,NP = {s : Str} ; + +lin + Main np = np ; + SpecNoun spec noun = {s = spec.s ! noun.g ++ noun.s ! spec.n} ; + + One = {s = table {Fem => "eine" ; _ => "ein"} ; n = Sg} ; + Two = {s = \\_ => "zwei" ; n = Pl} ; + + Felis = mkNoun "Katze" "Katzen" Fem ; + Canis = mkNoun "Hund" "Hünde" Masc ; + +oper + mkNoun : Str -> Str -> Gender -> {s : Number => Str ; g : Gender} = \s,p,g -> { + s = table { + Sg => s ; + Pl => p + } ; + g = g + } ; +} diff --git a/examples/regulus/toy0/Toy0I.gf b/examples/regulus/toy0/Toy0I.gf new file mode 100644 index 000000000..3d206d612 --- /dev/null +++ b/examples/regulus/toy0/Toy0I.gf @@ -0,0 +1,20 @@ +incomplete concrete Toy0I of Toy0 = open Syntax, Lexicon in { + +lincat + Spec = Det ; + Noun = N ; + NP = Syntax.NP ; + MAIN = Utt ; + +lin + Main np = mkUtt np ; + SpecNoun spec noun = mkNP spec noun ; + + One = mkDet one_Quant ; + Two = mkDet (mkNum n2_Numeral) ; + + Felis = cat_N ; + Canis = dog_N ; + +} + diff --git a/examples/regulus/toy0/Toy0Swe.gf b/examples/regulus/toy0/Toy0Swe.gf new file mode 100644 index 000000000..5de273d00 --- /dev/null +++ b/examples/regulus/toy0/Toy0Swe.gf @@ -0,0 +1,5 @@ +--# -path=.:present:prelude + +concrete Toy0Swe of Toy0 = Toy0I with + (Syntax = SyntaxSwe), + (Lexicon = LexiconSwe) ; diff --git a/examples/regulus/toy0/Toy0_eng.gf b/examples/regulus/toy0/Toy0_eng.gf new file mode 100644 index 000000000..ed8fe8063 --- /dev/null +++ b/examples/regulus/toy0/Toy0_eng.gf @@ -0,0 +1,39 @@ +-- grammar from Chapter 2 of the Regulus book + +flags startcat=MAIN ; + +cat + MAIN ; NP ; Noun ; Spec ; + +fun + Main : NP -> MAIN ; + SpecNoun : Spec -> Noun -> NP ; + + One, Two : Spec ; + Felis, Canis : Noun ; + +param + Number = Sg | Pl ; + +lincat + Spec = {s : Str ; n : Number} ; + Noun = {s : Number => Str} ; + MAIN,NP = {s : Str} ; + +lin + Main np = np ; + SpecNoun spec noun = {s = spec.s ++ noun.s ! spec.n} ; + + One = {s = "one" ; n = Sg} ; + Two = {s = "two" ; n = Pl} ; + + Felis = regNoun "cat" ; + Canis = regNoun "dog" ; + +oper + regNoun : Str -> {s : Number => Str} = \s -> { + s = table { + Sg => s ; + Pl => s + "s" + } + } ; diff --git a/src/GF/API.hs b/src/GF/API.hs index 762fa372f..3efd81472 100644 --- a/src/GF/API.hs +++ b/src/GF/API.hs @@ -75,6 +75,7 @@ import GF.Infra.UseIO import GF.Data.Zipper import Data.List (nub) +import Data.Char (toLower) import Data.Maybe (fromMaybe) import Control.Monad (liftM) import System (system) @@ -314,9 +315,16 @@ morphoAnalyse opts gr mo = morpho gr isKnownWord :: GFGrammar -> String -> Bool -isKnownWord gr s = case morphoAnalyse (options [beShort]) gr s of - a@(_:_:_) -> last (init a) /= '*' -- [word *] - _ -> False +isKnownWord gr s = GF.UseGrammar.Morphology.isKnownWord (morpho gr) s + +unknownTokens :: GFGrammar -> [CFTok] -> [String] +unknownTokens gr ts = + [w | TC w <- ts, unk w && unk (uncap w)] ++ [w | TS w <- ts, unk w] + where + unk w = not $ GF.API.isKnownWord gr w + uncap (c:cs) = toLower c : cs + uncap s = s + {- prExpXML :: StateGrammar -> Term -> [String] @@ -397,8 +405,11 @@ optTransfer opts g = case getOptVal opts transferFun of _ -> id -} +optTokenizerResult :: Options -> GFGrammar -> String -> [[CFTok]] +optTokenizerResult opts gr = customOrDefault opts useTokenizer customTokenizer gr + optTokenizer :: Options -> GFGrammar -> String -> String -optTokenizer opts gr = show . customOrDefault opts useTokenizer customTokenizer gr +optTokenizer opts gr = show . optTokenizerResult opts gr -- performs UTF8 if the language does not have flag coding=utf8; replaces name*U diff --git a/src/GF/Shell.hs b/src/GF/Shell.hs index 29a4b6c23..dd8267a91 100644 --- a/src/GF/Shell.hs +++ b/src/GF/Shell.hs @@ -254,7 +254,7 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com | otherwise -> parse $ prCommandArg a where parse x = do - warnDiscont opts + warnDiscont opts let p = optParseArgErrMsg opts gro x case p of Ok (ts,msg) diff --git a/src/GF/UseGrammar/Parsing.hs b/src/GF/UseGrammar/Parsing.hs index 65ed26863..599268b1d 100644 --- a/src/GF/UseGrammar/Parsing.hs +++ b/src/GF/UseGrammar/Parsing.hs @@ -29,6 +29,7 @@ import GF.Grammar.TypeCheck import GF.Grammar.Values --import CFMethod import GF.UseGrammar.Tokenize +import GF.UseGrammar.Morphology (isKnownWord) import GF.CF.Profile import GF.Infra.Option import GF.UseGrammar.Custom @@ -41,6 +42,7 @@ import qualified GF.Parsing.GFC as New import GF.Data.Operations import Data.List (nub,sortBy) +import Data.Char (toLower) import Control.Monad (liftM) -- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002 @@ -82,10 +84,21 @@ parseStringC opts0 sg cat s toks = case tokenizer s of t:_ -> t _ -> [] ---- no support for undet. tok. - ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks - ts' <- checkErr $ - allChecks $ map (annotate (stateGrammarST sg) . refreshMetas []) ts - return $ optIntOrAll opts flagNumber ts' + unknowns = + [w | TC w <- toks, unk w && unk (uncap w)] ++ [w | TS w <- toks, unk w] + where + unk w = not $ isKnownWord (morpho sg) w + uncap (c:cs) = toLower c : cs + uncap s = s + + case unknowns of + _:_ -> fail $ "Unknown words:" +++ unwords unknowns + _ -> do + + ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks + ts' <- checkErr $ + allChecks $ map (annotate (stateGrammarST sg) . refreshMetas []) ts + return $ optIntOrAll opts flagNumber ts' tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree] -- cgit v1.2.3