From 04a6260eeaf626cf4774d087d8810d438f554b46 Mon Sep 17 00:00:00 2001 From: hallgren Date: Wed, 9 Apr 2014 14:13:18 +0000 Subject: PGF Service: a bit more clever lexer=text Only change the first word to lowercase if the original input is not found in the grammar's morphology. This allows parsing of sentences starting with "I" in English, nouns in German and proper names in other languages, but it can make the wrong choice for multi-words. --- src/runtime/haskell/PGF/Lexing.hs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/haskell/PGF/Lexing.hs b/src/runtime/haskell/PGF/Lexing.hs index 808a2af6f..10d8332f7 100644 --- a/src/runtime/haskell/PGF/Lexing.hs +++ b/src/runtime/haskell/PGF/Lexing.hs @@ -2,8 +2,13 @@ module PGF.Lexing where import Data.Char(isSpace,toLower,toUpper) -- * Text lexing +-- | Text lexing with standard word capitalization of the first word of every sentence lexText :: String -> [String] -lexText = uncap . lext where +lexText = lexText' uncapitInit + +-- | Text lexing with custom treatment of the first word of every sentence. +lexText' :: (String->String) -> String -> [String] +lexText' uncap1 = uncap . lext where lext s = case s of c:cs | isMajorPunct c -> [c] : uncap (lext cs) c:cs | isMinorPunct c -> [c] : lext cs @@ -11,7 +16,7 @@ lexText = uncap . lext where _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs _ -> [s] uncap s = case s of - (c:cs):ws -> (toLower c : cs):ws + w:ws -> uncap1 w:ws _ -> s unlexText :: [String] -> String @@ -78,6 +83,11 @@ capitInit s = case s of c:cs -> toUpper c : cs _ -> s +-- | Uncapitalize first letter +uncapitInit s = case s of + c:cs -> toLower c : cs + _ -> s + -- | Unquote each string wrapped in double quotes unquote = map unq where unq s = case s of -- cgit v1.2.3