summaryrefslogtreecommitdiff
path: root/src/GF/Text
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2008-09-05 09:21:48 +0000
committeraarne <aarne@cs.chalmers.se>2008-09-05 09:21:48 +0000
commitbdcfcda786ba0332fb46b73b03d3e2a245144cc8 (patch)
tree5f2d05fd8fb3a457a9372d075d778624a6c74ef0 /src/GF/Text
parentb3803424ff7c82f340736842bb98fbd622c0e8bf (diff)
capitals in lextext and unlextext; notice that a sentence starting with a proper name now gets lexed with a small letter if lextext is used
Diffstat (limited to 'src/GF/Text')
-rw-r--r--src/GF/Text/Lexing.hs36
1 files changed, 24 insertions, 12 deletions
diff --git a/src/GF/Text/Lexing.hs b/src/GF/Text/Lexing.hs
index 70bb4e434..834f0b5cc 100644
--- a/src/GF/Text/Lexing.hs
+++ b/src/GF/Text/Lexing.hs
@@ -17,7 +17,7 @@ stringOp name = case name of
"lexmixed" -> Just $ appLexer lexMixed
"words" -> Just $ appLexer words
"bind" -> Just $ appUnlexer bindTok
- "uncars" -> Just $ appUnlexer concat
+ "unchars" -> Just $ appUnlexer concat
"unlextext" -> Just $ appUnlexer unlexText
"unlexcode" -> Just $ appUnlexer unlexCode
"unlexmixed" -> Just $ appUnlexer unlexMixed
@@ -40,11 +40,16 @@ wrapHTML = unlines . tag . intersperse "<br>" . lines where
tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
lexText :: String -> [String]
-lexText s = case s of
- c:cs | isPunct c -> [c] : lexText cs
- c:cs | isSpace c -> lexText cs
- _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lexText cs
- _ -> [s]
+lexText = uncap . lext where
+ lext s = case s of
+ c:cs | isMajorPunct c -> [c] : uncap (lext cs)
+ c:cs | isMinorPunct c -> [c] : lext cs
+ c:cs | isSpace c -> lext cs
+ _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
+ _ -> [s]
+ uncap s = case s of
+ (c:cs):ws -> (toLower c : cs):ws
+ _ -> s
-- | Haskell lexer, usable for much code
lexCode :: String -> [String]
@@ -70,12 +75,17 @@ bindTok ws = case ws of
[] -> ""
unlexText :: [String] -> String
-unlexText s = case s of
- w:[] -> w
- w:[c]:[] | isPunct c -> w ++ [c]
- w:[c]:cs | isPunct c -> w ++ [c] ++ " " ++ unlexText cs
- w:ws -> w ++ " " ++ unlexText ws
- _ -> []
+unlexText = cap . unlext where
+ unlext s = case s of
+ w:[] -> w
+ w:[c]:[] | isPunct c -> w ++ [c]
+ w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ cap (unlext cs)
+ w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
+ w:ws -> w ++ " " ++ unlext ws
+ _ -> []
+ cap s = case s of
+ c:cs -> toUpper c : cs
+ _ -> s
unlexCode :: [String] -> String
unlexCode s = case s of
@@ -97,5 +107,7 @@ unlexMixed = concat . alternate False where
sep env c = if env then c ++ " " else " " ++ c
isPunct = flip elem ".?!,:;"
+isMajorPunct = flip elem ".?!"
+isMinorPunct = flip elem ",:;"
isParen = flip elem "()[]{}"
isClosing = flip elem ")]}"