summaryrefslogtreecommitdiff
path: root/src/GF/Text/Lexing.hs
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2008-06-25 16:54:35 +0000
committeraarne <aarne@cs.chalmers.se>2008-06-25 16:54:35 +0000
commite9e80fc389365e24d4300d7d5390c7d833a96c50 (patch)
treef0b58473adaa670bd8fc52ada419d8cad470ee03 /src/GF/Text/Lexing.hs
parentb96b36f43de3e2f8b58d5f539daa6f6d47f25870 (diff)
changed names of resource-1.3; added a note on homepage on release
Diffstat (limited to 'src/GF/Text/Lexing.hs')
-rw-r--r--src/GF/Text/Lexing.hs115
1 files changed, 115 insertions, 0 deletions
diff --git a/src/GF/Text/Lexing.hs b/src/GF/Text/Lexing.hs
new file mode 100644
index 000000000..2c6b417b8
--- /dev/null
+++ b/src/GF/Text/Lexing.hs
@@ -0,0 +1,115 @@
+module GF.Text.Lexing (stringOp) where
+
+import GF.Text.Transliterations
+import GF.Text.UTF8
+
+import Data.Char
+import Data.List (intersperse)
+
+-- lexers and unlexers - they work on space-separated word strings
+
+stringOp :: String -> Maybe (String -> String)
+stringOp name = case name of
+ "chars" -> Just $ appLexer (filter (not . all isSpace) . map return)
+ "lextext" -> Just $ appLexer lexText
+ "lexcode" -> Just $ appLexer lexText
+ "lexmixed" -> Just $ appLexer lexMixed
+ "words" -> Just $ appLexer words
+ "bind" -> Just $ appUnlexer bindTok
+ "uncars" -> Just $ appUnlexer concat
+ "unlextext" -> Just $ appUnlexer unlexText
+ "unlexcode" -> Just $ appUnlexer unlexCode
+ "unlexmixed" -> Just $ appUnlexer unlexMixed
+ "unwords" -> Just $ appUnlexer unwords
+ "to_html" -> Just wrapHTML
+ "to_utf8" -> Just encodeUTF8
+ "from_utf8" -> Just decodeUTF8
+ "to_cp1251" -> Just encodeCP1251
+ "from_cp1251" -> Just decodeCP1251
+ _ -> transliterate name
+
+appLexer :: (String -> [String]) -> String -> String
+appLexer f = unwords . filter (not . null) . f
+
+appUnlexer :: ([String] -> String) -> String -> String
+appUnlexer f = unlines . map (f . words) . lines
+
+wrapHTML :: String -> String
+wrapHTML = unlines . tag . intersperse "<br>" . lines where
+ tag ss = "<html>":"<body>" : ss ++ ["</body>","</html>"]
+
+lexText :: String -> [String]
+lexText s = case s of
+ c:cs | isPunct c -> [c] : lexText cs
+ c:cs | isSpace c -> lexText cs
+ _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lexText cs
+ _ -> [s]
+
+-- | Haskell lexer, usable for much code
+lexCode :: String -> [String]
+lexCode ss = case lex ss of
+ [(w@(_:_),ws)] -> w : lexCode ws
+ _ -> []
+
+-- | LaTeX style lexer, with "math" environment using Code between $...$
+lexMixed :: String -> [String]
+lexMixed = concat . alternate False where
+ alternate env s = case s of
+ _:_ -> case break (=='$') s of
+ (t,[]) -> lex env t : []
+ (t,c:m) -> lex env t : [[c]] : alternate (not env) m
+ _ -> []
+ lex env = if env then lexCode else lexText
+
+bindTok :: [String] -> String
+bindTok ws = case ws of
+ w:"&+":ws2 -> w ++ bindTok ws2
+ w:[] -> w
+ w:ws2 -> w ++ " " ++ bindTok ws2
+ [] -> ""
+
+unlexText :: [String] -> String
+unlexText s = case s of
+ w:[] -> w
+ w:[c]:[] | isPunct c -> w ++ [c]
+ w:[c]:cs | isPunct c -> w ++ [c] ++ " " ++ unlexText cs
+ w:ws -> w ++ " " ++ unlexText ws
+ _ -> []
+
+unlexCode :: [String] -> String
+unlexCode s = case s of
+ w:[] -> w
+ [c]:cs | isParen c -> [c] ++ unlexCode cs
+ w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
+ w:ws -> w ++ " " ++ unlexCode ws
+ _ -> []
+
+
+unlexMixed :: [String] -> String
+unlexMixed = concat . alternate False where
+ alternate env s = case s of
+ _:_ -> case break (=="$") s of
+ (t,[]) -> unlex env t : []
+ (t,c:m) -> unlex env t : sep env c : alternate (not env) m
+ _ -> []
+ unlex env = if env then unlexCode else unlexText
+ sep env c = if env then c ++ " " else " " ++ c
+
+isPunct = flip elem ".?!,:;"
+isParen = flip elem "()[]{}"
+isClosing = flip elem ")]}"
+
+
+-- might be in a file of its own: Windows Cyrillic, used in Bulgarian resource
+
+decodeCP1251 = map convert where
+ convert c
+ | c >= '\192' && c <= '\255' = chr (ord c + 848)
+ | otherwise = c
+
+encodeCP1251 = map convert where
+ convert c
+ | oc >= 1040 && oc <= 1103 = chr (oc - 848)
+ | otherwise = c
+ where oc = ord c
+