diff options
| author | aarne <aarne@cs.chalmers.se> | 2008-06-25 16:54:35 +0000 |
|---|---|---|
| committer | aarne <aarne@cs.chalmers.se> | 2008-06-25 16:54:35 +0000 |
| commit | e9e80fc389365e24d4300d7d5390c7d833a96c50 (patch) | |
| tree | f0b58473adaa670bd8fc52ada419d8cad470ee03 /src/GF/Text/Lexing.hs | |
| parent | b96b36f43de3e2f8b58d5f539daa6f6d47f25870 (diff) | |
changed names of resource-1.3; added a note on homepage on release
Diffstat (limited to 'src/GF/Text/Lexing.hs')
| -rw-r--r-- | src/GF/Text/Lexing.hs | 115 |
1 files changed, 115 insertions, 0 deletions
diff --git a/src/GF/Text/Lexing.hs b/src/GF/Text/Lexing.hs new file mode 100644 index 000000000..2c6b417b8 --- /dev/null +++ b/src/GF/Text/Lexing.hs @@ -0,0 +1,115 @@ +module GF.Text.Lexing (stringOp) where + +import GF.Text.Transliterations +import GF.Text.UTF8 + +import Data.Char +import Data.List (intersperse) + +-- lexers and unlexers - they work on space-separated word strings + +stringOp :: String -> Maybe (String -> String) +stringOp name = case name of + "chars" -> Just $ appLexer (filter (not . all isSpace) . map return) + "lextext" -> Just $ appLexer lexText + "lexcode" -> Just $ appLexer lexText + "lexmixed" -> Just $ appLexer lexMixed + "words" -> Just $ appLexer words + "bind" -> Just $ appUnlexer bindTok + "uncars" -> Just $ appUnlexer concat + "unlextext" -> Just $ appUnlexer unlexText + "unlexcode" -> Just $ appUnlexer unlexCode + "unlexmixed" -> Just $ appUnlexer unlexMixed + "unwords" -> Just $ appUnlexer unwords + "to_html" -> Just wrapHTML + "to_utf8" -> Just encodeUTF8 + "from_utf8" -> Just decodeUTF8 + "to_cp1251" -> Just encodeCP1251 + "from_cp1251" -> Just decodeCP1251 + _ -> transliterate name + +appLexer :: (String -> [String]) -> String -> String +appLexer f = unwords . filter (not . null) . f + +appUnlexer :: ([String] -> String) -> String -> String +appUnlexer f = unlines . map (f . words) . lines + +wrapHTML :: String -> String +wrapHTML = unlines . tag . intersperse "<br>" . lines where + tag ss = "<html>":"<body>" : ss ++ ["</body>","</html>"] + +lexText :: String -> [String] +lexText s = case s of + c:cs | isPunct c -> [c] : lexText cs + c:cs | isSpace c -> lexText cs + _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lexText cs + _ -> [s] + +-- | Haskell lexer, usable for much code +lexCode :: String -> [String] +lexCode ss = case lex ss of + [(w@(_:_),ws)] -> w : lexCode ws + _ -> [] + +-- | LaTeX style lexer, with "math" environment using Code between $...$ +lexMixed :: String -> [String] +lexMixed = concat . alternate False where + alternate env s = case s of + _:_ -> case break (=='$') s of + (t,[]) -> lex env t : [] + (t,c:m) -> lex env t : [[c]] : alternate (not env) m + _ -> [] + lex env = if env then lexCode else lexText + +bindTok :: [String] -> String +bindTok ws = case ws of + w:"&+":ws2 -> w ++ bindTok ws2 + w:[] -> w + w:ws2 -> w ++ " " ++ bindTok ws2 + [] -> "" + +unlexText :: [String] -> String +unlexText s = case s of + w:[] -> w + w:[c]:[] | isPunct c -> w ++ [c] + w:[c]:cs | isPunct c -> w ++ [c] ++ " " ++ unlexText cs + w:ws -> w ++ " " ++ unlexText ws + _ -> [] + +unlexCode :: [String] -> String +unlexCode s = case s of + w:[] -> w + [c]:cs | isParen c -> [c] ++ unlexCode cs + w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs + w:ws -> w ++ " " ++ unlexCode ws + _ -> [] + + +unlexMixed :: [String] -> String +unlexMixed = concat . alternate False where + alternate env s = case s of + _:_ -> case break (=="$") s of + (t,[]) -> unlex env t : [] + (t,c:m) -> unlex env t : sep env c : alternate (not env) m + _ -> [] + unlex env = if env then unlexCode else unlexText + sep env c = if env then c ++ " " else " " ++ c + +isPunct = flip elem ".?!,:;" +isParen = flip elem "()[]{}" +isClosing = flip elem ")]}" + + +-- might be in a file of its own: Windows Cyrillic, used in Bulgarian resource + +decodeCP1251 = map convert where + convert c + | c >= '\192' && c <= '\255' = chr (ord c + 848) + | otherwise = c + +encodeCP1251 = map convert where + convert c + | oc >= 1040 && oc <= 1103 = chr (oc - 848) + | otherwise = c + where oc = ord c + |
