diff options
| author | aarne <aarne@cs.chalmers.se> | 2008-06-25 16:54:35 +0000 |
|---|---|---|
| committer | aarne <aarne@cs.chalmers.se> | 2008-06-25 16:54:35 +0000 |
| commit | e9e80fc389365e24d4300d7d5390c7d833a96c50 (patch) | |
| tree | f0b58473adaa670bd8fc52ada419d8cad470ee03 /src-3.0/GF/Text/Lexing.hs | |
| parent | b96b36f43de3e2f8b58d5f539daa6f6d47f25870 (diff) | |
changed names of resource-1.3; added a note on homepage on release
Diffstat (limited to 'src-3.0/GF/Text/Lexing.hs')
| -rw-r--r-- | src-3.0/GF/Text/Lexing.hs | 115 |
1 files changed, 0 insertions, 115 deletions
diff --git a/src-3.0/GF/Text/Lexing.hs b/src-3.0/GF/Text/Lexing.hs deleted file mode 100644 index 2c6b417b8..000000000 --- a/src-3.0/GF/Text/Lexing.hs +++ /dev/null @@ -1,115 +0,0 @@ -module GF.Text.Lexing (stringOp) where - -import GF.Text.Transliterations -import GF.Text.UTF8 - -import Data.Char -import Data.List (intersperse) - --- lexers and unlexers - they work on space-separated word strings - -stringOp :: String -> Maybe (String -> String) -stringOp name = case name of - "chars" -> Just $ appLexer (filter (not . all isSpace) . map return) - "lextext" -> Just $ appLexer lexText - "lexcode" -> Just $ appLexer lexText - "lexmixed" -> Just $ appLexer lexMixed - "words" -> Just $ appLexer words - "bind" -> Just $ appUnlexer bindTok - "uncars" -> Just $ appUnlexer concat - "unlextext" -> Just $ appUnlexer unlexText - "unlexcode" -> Just $ appUnlexer unlexCode - "unlexmixed" -> Just $ appUnlexer unlexMixed - "unwords" -> Just $ appUnlexer unwords - "to_html" -> Just wrapHTML - "to_utf8" -> Just encodeUTF8 - "from_utf8" -> Just decodeUTF8 - "to_cp1251" -> Just encodeCP1251 - "from_cp1251" -> Just decodeCP1251 - _ -> transliterate name - -appLexer :: (String -> [String]) -> String -> String -appLexer f = unwords . filter (not . null) . f - -appUnlexer :: ([String] -> String) -> String -> String -appUnlexer f = unlines . map (f . words) . lines - -wrapHTML :: String -> String -wrapHTML = unlines . tag . intersperse "<br>" . lines where - tag ss = "<html>":"<body>" : ss ++ ["</body>","</html>"] - -lexText :: String -> [String] -lexText s = case s of - c:cs | isPunct c -> [c] : lexText cs - c:cs | isSpace c -> lexText cs - _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lexText cs - _ -> [s] - --- | Haskell lexer, usable for much code -lexCode :: String -> [String] -lexCode ss = case lex ss of - [(w@(_:_),ws)] -> w : lexCode ws - _ -> [] - --- | LaTeX style lexer, with "math" environment using Code between $...$ -lexMixed :: String -> [String] -lexMixed = concat . alternate False where - alternate env s = case s of - _:_ -> case break (=='$') s of - (t,[]) -> lex env t : [] - (t,c:m) -> lex env t : [[c]] : alternate (not env) m - _ -> [] - lex env = if env then lexCode else lexText - -bindTok :: [String] -> String -bindTok ws = case ws of - w:"&+":ws2 -> w ++ bindTok ws2 - w:[] -> w - w:ws2 -> w ++ " " ++ bindTok ws2 - [] -> "" - -unlexText :: [String] -> String -unlexText s = case s of - w:[] -> w - w:[c]:[] | isPunct c -> w ++ [c] - w:[c]:cs | isPunct c -> w ++ [c] ++ " " ++ unlexText cs - w:ws -> w ++ " " ++ unlexText ws - _ -> [] - -unlexCode :: [String] -> String -unlexCode s = case s of - w:[] -> w - [c]:cs | isParen c -> [c] ++ unlexCode cs - w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs - w:ws -> w ++ " " ++ unlexCode ws - _ -> [] - - -unlexMixed :: [String] -> String -unlexMixed = concat . alternate False where - alternate env s = case s of - _:_ -> case break (=="$") s of - (t,[]) -> unlex env t : [] - (t,c:m) -> unlex env t : sep env c : alternate (not env) m - _ -> [] - unlex env = if env then unlexCode else unlexText - sep env c = if env then c ++ " " else " " ++ c - -isPunct = flip elem ".?!,:;" -isParen = flip elem "()[]{}" -isClosing = flip elem ")]}" - - --- might be in a file of its own: Windows Cyrillic, used in Bulgarian resource - -decodeCP1251 = map convert where - convert c - | c >= '\192' && c <= '\255' = chr (ord c + 848) - | otherwise = c - -encodeCP1251 = map convert where - convert c - | oc >= 1040 && oc <= 1103 = chr (oc - 848) - | otherwise = c - where oc = ord c - |
