diff options
| author | krasimir <krasimir@chalmers.se> | 2009-12-13 18:50:29 +0000 |
|---|---|---|
| committer | krasimir <krasimir@chalmers.se> | 2009-12-13 18:50:29 +0000 |
| commit | f85232947e74ee7ef8c7b0ad2338212e7e68f1be (patch) | |
| tree | 667b886a5e3a4b026a63d4e3597f32497d824761 /src/compiler/GF/Text/Lexing.hs | |
| parent | d88a865faff59c98fc91556ff8700b10ee5f2df8 (diff) | |
reorganize the directories under src, and rescue the JavaScript interpreter from deprecated
Diffstat (limited to 'src/compiler/GF/Text/Lexing.hs')
| -rw-r--r-- | src/compiler/GF/Text/Lexing.hs | 131 |
1 files changed, 131 insertions, 0 deletions
diff --git a/src/compiler/GF/Text/Lexing.hs b/src/compiler/GF/Text/Lexing.hs new file mode 100644 index 000000000..3300d311e --- /dev/null +++ b/src/compiler/GF/Text/Lexing.hs @@ -0,0 +1,131 @@ +module GF.Text.Lexing (stringOp,opInEnv) where + +import GF.Text.Transliterations +import GF.Text.UTF8 +import GF.Text.CP1251 + +import Data.Char +import Data.List (intersperse) + +-- lexers and unlexers - they work on space-separated word strings + +stringOp :: String -> Maybe (String -> String) +stringOp name = case name of + "chars" -> Just $ appLexer (filter (not . all isSpace) . map return) + "lextext" -> Just $ appLexer lexText + "lexcode" -> Just $ appLexer lexCode + "lexmixed" -> Just $ appLexer lexMixed + "words" -> Just $ appLexer words + "bind" -> Just $ appUnlexer bindTok + "unchars" -> Just $ appUnlexer concat + "unlextext" -> Just $ appUnlexer unlexText + "unlexcode" -> Just $ appUnlexer unlexCode + "unlexmixed" -> Just $ appUnlexer unlexMixed + "unwords" -> Just $ appUnlexer unwords + "to_html" -> Just wrapHTML + "to_utf8" -> Just encodeUTF8 + "from_utf8" -> Just decodeUTF8 + "to_cp1251" -> Just encodeCP1251 + "from_cp1251" -> Just decodeCP1251 + _ -> transliterate name + +-- perform op in environments beg--end, t.ex. between "--" +--- suboptimal implementation +opInEnv :: String -> String -> (String -> String) -> (String -> String) +opInEnv beg end op = concat . altern False . chop (lbeg, beg) [] where + chop mk@(lg, mark) s0 s = + let (tag,rest) = splitAt lg s in + if tag==mark then (reverse s0) : mark : chop (switch mk) [] rest + else case s of + c:cs -> chop mk (c:s0) cs + [] -> [reverse s0] + switch (lg,mark) = if mark==beg then (lend,end) else (lbeg,beg) + (lbeg,lend) = (length beg, length end) + altern m ts = case ts of + t:ws | not m && t==beg -> t : altern True ws + t:ws | m && t==end -> t : altern False ws + t:ws -> (if m then op t else t) : altern m ws + [] -> [] + +appLexer :: (String -> [String]) -> String -> String +appLexer f = unwords . filter (not . null) . f + +appUnlexer :: ([String] -> String) -> String -> String +appUnlexer f = unlines . map (f . words) . lines + +wrapHTML :: String -> String +wrapHTML = unlines . tag . intersperse "<br>" . lines where + tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"] + +lexText :: String -> [String] +lexText = uncap . lext where + lext s = case s of + c:cs | isMajorPunct c -> [c] : uncap (lext cs) + c:cs | isMinorPunct c -> [c] : lext cs + c:cs | isSpace c -> lext cs + _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs + _ -> [s] + uncap s = case s of + (c:cs):ws -> (toLower c : cs):ws + _ -> s + +-- | Haskell lexer, usable for much code +lexCode :: String -> [String] +lexCode ss = case lex ss of + [(w@(_:_),ws)] -> w : lexCode ws + _ -> [] + +-- | LaTeX style lexer, with "math" environment using Code between $...$ +lexMixed :: String -> [String] +lexMixed = concat . alternate False where + alternate env s = case s of + _:_ -> case break (=='$') s of + (t,[]) -> lex env t : [] + (t,c:m) -> lex env t : [[c]] : alternate (not env) m + _ -> [] + lex env = if env then lexCode else lexText + +bindTok :: [String] -> String +bindTok ws = case ws of + w:"&+":ws2 -> w ++ bindTok ws2 + w:[] -> w + w:ws2 -> w ++ " " ++ bindTok ws2 + [] -> "" + +unlexText :: [String] -> String +unlexText = cap . unlext where + unlext s = case s of + w:[] -> w + w:[c]:[] | isPunct c -> w ++ [c] + w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ cap (unlext cs) + w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs + w:ws -> w ++ " " ++ unlext ws + _ -> [] + cap s = case s of + c:cs -> toUpper c : cs + _ -> s + +unlexCode :: [String] -> String +unlexCode s = case s of + w:[] -> w + [c]:cs | isParen c -> [c] ++ unlexCode cs + w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs + w:ws -> w ++ " " ++ unlexCode ws + _ -> [] + + +unlexMixed :: [String] -> String +unlexMixed = concat . alternate False where + alternate env s = case s of + _:_ -> case break (=="$") s of + (t,[]) -> unlex env t : [] + (t,c:m) -> unlex env t : sep env c : alternate (not env) m + _ -> [] + unlex env = if env then unlexCode else unlexText + sep env c = if env then c ++ " " else " " ++ c + +isPunct = flip elem ".?!,:;" +isMajorPunct = flip elem ".?!" +isMinorPunct = flip elem ",:;" +isParen = flip elem "()[]{}" +isClosing = flip elem ")]}" |
