1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
module GF.Text.Lexing (stringOp,opInEnv) where
import GF.Text.Transliterations
import Data.Char
import Data.List (intersperse)
-- lexers and unlexers - they work on space-separated word strings
stringOp :: String -> Maybe (String -> String)
stringOp name = case name of
"chars" -> Just $ appLexer (filter (not . all isSpace) . map return)
"lextext" -> Just $ appLexer lexText
"lexcode" -> Just $ appLexer lexCode
"lexmixed" -> Just $ appLexer lexMixed
"words" -> Just $ appLexer words
"bind" -> Just $ appUnlexer bindTok
"unchars" -> Just $ appUnlexer concat
"unlextext" -> Just $ capitInit . appUnlexer (unlexText . unquote)
"unlexcode" -> Just $ appUnlexer unlexCode
"unlexmixed" -> Just $ capitInit . appUnlexer (unlexMixed . unquote)
"unwords" -> Just $ appUnlexer unwords
"to_html" -> Just wrapHTML
_ -> transliterate name
-- perform op in environments beg--end, t.ex. between "--"
--- suboptimal implementation
opInEnv :: String -> String -> (String -> String) -> (String -> String)
opInEnv beg end op = concat . altern False . chop (lbeg, beg) [] where
chop mk@(lg, mark) s0 s =
let (tag,rest) = splitAt lg s in
if tag==mark then (reverse s0) : mark : chop (switch mk) [] rest
else case s of
c:cs -> chop mk (c:s0) cs
[] -> [reverse s0]
switch (lg,mark) = if mark==beg then (lend,end) else (lbeg,beg)
(lbeg,lend) = (length beg, length end)
altern m ts = case ts of
t:ws | not m && t==beg -> t : altern True ws
t:ws | m && t==end -> t : altern False ws
t:ws -> (if m then op t else t) : altern m ws
[] -> []
appLexer :: (String -> [String]) -> String -> String
appLexer f = unwords . filter (not . null) . f
appUnlexer :: ([String] -> String) -> String -> String
appUnlexer f = unlines . map (f . words) . lines
wrapHTML :: String -> String
wrapHTML = unlines . tag . intersperse "<br>" . lines where
tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
lexText :: String -> [String]
lexText = uncap . lext where
lext s = case s of
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
c:cs | isMinorPunct c -> [c] : lext cs
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
_ -> [s]
uncap s = case s of
(c:cs):ws -> (toLower c : cs):ws
_ -> s
-- | Haskell lexer, usable for much code
lexCode :: String -> [String]
lexCode ss = case lex ss of
[(w@(_:_),ws)] -> w : lexCode ws
_ -> []
-- | LaTeX style lexer, with "math" environment using Code between $...$
lexMixed :: String -> [String]
lexMixed = concat . alternate False where
alternate env s = case s of
_:_ -> case break (=='$') s of
(t,[]) -> lex env t : []
(t,c:m) -> lex env t : [[c]] : alternate (not env) m
_ -> []
lex env = if env then lexCode else lexText
bindTok :: [String] -> String
bindTok ws = case ws of
w:"&+":ws2 -> w ++ bindTok ws2
w:[] -> w
w:ws2 -> w ++ " " ++ bindTok ws2
[] -> ""
unlexText :: [String] -> String
unlexText = unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isPunct c -> w ++ [c]
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
_ -> []
-- capitalize first letter
capitInit s = case s of
c:cs -> toUpper c : cs
_ -> s
-- unquote each string of form "foo"
unquote = map unq where
unq s = case s of
'"':cs@(_:_) | last cs == '"' -> init cs
_ -> s
unlexCode :: [String] -> String
unlexCode s = case s of
w:[] -> w
[c]:cs | isParen c -> [c] ++ unlexCode cs
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
w:ws -> w ++ " " ++ unlexCode ws
_ -> []
unlexMixed :: [String] -> String
unlexMixed = concat . alternate False where
alternate env s = case s of
_:_ -> case break (=="$") s of
(t,[]) -> unlex env t : []
(t,c:m) -> unlex env t : sep env c : alternate (not env) m
_ -> []
unlex env = if env then unlexCode else unlexText
sep env c = if env then c ++ " " else " " ++ c
isPunct = flip elem ".?!,:;"
isMajorPunct = flip elem ".?!"
isMinorPunct = flip elem ",:;"
isParen = flip elem "()[]{}"
isClosing = flip elem ")]}"
|