summaryrefslogtreecommitdiff
path: root/src/compiler/GF/Text/Lexing.hs
blob: 7195daacd7303e8388aedac153eb9bb8668fc865 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
-- | Lexers and unlexers - they work on space-separated word strings
module GF.Text.Lexing (stringOp,opInEnv) where

import GF.Text.Transliterations

import Data.Char (isSpace,toUpper,toLower)
import Data.List (intersperse)

stringOp :: (String -> Bool) -> String -> Maybe (String -> String)
stringOp good name = case name of
  "chars"      -> Just $ appLexer (filter (not . all isSpace) . map return)
  "lextext"    -> Just $ appLexer (lexText good)
  "lexcode"    -> Just $ appLexer lexCode
  "lexmixed"   -> Just $ appLexer (lexMixed good)
  "lexgreek"   -> Just $ appLexer lexAGreek
  "lexgreek2"  -> Just $ appLexer lexAGreek2
  "words"      -> Just $ appLexer words
  "bind"       -> Just $ appUnlexer (unwords . bindTok)
  "unchars"    -> Just $ appUnlexer concat
  "unlextext"  -> Just $ appUnlexer (unlexText . unquote . bindTok)
  "unlexcode"  -> Just $ appUnlexer unlexCode
  "unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok)
  "unlexgreek" -> Just $ appUnlexer unlexAGreek
  "unlexnone"  -> Just id
  "unlexid"    -> Just id
  "unwords"    -> Just $ appUnlexer unwords
  "to_html"    -> Just wrapHTML
  _            -> transliterate name

-- perform op in environments beg--end, t.ex. between "--"
--- suboptimal implementation
opInEnv :: String -> String -> (String -> String) -> (String -> String)
opInEnv beg end op = concat . altern False . chop (lbeg, beg) [] where
  chop mk@(lg, mark) s0 s = 
    let (tag,rest) = splitAt lg s in
    if tag==mark then (reverse s0) : mark : chop (switch mk) [] rest 
      else case s of
        c:cs -> chop mk (c:s0) cs
        [] -> [reverse s0]
  switch (lg,mark) = if mark==beg then (lend,end) else (lbeg,beg)
  (lbeg,lend) = (length beg, length end)
  altern m ts = case ts of
    t:ws | not m && t==beg -> t : altern True ws
    t:ws | m     && t==end -> t : altern False ws
    t:ws -> (if m then op t else t) : altern m ws
    [] -> []

appLexer :: (String -> [String]) -> String -> String
appLexer f = unwords . filter (not . null) . f

appUnlexer :: ([String] -> String) -> String -> String
----appUnlexer f = unlines . map (f . words) . lines
appUnlexer f = f . words

wrapHTML :: String -> String
wrapHTML = unlines . tag . intersperse "<br>" . lines where
  tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]


-- * Text lexing
-- | Text lexing with standard word capitalization of the first word of every sentence
lexText :: (String -> Bool) -> String -> [String]
lexText good = lexText' (uncapitInit good)

-- | Text lexing with custom treatment of the first word of every sentence.
lexText' :: (String->String) -> String -> [String]
lexText' uncap1 = uncap . lext where
  lext s = case s of
    c:cs | isMajorPunct c -> [c] : uncap (lext cs)
    c:cs | isMinorPunct c -> [c] : lext cs
    c:cs | isSpace c ->       lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
    _ -> [s]
  uncap s = case s of
    w:ws -> uncap1 w:ws
    _ -> s

unlexText :: [String] -> String
unlexText = capitInit . unlext where
  unlext s = case s of
    w:[] -> w
    w:[c]:[] | isPunct c -> w ++ [c]
    w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
    w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
    w:ws -> w ++ " " ++ unlext ws
    _ -> []

-- | Bind tokens separated by Prelude.BIND, i.e. &+
bindTok :: [String] -> [String]
bindTok ws = case ws of
               w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
               "&+":ws       -> bindTok ws
               "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
               "&|":ws       -> bindTok ws
               w:ws          -> w:bindTok ws
               []            -> []

-- * Code lexing

-- | Haskell lexer, usable for much code
lexCode :: String -> [String]
lexCode ss = case lex ss of
  [(w@(_:_),ws)] -> w : lexCode ws
  _ -> []
  

-- * Ancient Greek lexing

lexTextAGreek :: String -> [String]
lexTextAGreek s = lext s where
  lext s = case s of
    c:cs | isAGreekPunct c -> [c] : (lext cs)
    c:cs | isSpace c -> lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
           in w : lext cs
    [] -> []

-- Philological greek text may use vowel length indicators. Then '.' is not a sentence 
-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.

lexTextAGreek2 :: String -> [String]
lexTextAGreek2 s = lext s where
  lext s = case s of
    c:cs | isAGreekPunct c -> [c] : (lext cs)
    c:cs | isSpace c -> lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
           in case cs of 
                '.':'.':d:ds | isSpace d 
                  -> (w++['.']) : lext ('.':d:ds)
                '.':d:ds | isAGreekPunct d || isSpace d 
                  -> (w++['.']) : lext (d:ds)
                '.':d:ds | not (isSpace d) 
                  -> case lext (d:ds) of
                       e:es -> (w++['.']++e) : es
                       es -> (w++['.']) : es 
                '.':[] -> (w++['.']) : []
                _ -> w : lext cs      
    [] -> []

unlexTextAGreek :: [String] -> String
unlexTextAGreek = unlext where
  unlext s = case s of
    w:[] -> w
    w:[c]:[] | isAGreekPunct c -> w ++ [c]
    w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
    w:ws -> w ++ " " ++ unlext ws
    [] -> []

isAGreekPunct = flip elem ".,;··"  -- colon: first version · not in charset,
                                   -- second version · = 00B7 standard code point

-- * Text lexing and unlexing for Ancient Greek: 
--   1. no capitalization of initial word, 
--   2. grave/acute accent switch on final syllables of words not followed by punctuation, 
--   3. accent move from/to support word to/from following clitic words (iterated).

lexAGreek :: String -> [String]
lexAGreek = fromAGreek . lexTextAGreek

lexAGreek2 :: String -> [String]
lexAGreek2 = fromAGreek . lexTextAGreek2

unlexAGreek :: [String] -> String
unlexAGreek = unlexTextAGreek . toAGreek

-- Note: unlexAGreek does not glue punctuation with the previous word, so that short
-- vowel indication (like a.) differs from sentence end (a .). 

-- | normalize = change grave accent on sentence internal words to acute, 
-- and shift inherited acutes to the following enclitic (where they are
-- visible only as shown in the list of enclitics above)

normalize :: String -> String
normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek) 

fromAGreek :: [String] -> [String]
fromAGreek s = case s of
  w:[]:vs -> w:[]:(fromAGreek vs)
  w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
  w:v:vs | wasEnclitic v && wasEnclitic w ->
    getEnclitic w : fromAGreek (v:vs)
  w:v:vs | wasEnclitic v && wasProclitic w ->  -- "ei)' tines*"
    getProclitic w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic v && (hasEndCircum w || 
    (hasEndAcute w && hasSingleAccent w)) ->  
    w : getEnclitic v : fromAGreek vs        -- ok "sofoi' tines*"
  w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
    w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic v && hasEndAcute w ->  -- ok "a)'nvrwpoi' tines*"
    dropLastAccent w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic w ->
    getEnclitic w : fromAGreek (v:vs)
  w:ws -> (toAcute w) : (fromAGreek ws)
  ws -> ws 

-- | de-normalize = change acute accent of end syllables in sentence internal 
--  (non-enclitic) words to grave accent, and move accents of enclitics to the 
--  previous word to produce ordinary ancient greek

denormalize :: String -> String
denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek) 

toAGreek :: [String] -> [String]
toAGreek s = case s of
  w:[]:vs -> w:[]:(toAGreek vs)
  w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs)  -- w:[] for following -to_ancientgreek
  w:v:vs | isEnclitic v && isEnclitic w -> 
    addAcute w : toAGreek (dropAccent v:vs)    -- BR 11 Anm.2
  w:v:vs | isEnclitic v && isProclitic w ->    -- BR 11 a.beta
    addAcute w: (toAGreek (dropAccent v:vs))
  w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) -> 
    w:(toAGreek (dropAccent v:vs))             -- BR 11 a.alpha,beta
  w:v:vs | isEnclitic v && hasPrefinalAcute w -> 
    w:v: toAGreek vs   -- bisyllabic v keeps its accent  BR 11 b.
  w:v:vs | isEnclitic v -> 
    (addAcute w):(toAGreek (dropAccent v:vs))  -- BR 11 a.gamma
  w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
  w:ws -> (toGrave w) : (toAGreek ws)
  ws -> ws 

-- | Change accent on the final syllable of a word

toGrave :: String -> String 
toGrave = reverse . grave . reverse where
  grave s = case s of
    '\'':cs -> '`':cs 
    c:cs | isAGreekVowel c -> c:cs 
    c:cs -> c: grave cs
    _ -> s

toAcute :: String -> String 
toAcute = reverse . acute . reverse where
  acute s = case s of
    '`':cs -> '\'':cs
    c:cs | isAGreekVowel c -> c:cs 
    c:cs -> c: acute cs
    _ -> s

isAGreekVowel = flip elem "aeioyhw"

-- | Accent moves for enclitics and proclitics (atona)

enclitics = [
  "moy","moi","me",     -- personal pronouns
  "soy","soi","se",
  "oy(","oi(","e(",
  "tis*","ti","tina'",  -- indefinite pronoun 
  "tino's*","tini'",
  "tine's*","tina's*",
  "tinw~n","tisi'","tisi'n",
  "poy","poi",          -- indefinite adverbs
  "pove'n","pws*",
  "ph|","pote'",
  "ge","te","toi",      -- particles
  "nyn","per","pw" 
   -- suffix -"de"
   -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
  ] -- and more, BR 11

proclitics = [
  "o(","h(","oi(","ai(",     -- articles
  "e)n","ei)s*","e)x","e)k", -- prepositions
  "ei)","w(s*",              -- conjunctions
  "oy)","oy)k","oy)c"        -- negation
  ]

isEnclitic = flip elem enclitics
isProclitic = flip elem proclitics

-- Check if a word is an enclitic or accented enclitic and extract the enclitic

wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics) 
                    ++ (map dropAccent (filter hasAccent enclitics))
                  accented = (filter hasAccent enclitics) 
                    ++ map addAcute (filter (not . hasAccent) enclitics) 
              in flip elem (accented ++ unaccented)

wasProclitic = flip elem (map addAcute proclitics)

getEnclitic = 
  let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
                   ++ (map addAcute (filter (not . hasAccent) enclitics)))
                  (enclitics ++ (filter hasAccent enclitics)
                   ++ (filter (not . hasAccent) enclitics))
      find = \v -> lookup v pairs
  in \v -> case (find v) of 
    Just x -> x 
    _ -> v
getProclitic = 
  let pairs = zip (map addAcute proclitics) proclitics 
      find = \v -> lookup v pairs
  in \v -> case (find v) of 
    Just x -> x 
    _ -> v

-- | Accent manipulation

dropAccent = reverse . drop . reverse where 
  drop s = case s of
    [] -> []
    '\'':cs -> cs
    '`':cs -> cs
    '~':cs -> cs
    c:cs -> c:drop cs

dropLastAccent = reverse . drop . reverse where 
  drop s = case s of
    [] -> []
    '\'':cs -> cs
    '`':cs -> cs
    '~':cs -> cs
    c:cs -> c:drop cs

addAcute :: String -> String
addAcute = reverse . acu . reverse where
  acu w = case w of 
    c:cs | c == '\'' -> c:cs
    c:cs | c == '(' -> '\'':c:cs
    c:cs | c == ')' -> '\'':c:cs
    c:cs | isAGreekVowel c -> '\'':c:cs
    c:cs -> c : acu cs
    _ -> w

-- | Accent checking on end syllables

hasEndAcute = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> True
    '`':cs -> False
    '~':cs -> False
    c:cs | isAGreekVowel c -> False
    _:cs -> find cs

hasEndCircum = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> False
    '`':cs -> False
    '~':cs -> True
    c:cs | isAGreekVowel c -> False
    _:cs -> find cs

hasPrefinalAcute = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> False  -- final acute
    '`':cs -> False
    '~':cs -> False
    c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
    c:cs | isAGreekVowel c -> findNext cs
    _:cs -> find cs where
  findNext s = case s of 
    [] -> False
    '\'':cs -> True  -- prefinal acute
    '`':cs -> False
    '~':cs -> False
    c:cs | isAGreekVowel c -> False
    _:cs -> findNext cs where
    
hasSingleAccent v = 
  hasAccent v && not (hasAccent (dropLastAccent v))

hasAccent v = case v of 
  [] -> False
  c:cs -> elem c ['\'','`','~'] || hasAccent cs

{- Tests: 

-- denormalization. Examples in BR 11 work: 
-}
enclitics_expls = -- normalized
  "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n":  -- a.alpha
  "sofo's tis*":"sofoi' tine's*":                 -- a.beta
  "ei) tis*":"ei) tine's*":
  "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*":        -- a.gamma
  "doy~los* tis*":"doy~loi tine's*":
  "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n":  -- b.
  "ei) poy tis* tina' i)'doi":                    -- Anm. 2.
  [] 


unlexCode :: [String] -> String
unlexCode s = case s of
  w:[] -> w
  [c]:cs | isParen c -> [c] ++ unlexCode cs
  w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
  w:ws -> w ++ " " ++ unlexCode ws
  _ -> []


-- | LaTeX lexer in the math mode: \ should not be separated from the next word

lexLatexCode :: String -> [String]
lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
  restoreBackslash ws = case ws of
    "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
    w:ww -> w:restoreBackslash ww
    _ -> ws

-- * Mixed lexing

-- | LaTeX style lexer, with "math" environment using Code between $...$
lexMixed :: (String -> Bool) -> String -> [String]
lexMixed good = concat . alternate False [] where
  alternate env t s = case s of
    '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
    '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
    c:cs -> alternate env (c:t) cs
    _ -> [lex env (reverse t)]
  lex env = if env then lexLatexCode else lexText good

unlexMixed :: (String -> Bool) -> [String] -> String
unlexMixed good = capitInit . concat . alternate False where
  alternate env s = case s of
    _:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
      (t,[])  -> unlex env t : []
      (t,c:m) -> unlex env t : sep env c m : alternate (not env) m
    _ -> []
  unlex env = if env then unlexCode else (uncapitInit good . unlexText)
  sep env c m = case (m,env) of
    ([p]:_,True) | isPunct p -> c   -- closing $ glued to next punct 
    (_,  True) -> c ++ " "   -- closing $ otherwise separated by space from what follows
    _ -> " " ++ c   -- put space before opening $

-- * Additional lexing uitilties

-- | Capitalize first letter
capitInit s = case s of
  c:cs -> toUpper c : cs
  _ -> s

-- | Uncapitalize first letter
uncapitInit good s = 
  case s of
    c:cs | not (good s) -> toLower c : cs
    _                   -> s

-- | Unquote each string wrapped in double quotes
unquote = map unq where 
  unq s = case s of
    '"':cs@(_:_) | last cs == '"' -> init cs
    _ -> s

isPunct = flip elem ".?!,:;"
isMajorPunct = flip elem ".?!"
isMinorPunct = flip elem ",:;"
isParen = flip elem "()[]{}"
isClosing = flip elem ")]}"