diff options
| author | Krasimir Angelov <kr.angelov@gmail.com> | 2017-09-04 11:43:37 +0200 |
|---|---|---|
| committer | Krasimir Angelov <kr.angelov@gmail.com> | 2017-09-04 11:43:37 +0200 |
| commit | 1f908fa7bf65f51540ccb2b70ca2bd00d9b3dedf (patch) | |
| tree | 6211e867d908c3e40bf29dfa0c5d5ab0cbaf2c38 /src/compiler/GF | |
| parent | cae52bb9af3f2735a31a4a64cb3b9d7750d0b2a9 (diff) | |
eliminate modules PGF.Lexing, PGF.LexingAGreek. Make PGF.Utilities an internal module in the runtime. These are not really part of the core runtime.
Diffstat (limited to 'src/compiler/GF')
| -rw-r--r-- | src/compiler/GF/Command/CommonCommands.hs | 2 | ||||
| -rw-r--r-- | src/compiler/GF/Data/Utilities.hs | 19 | ||||
| -rw-r--r-- | src/compiler/GF/Text/Lexing.hs | 412 |
3 files changed, 420 insertions, 13 deletions
diff --git a/src/compiler/GF/Command/CommonCommands.hs b/src/compiler/GF/Command/CommonCommands.hs index 0cafad531..69ccaf325 100644 --- a/src/compiler/GF/Command/CommonCommands.hs +++ b/src/compiler/GF/Command/CommonCommands.hs @@ -219,7 +219,7 @@ commonCommands = fmap (mapCommandExec liftSIO) $ Map.fromList [ stringOps menv opts s = foldr (menvop . app) s (reverse opts) where - app f = maybe id id (stringOp f) + app f = maybe id id (stringOp (const False) f) menvop op = maybe op (\ (b,e) -> opInEnv b e op) menv envFlag fs = diff --git a/src/compiler/GF/Data/Utilities.hs b/src/compiler/GF/Data/Utilities.hs index eac315508..29ed329dc 100644 --- a/src/compiler/GF/Data/Utilities.hs +++ b/src/compiler/GF/Data/Utilities.hs @@ -12,12 +12,12 @@ ----------------------------------------------------------------------------- -module GF.Data.Utilities(module GF.Data.Utilities, module PGF.Utilities) where +module GF.Data.Utilities(module GF.Data.Utilities) where import Data.Maybe import Data.List import Control.Monad (MonadPlus(..),liftM,when) -import PGF.Utilities +import qualified Data.Set as Set -- * functions on lists @@ -190,3 +190,18 @@ joinS glue = concatS . intersperse (showString glue) +-- | Like 'Data.List.nub', but O(n log n) instead of O(n^2), since it uses a set to lookup previous things. +-- The result list is stable (the elements are returned in the order they occur), and lazy. +-- Requires that the list elements can be compared by Ord. +-- Code ruthlessly taken from <http://hpaste.org/54411> +nub' :: Ord a => [a] -> [a] +nub' = loop Set.empty + where loop _ [] = [] + loop seen (x : xs) + | Set.member x seen = loop seen xs + | otherwise = x : loop (Set.insert x seen) xs + + +-- | Replace all occurences of an element by another element. +replace :: Eq a => a -> a -> [a] -> [a] +replace x y = map (\z -> if z == x then y else z) diff --git a/src/compiler/GF/Text/Lexing.hs b/src/compiler/GF/Text/Lexing.hs index 782e6ea9a..7195daacd 100644 --- a/src/compiler/GF/Text/Lexing.hs +++ b/src/compiler/GF/Text/Lexing.hs @@ -2,30 +2,30 @@ module GF.Text.Lexing (stringOp,opInEnv) where import GF.Text.Transliterations -import PGF.Lexing -import PGF.LexingAGreek(lexAGreek,unlexAGreek,lexAGreek2) -- HL 20.2.2016 -import Data.Char (isSpace) +import Data.Char (isSpace,toUpper,toLower) import Data.List (intersperse) -stringOp :: String -> Maybe (String -> String) -stringOp name = case name of +stringOp :: (String -> Bool) -> String -> Maybe (String -> String) +stringOp good name = case name of "chars" -> Just $ appLexer (filter (not . all isSpace) . map return) - "lextext" -> Just $ appLexer lexText + "lextext" -> Just $ appLexer (lexText good) "lexcode" -> Just $ appLexer lexCode - "lexmixed" -> Just $ appLexer lexMixed + "lexmixed" -> Just $ appLexer (lexMixed good) "lexgreek" -> Just $ appLexer lexAGreek "lexgreek2" -> Just $ appLexer lexAGreek2 "words" -> Just $ appLexer words "bind" -> Just $ appUnlexer (unwords . bindTok) "unchars" -> Just $ appUnlexer concat - "unlextext" -> Just $ appUnlexer (unlexText . unquote) + "unlextext" -> Just $ appUnlexer (unlexText . unquote . bindTok) "unlexcode" -> Just $ appUnlexer unlexCode - "unlexmixed" -> Just $ appUnlexer (unlexMixed . unquote) + "unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok) "unlexgreek" -> Just $ appUnlexer unlexAGreek + "unlexnone" -> Just id + "unlexid" -> Just id "unwords" -> Just $ appUnlexer unwords "to_html" -> Just wrapHTML - _ -> transliterate name + _ -> transliterate name -- perform op in environments beg--end, t.ex. between "--" --- suboptimal implementation @@ -55,3 +55,395 @@ appUnlexer f = f . words wrapHTML :: String -> String wrapHTML = unlines . tag . intersperse "<br>" . lines where tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"] + + +-- * Text lexing +-- | Text lexing with standard word capitalization of the first word of every sentence +lexText :: (String -> Bool) -> String -> [String] +lexText good = lexText' (uncapitInit good) + +-- | Text lexing with custom treatment of the first word of every sentence. +lexText' :: (String->String) -> String -> [String] +lexText' uncap1 = uncap . lext where + lext s = case s of + c:cs | isMajorPunct c -> [c] : uncap (lext cs) + c:cs | isMinorPunct c -> [c] : lext cs + c:cs | isSpace c -> lext cs + _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs + _ -> [s] + uncap s = case s of + w:ws -> uncap1 w:ws + _ -> s + +unlexText :: [String] -> String +unlexText = capitInit . unlext where + unlext s = case s of + w:[] -> w + w:[c]:[] | isPunct c -> w ++ [c] + w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs) + w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs + w:ws -> w ++ " " ++ unlext ws + _ -> [] + +-- | Bind tokens separated by Prelude.BIND, i.e. &+ +bindTok :: [String] -> [String] +bindTok ws = case ws of + w1:"&+":w2:ws -> bindTok ((w1++w2):ws) + "&+":ws -> bindTok ws + "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws) + "&|":ws -> bindTok ws + w:ws -> w:bindTok ws + [] -> [] + +-- * Code lexing + +-- | Haskell lexer, usable for much code +lexCode :: String -> [String] +lexCode ss = case lex ss of + [(w@(_:_),ws)] -> w : lexCode ws + _ -> [] + + +-- * Ancient Greek lexing + +lexTextAGreek :: String -> [String] +lexTextAGreek s = lext s where + lext s = case s of + c:cs | isAGreekPunct c -> [c] : (lext cs) + c:cs | isSpace c -> lext cs + _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s + in w : lext cs + [] -> [] + +-- Philological greek text may use vowel length indicators. Then '.' is not a sentence +-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c. + +lexTextAGreek2 :: String -> [String] +lexTextAGreek2 s = lext s where + lext s = case s of + c:cs | isAGreekPunct c -> [c] : (lext cs) + c:cs | isSpace c -> lext cs + _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s + in case cs of + '.':'.':d:ds | isSpace d + -> (w++['.']) : lext ('.':d:ds) + '.':d:ds | isAGreekPunct d || isSpace d + -> (w++['.']) : lext (d:ds) + '.':d:ds | not (isSpace d) + -> case lext (d:ds) of + e:es -> (w++['.']++e) : es + es -> (w++['.']) : es + '.':[] -> (w++['.']) : [] + _ -> w : lext cs + [] -> [] + +unlexTextAGreek :: [String] -> String +unlexTextAGreek = unlext where + unlext s = case s of + w:[] -> w + w:[c]:[] | isAGreekPunct c -> w ++ [c] + w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs + w:ws -> w ++ " " ++ unlext ws + [] -> [] + +isAGreekPunct = flip elem ".,;··" -- colon: first version · not in charset, + -- second version · = 00B7 standard code point + +-- * Text lexing and unlexing for Ancient Greek: +-- 1. no capitalization of initial word, +-- 2. grave/acute accent switch on final syllables of words not followed by punctuation, +-- 3. accent move from/to support word to/from following clitic words (iterated). + +lexAGreek :: String -> [String] +lexAGreek = fromAGreek . lexTextAGreek + +lexAGreek2 :: String -> [String] +lexAGreek2 = fromAGreek . lexTextAGreek2 + +unlexAGreek :: [String] -> String +unlexAGreek = unlexTextAGreek . toAGreek + +-- Note: unlexAGreek does not glue punctuation with the previous word, so that short +-- vowel indication (like a.) differs from sentence end (a .). + +-- | normalize = change grave accent on sentence internal words to acute, +-- and shift inherited acutes to the following enclitic (where they are +-- visible only as shown in the list of enclitics above) + +normalize :: String -> String +normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek) + +fromAGreek :: [String] -> [String] +fromAGreek s = case s of + w:[]:vs -> w:[]:(fromAGreek vs) + w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs) + w:v:vs | wasEnclitic v && wasEnclitic w -> + getEnclitic w : fromAGreek (v:vs) + w:v:vs | wasEnclitic v && wasProclitic w -> -- "ei)' tines*" + getProclitic w : getEnclitic v : fromAGreek vs + w:v:vs | wasEnclitic v && (hasEndCircum w || + (hasEndAcute w && hasSingleAccent w)) -> + w : getEnclitic v : fromAGreek vs -- ok "sofoi' tines*" + w:v:vs | wasEnclitic v && hasPrefinalAcute w -> + w : getEnclitic v : fromAGreek vs + w:v:vs | wasEnclitic v && hasEndAcute w -> -- ok "a)'nvrwpoi' tines*" + dropLastAccent w : getEnclitic v : fromAGreek vs + w:v:vs | wasEnclitic w -> + getEnclitic w : fromAGreek (v:vs) + w:ws -> (toAcute w) : (fromAGreek ws) + ws -> ws + +-- | de-normalize = change acute accent of end syllables in sentence internal +-- (non-enclitic) words to grave accent, and move accents of enclitics to the +-- previous word to produce ordinary ancient greek + +denormalize :: String -> String +denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek) + +toAGreek :: [String] -> [String] +toAGreek s = case s of + w:[]:vs -> w:[]:(toAGreek vs) + w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs) -- w:[] for following -to_ancientgreek + w:v:vs | isEnclitic v && isEnclitic w -> + addAcute w : toAGreek (dropAccent v:vs) -- BR 11 Anm.2 + w:v:vs | isEnclitic v && isProclitic w -> -- BR 11 a.beta + addAcute w: (toAGreek (dropAccent v:vs)) + w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) -> + w:(toAGreek (dropAccent v:vs)) -- BR 11 a.alpha,beta + w:v:vs | isEnclitic v && hasPrefinalAcute w -> + w:v: toAGreek vs -- bisyllabic v keeps its accent BR 11 b. + w:v:vs | isEnclitic v -> + (addAcute w):(toAGreek (dropAccent v:vs)) -- BR 11 a.gamma + w:v:vs | isEnclitic w -> w:(toAGreek (v:vs)) + w:ws -> (toGrave w) : (toAGreek ws) + ws -> ws + +-- | Change accent on the final syllable of a word + +toGrave :: String -> String +toGrave = reverse . grave . reverse where + grave s = case s of + '\'':cs -> '`':cs + c:cs | isAGreekVowel c -> c:cs + c:cs -> c: grave cs + _ -> s + +toAcute :: String -> String +toAcute = reverse . acute . reverse where + acute s = case s of + '`':cs -> '\'':cs + c:cs | isAGreekVowel c -> c:cs + c:cs -> c: acute cs + _ -> s + +isAGreekVowel = flip elem "aeioyhw" + +-- | Accent moves for enclitics and proclitics (atona) + +enclitics = [ + "moy","moi","me", -- personal pronouns + "soy","soi","se", + "oy(","oi(","e(", + "tis*","ti","tina'", -- indefinite pronoun + "tino's*","tini'", + "tine's*","tina's*", + "tinw~n","tisi'","tisi'n", + "poy","poi", -- indefinite adverbs + "pove'n","pws*", + "ph|","pote'", + "ge","te","toi", -- particles + "nyn","per","pw" + -- suffix -"de" + -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~) + ] -- and more, BR 11 + +proclitics = [ + "o(","h(","oi(","ai(", -- articles + "e)n","ei)s*","e)x","e)k", -- prepositions + "ei)","w(s*", -- conjunctions + "oy)","oy)k","oy)c" -- negation + ] + +isEnclitic = flip elem enclitics +isProclitic = flip elem proclitics + +-- Check if a word is an enclitic or accented enclitic and extract the enclitic + +wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics) + ++ (map dropAccent (filter hasAccent enclitics)) + accented = (filter hasAccent enclitics) + ++ map addAcute (filter (not . hasAccent) enclitics) + in flip elem (accented ++ unaccented) + +wasProclitic = flip elem (map addAcute proclitics) + +getEnclitic = + let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics)) + ++ (map addAcute (filter (not . hasAccent) enclitics))) + (enclitics ++ (filter hasAccent enclitics) + ++ (filter (not . hasAccent) enclitics)) + find = \v -> lookup v pairs + in \v -> case (find v) of + Just x -> x + _ -> v +getProclitic = + let pairs = zip (map addAcute proclitics) proclitics + find = \v -> lookup v pairs + in \v -> case (find v) of + Just x -> x + _ -> v + +-- | Accent manipulation + +dropAccent = reverse . drop . reverse where + drop s = case s of + [] -> [] + '\'':cs -> cs + '`':cs -> cs + '~':cs -> cs + c:cs -> c:drop cs + +dropLastAccent = reverse . drop . reverse where + drop s = case s of + [] -> [] + '\'':cs -> cs + '`':cs -> cs + '~':cs -> cs + c:cs -> c:drop cs + +addAcute :: String -> String +addAcute = reverse . acu . reverse where + acu w = case w of + c:cs | c == '\'' -> c:cs + c:cs | c == '(' -> '\'':c:cs + c:cs | c == ')' -> '\'':c:cs + c:cs | isAGreekVowel c -> '\'':c:cs + c:cs -> c : acu cs + _ -> w + +-- | Accent checking on end syllables + +hasEndAcute = find . reverse where + find s = case s of + [] -> False + '\'':cs -> True + '`':cs -> False + '~':cs -> False + c:cs | isAGreekVowel c -> False + _:cs -> find cs + +hasEndCircum = find . reverse where + find s = case s of + [] -> False + '\'':cs -> False + '`':cs -> False + '~':cs -> True + c:cs | isAGreekVowel c -> False + _:cs -> find cs + +hasPrefinalAcute = find . reverse where + find s = case s of + [] -> False + '\'':cs -> False -- final acute + '`':cs -> False + '~':cs -> False + c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs + c:cs | isAGreekVowel c -> findNext cs + _:cs -> find cs where + findNext s = case s of + [] -> False + '\'':cs -> True -- prefinal acute + '`':cs -> False + '~':cs -> False + c:cs | isAGreekVowel c -> False + _:cs -> findNext cs where + +hasSingleAccent v = + hasAccent v && not (hasAccent (dropLastAccent v)) + +hasAccent v = case v of + [] -> False + c:cs -> elem c ['\'','`','~'] || hasAccent cs + +{- Tests: + +-- denormalization. Examples in BR 11 work: +-} +enclitics_expls = -- normalized + "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n": -- a.alpha + "sofo's tis*":"sofoi' tine's*": -- a.beta + "ei) tis*":"ei) tine's*": + "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*": -- a.gamma + "doy~los* tis*":"doy~loi tine's*": + "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n": -- b. + "ei) poy tis* tina' i)'doi": -- Anm. 2. + [] + + +unlexCode :: [String] -> String +unlexCode s = case s of + w:[] -> w + [c]:cs | isParen c -> [c] ++ unlexCode cs + w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs + w:ws -> w ++ " " ++ unlexCode ws + _ -> [] + + +-- | LaTeX lexer in the math mode: \ should not be separated from the next word + +lexLatexCode :: String -> [String] +lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex + restoreBackslash ws = case ws of + "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww + w:ww -> w:restoreBackslash ww + _ -> ws + +-- * Mixed lexing + +-- | LaTeX style lexer, with "math" environment using Code between $...$ +lexMixed :: (String -> Bool) -> String -> [String] +lexMixed good = concat . alternate False [] where + alternate env t s = case s of + '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs + '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs + c:cs -> alternate env (c:t) cs + _ -> [lex env (reverse t)] + lex env = if env then lexLatexCode else lexText good + +unlexMixed :: (String -> Bool) -> [String] -> String +unlexMixed good = capitInit . concat . alternate False where + alternate env s = case s of + _:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of + (t,[]) -> unlex env t : [] + (t,c:m) -> unlex env t : sep env c m : alternate (not env) m + _ -> [] + unlex env = if env then unlexCode else (uncapitInit good . unlexText) + sep env c m = case (m,env) of + ([p]:_,True) | isPunct p -> c -- closing $ glued to next punct + (_, True) -> c ++ " " -- closing $ otherwise separated by space from what follows + _ -> " " ++ c -- put space before opening $ + +-- * Additional lexing uitilties + +-- | Capitalize first letter +capitInit s = case s of + c:cs -> toUpper c : cs + _ -> s + +-- | Uncapitalize first letter +uncapitInit good s = + case s of + c:cs | not (good s) -> toLower c : cs + _ -> s + +-- | Unquote each string wrapped in double quotes +unquote = map unq where + unq s = case s of + '"':cs@(_:_) | last cs == '"' -> init cs + _ -> s + +isPunct = flip elem ".?!,:;" +isMajorPunct = flip elem ".?!" +isMinorPunct = flip elem ",:;" +isParen = flip elem "()[]{}" +isClosing = flip elem ")]}" |
