From 1f908fa7bf65f51540ccb2b70ca2bd00d9b3dedf Mon Sep 17 00:00:00 2001 From: Krasimir Angelov Date: Mon, 4 Sep 2017 11:43:37 +0200 Subject: eliminate modules PGF.Lexing, PGF.LexingAGreek. Make PGF.Utilities an internal module in the runtime. These are not really part of the core runtime. --- src/runtime/haskell/PGF/Lexing.hs | 115 ------------ src/runtime/haskell/PGF/LexingAGreek.hs | 310 -------------------------------- 2 files changed, 425 deletions(-) delete mode 100644 src/runtime/haskell/PGF/Lexing.hs delete mode 100644 src/runtime/haskell/PGF/LexingAGreek.hs (limited to 'src/runtime/haskell/PGF') diff --git a/src/runtime/haskell/PGF/Lexing.hs b/src/runtime/haskell/PGF/Lexing.hs deleted file mode 100644 index 4dc352792..000000000 --- a/src/runtime/haskell/PGF/Lexing.hs +++ /dev/null @@ -1,115 +0,0 @@ -module PGF.Lexing where -import Data.Char(isSpace,toLower,toUpper) - --- * Text lexing --- | Text lexing with standard word capitalization of the first word of every sentence -lexText :: String -> [String] -lexText = lexText' uncapitInit - --- | Text lexing with custom treatment of the first word of every sentence. -lexText' :: (String->String) -> String -> [String] -lexText' uncap1 = uncap . lext where - lext s = case s of - c:cs | isMajorPunct c -> [c] : uncap (lext cs) - c:cs | isMinorPunct c -> [c] : lext cs - c:cs | isSpace c -> lext cs - _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs - _ -> [s] - uncap s = case s of - w:ws -> uncap1 w:ws - _ -> s - -unlexText :: [String] -> String -unlexText = capitInit . unlext where - unlext s = case s of - w:[] -> w - w:[c]:[] | isPunct c -> w ++ [c] - w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs) - w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs - w:ws -> w ++ " " ++ unlext ws - _ -> [] - --- | Bind tokens separated by Prelude.BIND, i.e. &+ -bindTok :: [String] -> [String] -bindTok ws = case ws of - w1:"&+":w2:ws -> bindTok ((w1++w2):ws) - "&+":ws -> bindTok ws - "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws) - "&|":ws -> bindTok ws - w:ws -> w:bindTok ws - [] -> [] - --- * Code lexing - --- | Haskell lexer, usable for much code -lexCode :: String -> [String] -lexCode ss = case lex ss of - [(w@(_:_),ws)] -> w : lexCode ws - _ -> [] - -unlexCode :: [String] -> String -unlexCode s = case s of - w:[] -> w - [c]:cs | isParen c -> [c] ++ unlexCode cs - w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs - w:ws -> w ++ " " ++ unlexCode ws - _ -> [] - - --- | LaTeX lexer in the math mode: \ should not be separated from the next word - -lexLatexCode :: String -> [String] -lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex - restoreBackslash ws = case ws of - "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww - w:ww -> w:restoreBackslash ww - _ -> ws - --- * Mixed lexing - --- | LaTeX style lexer, with "math" environment using Code between $...$ -lexMixed :: String -> [String] -lexMixed = concat . alternate False [] where - alternate env t s = case s of - '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs - '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs - c:cs -> alternate env (c:t) cs - _ -> [lex env (reverse t)] - lex env = if env then lexLatexCode else lexText - -unlexMixed :: [String] -> String -unlexMixed = capitInit . concat . alternate False where - alternate env s = case s of - _:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of - (t,[]) -> unlex env t : [] - (t,c:m) -> unlex env t : sep env c m : alternate (not env) m - _ -> [] - unlex env = if env then unlexCode else (uncapitInit . unlexText) - sep env c m = case (m,env) of - ([p]:_,True) | isPunct p -> c -- closing $ glued to next punct - (_, True) -> c ++ " " -- closing $ otherwise separated by space from what follows - _ -> " " ++ c -- put space before opening $ - --- * Additional lexing uitilties - --- | Capitalize first letter -capitInit s = case s of - c:cs -> toUpper c : cs - _ -> s - --- | Uncapitalize first letter -uncapitInit s = case s of - c:cs -> toLower c : cs - _ -> s - --- | Unquote each string wrapped in double quotes -unquote = map unq where - unq s = case s of - '"':cs@(_:_) | last cs == '"' -> init cs - _ -> s - -isPunct = flip elem ".?!,:;" -isMajorPunct = flip elem ".?!" -isMinorPunct = flip elem ",:;" -isParen = flip elem "()[]{}" -isClosing = flip elem ")]}" diff --git a/src/runtime/haskell/PGF/LexingAGreek.hs b/src/runtime/haskell/PGF/LexingAGreek.hs deleted file mode 100644 index a8ed19238..000000000 --- a/src/runtime/haskell/PGF/LexingAGreek.hs +++ /dev/null @@ -1,310 +0,0 @@ -module PGF.LexingAGreek where -- HL 2a1.2.2016 -import Data.Char(isSpace) - --- * Text lexing without word capitalization of the first word of every sentence. --- Greek sentences in (transliterated) texts don't start with capital character. - --- Ordinary greek text does not have vowel length indicators. We then use '.' as --- a sentence separator. - -lexTextAGreek :: String -> [String] -lexTextAGreek s = lext s where - lext s = case s of - c:cs | isAGreekPunct c -> [c] : (lext cs) - c:cs | isSpace c -> lext cs - _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s - in w : lext cs - [] -> [] - --- Philological greek text may use vowel length indicators. Then '.' is not a sentence --- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c. - -lexTextAGreek2 :: String -> [String] -lexTextAGreek2 s = lext s where - lext s = case s of - c:cs | isAGreekPunct c -> [c] : (lext cs) - c:cs | isSpace c -> lext cs - _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s - in case cs of - '.':'.':d:ds | isSpace d - -> (w++['.']) : lext ('.':d:ds) - '.':d:ds | isAGreekPunct d || isSpace d - -> (w++['.']) : lext (d:ds) - '.':d:ds | not (isSpace d) - -> case lext (d:ds) of - e:es -> (w++['.']++e) : es - es -> (w++['.']) : es - '.':[] -> (w++['.']) : [] - _ -> w : lext cs - [] -> [] - -unlexTextAGreek :: [String] -> String -unlexTextAGreek = unlext where - unlext s = case s of - w:[] -> w - w:[c]:[] | isAGreekPunct c -> w ++ [c] - w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs - w:ws -> w ++ " " ++ unlext ws - [] -> [] - -isAGreekPunct = flip elem ".,;··" -- colon: first version · not in charset, - -- second version · = 00B7 standard code point - --- * Text lexing and unlexing for Ancient Greek: --- 1. no capitalization of initial word, --- 2. grave/acute accent switch on final syllables of words not followed by punctuation, --- 3. accent move from/to support word to/from following clitic words (iterated). - -lexAGreek :: String -> [String] -lexAGreek = fromAGreek . lexTextAGreek - -lexAGreek2 :: String -> [String] -lexAGreek2 = fromAGreek . lexTextAGreek2 - -unlexAGreek :: [String] -> String -unlexAGreek = unlexTextAGreek . toAGreek - --- Note: unlexAGreek does not glue punctuation with the previous word, so that short --- vowel indication (like a.) differs from sentence end (a .). - --- | normalize = change grave accent on sentence internal words to acute, --- and shift inherited acutes to the following enclitic (where they are --- visible only as shown in the list of enclitics above) - -normalize :: String -> String -normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek) - -fromAGreek :: [String] -> [String] -fromAGreek s = case s of - w:[]:vs -> w:[]:(fromAGreek vs) - w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs) - w:v:vs | wasEnclitic v && wasEnclitic w -> - getEnclitic w : fromAGreek (v:vs) - w:v:vs | wasEnclitic v && wasProclitic w -> -- "ei)' tines*" - getProclitic w : getEnclitic v : fromAGreek vs - w:v:vs | wasEnclitic v && (hasEndCircum w || - (hasEndAcute w && hasSingleAccent w)) -> - w : getEnclitic v : fromAGreek vs -- ok "sofoi' tines*" - w:v:vs | wasEnclitic v && hasPrefinalAcute w -> - w : getEnclitic v : fromAGreek vs - w:v:vs | wasEnclitic v && hasEndAcute w -> -- ok "a)'nvrwpoi' tines*" - dropLastAccent w : getEnclitic v : fromAGreek vs - w:v:vs | wasEnclitic w -> - getEnclitic w : fromAGreek (v:vs) - w:ws -> (toAcute w) : (fromAGreek ws) - ws -> ws - --- | de-normalize = change acute accent of end syllables in sentence internal --- (non-enclitic) words to grave accent, and move accents of enclitics to the --- previous word to produce ordinary ancient greek - -denormalize :: String -> String -denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek) - -toAGreek :: [String] -> [String] -toAGreek s = case s of - w:[]:vs -> w:[]:(toAGreek vs) - w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs) -- w:[] for following -to_ancientgreek - w:v:vs | isEnclitic v && isEnclitic w -> - addAcute w : toAGreek (dropAccent v:vs) -- BR 11 Anm.2 - w:v:vs | isEnclitic v && isProclitic w -> -- BR 11 a.beta - addAcute w: (toAGreek (dropAccent v:vs)) - w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) -> - w:(toAGreek (dropAccent v:vs)) -- BR 11 a.alpha,beta - w:v:vs | isEnclitic v && hasPrefinalAcute w -> - w:v: toAGreek vs -- bisyllabic v keeps its accent BR 11 b. - w:v:vs | isEnclitic v -> - (addAcute w):(toAGreek (dropAccent v:vs)) -- BR 11 a.gamma - w:v:vs | isEnclitic w -> w:(toAGreek (v:vs)) - w:ws -> (toGrave w) : (toAGreek ws) - ws -> ws - --- | Change accent on the final syllable of a word - -toGrave :: String -> String -toGrave = reverse . grave . reverse where - grave s = case s of - '\'':cs -> '`':cs - c:cs | isAGreekVowel c -> c:cs - c:cs -> c: grave cs - _ -> s - -toAcute :: String -> String -toAcute = reverse . acute . reverse where - acute s = case s of - '`':cs -> '\'':cs - c:cs | isAGreekVowel c -> c:cs - c:cs -> c: acute cs - _ -> s - -isAGreekVowel = flip elem "aeioyhw" - --- | Accent moves for enclitics and proclitics (atona) - -enclitics = [ - "moy","moi","me", -- personal pronouns - "soy","soi","se", - "oy(","oi(","e(", - "tis*","ti","tina'", -- indefinite pronoun - "tino's*","tini'", - "tine's*","tina's*", - "tinw~n","tisi'","tisi'n", - "poy","poi", -- indefinite adverbs - "pove'n","pws*", - "ph|","pote'", - "ge","te","toi", -- particles - "nyn","per","pw" - -- suffix -"de" - -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~) - ] -- and more, BR 11 - -proclitics = [ - "o(","h(","oi(","ai(", -- articles - "e)n","ei)s*","e)x","e)k", -- prepositions - "ei)","w(s*", -- conjunctions - "oy)","oy)k","oy)c" -- negation - ] - -isEnclitic = flip elem enclitics -isProclitic = flip elem proclitics - --- Check if a word is an enclitic or accented enclitic and extract the enclitic - -wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics) - ++ (map dropAccent (filter hasAccent enclitics)) - accented = (filter hasAccent enclitics) - ++ map addAcute (filter (not . hasAccent) enclitics) - in flip elem (accented ++ unaccented) - -wasProclitic = flip elem (map addAcute proclitics) - -getEnclitic = - let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics)) - ++ (map addAcute (filter (not . hasAccent) enclitics))) - (enclitics ++ (filter hasAccent enclitics) - ++ (filter (not . hasAccent) enclitics)) - find = \v -> lookup v pairs - in \v -> case (find v) of - Just x -> x - _ -> v -getProclitic = - let pairs = zip (map addAcute proclitics) proclitics - find = \v -> lookup v pairs - in \v -> case (find v) of - Just x -> x - _ -> v - --- | Accent manipulation - -dropAccent = reverse . drop . reverse where - drop s = case s of - [] -> [] - '\'':cs -> cs - '`':cs -> cs - '~':cs -> cs - c:cs -> c:drop cs - -dropLastAccent = reverse . drop . reverse where - drop s = case s of - [] -> [] - '\'':cs -> cs - '`':cs -> cs - '~':cs -> cs - c:cs -> c:drop cs - -addAcute :: String -> String -addAcute = reverse . acu . reverse where - acu w = case w of - c:cs | c == '\'' -> c:cs - c:cs | c == '(' -> '\'':c:cs - c:cs | c == ')' -> '\'':c:cs - c:cs | isAGreekVowel c -> '\'':c:cs - c:cs -> c : acu cs - _ -> w - --- | Accent checking on end syllables - -hasEndAcute = find . reverse where - find s = case s of - [] -> False - '\'':cs -> True - '`':cs -> False - '~':cs -> False - c:cs | isAGreekVowel c -> False - _:cs -> find cs - -hasEndCircum = find . reverse where - find s = case s of - [] -> False - '\'':cs -> False - '`':cs -> False - '~':cs -> True - c:cs | isAGreekVowel c -> False - _:cs -> find cs - -hasPrefinalAcute = find . reverse where - find s = case s of - [] -> False - '\'':cs -> False -- final acute - '`':cs -> False - '~':cs -> False - c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs - c:cs | isAGreekVowel c -> findNext cs - _:cs -> find cs where - findNext s = case s of - [] -> False - '\'':cs -> True -- prefinal acute - '`':cs -> False - '~':cs -> False - c:cs | isAGreekVowel c -> False - _:cs -> findNext cs where - -hasSingleAccent v = - hasAccent v && not (hasAccent (dropLastAccent v)) - -hasAccent v = case v of - [] -> False - c:cs -> elem c ['\'','`','~'] || hasAccent cs - -{- Tests: - --- denormalization. Examples in BR 11 work: --} -enclitics_expls = -- normalized - "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n": -- a.alpha - "sofo's tis*":"sofoi' tine's*": -- a.beta - "ei) tis*":"ei) tine's*": - "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*": -- a.gamma - "doy~los* tis*":"doy~loi tine's*": - "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n": -- b. - "ei) poy tis* tina' i)'doi": -- Anm. 2. - [] -{- -test = map denormalize enclitics_expls - -*PGF.LexingAGreek> test - ["sofw~n tis*","sofw~n tines*","sof~n tinwn", - "sofo's tis*","sofoi' tines*", - "ei)' tis*","ei)' tines*", - "a)'nvrwpo's* tis*","a)'nvrwpoi' tines*", - "doy~lo's* tis*","doy~loi' tines*", - "lo'gos* tis* ","lo'goi tine's*","lo'gwn tinw~n ", - "ei)' poy' ti's* tina i)'doi"] - --- normalization: - -*PGF.LexingAGreek> map normalize test - ["sofw~n tis*","sofw~n tine's*","sof~n tinw~n", - "sofo's tis*","sofoi' tine's*", - "ei) tis*","ei) tine's*", - "a)'nvrwpos* tis*","a)'nvrwpoi tine's*", - "doy~los* tis*","doy~loi tine's*", - "lo'gos* tis*","lo'goi tine's*","lo'gwn tinw~n", - "ei) poy tis* tina' i)'doi"] - -*PGF.LexingAGreek> map (normalize . denormalize) enclitics_expls == enclitics_expls -True -*PGF.LexingAGreek> map (denormalize . normalize) test == test -True - --} -- cgit v1.2.3