diff options
Diffstat (limited to 'src/GF/Text')
| -rw-r--r-- | src/GF/Text/Arabic.hs | 48 | ||||
| -rw-r--r-- | src/GF/Text/Greek.hs | 158 | ||||
| -rw-r--r-- | src/GF/Text/Hebrew.hs | 21 | ||||
| -rw-r--r-- | src/GF/Text/Russian.hs | 31 | ||||
| -rw-r--r-- | src/GF/Text/Text.hs | 56 | ||||
| -rw-r--r-- | src/GF/Text/UTF8.hs | 35 | ||||
| -rw-r--r-- | src/GF/Text/Unicode.hs | 24 |
7 files changed, 373 insertions, 0 deletions
diff --git a/src/GF/Text/Arabic.hs b/src/GF/Text/Arabic.hs new file mode 100644 index 000000000..6df79c4a9 --- /dev/null +++ b/src/GF/Text/Arabic.hs @@ -0,0 +1,48 @@ +module Arabic where + +mkArabic :: String -> String +mkArabic = reverse . unwords . (map mkArabicWord) . words +--- reverse : assumes everything's on same line + +type ArabicChar = Char + +mkArabicWord :: String -> [ArabicChar] +mkArabicWord = map mkArabicChar . getLetterPos + +getLetterPos :: String -> [(Char,Int)] +getLetterPos [] = [] +getLetterPos ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80 +getLetterPos ('O':cs) = ('*',8) : getIn cs -- 0xfe8b +getLetterPos ('l':'a':cs) = ('*',5) : getLetterPos cs -- 0xfefb +getLetterPos [c] = [(c,1)] -- 1=isolated +getLetterPos (c:cs) | isReduced c = (c,1) : getLetterPos cs +getLetterPos (c:cs) = (c,3) : getIn cs -- 3=initial + + +getIn [] = [] +getIn ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80 +getIn ('O':cs) = ('*',9) : getIn cs -- 0xfe8c +getIn ('l':'a':cs) = ('*',6) : getLetterPos cs -- 0xfefc +getIn [c] = [(c,2)] -- 2=final +getIn (c:cs) | isReduced c = (c,2) : getLetterPos cs +getIn (c:cs) = (c,4) : getIn cs -- 4=medial + +isReduced :: Char -> Bool +isReduced c = c `elem` "UuWiYOaAdVrzwj" + +mkArabicChar ('*',p) | p > 4 && p < 10 = + (map toEnum [0xfefb,0xfefc,0xfe80,0xfe8b,0xfe8c]) !! (p-5) +mkArabicChar cp@(c,p) = case lookup c cc of Just c' -> (c' !! (p-1)) ; _ -> c + where + cc = mkArabicTab allArabicCodes allArabic + +mkArabicTab (c:cs) as = (c,as1) : mkArabicTab cs as2 where + (as1,as2) = if isReduced c then splitAt 2 as else splitAt 4 as +mkArabicTab [] _ = [] + +allArabicCodes = "UuWiYOabAtvgHCdVrzscSDTZoxfqklmnhwjy" + +allArabic :: String +allArabic = (map toEnum [0xfe81 .. 0xfef4]) -- I=0xfe80 + + diff --git a/src/GF/Text/Greek.hs b/src/GF/Text/Greek.hs new file mode 100644 index 000000000..8cbba8c54 --- /dev/null +++ b/src/GF/Text/Greek.hs @@ -0,0 +1,158 @@ +module Greek where + +mkGreek :: String -> String +mkGreek = unwords . (map mkGreekWord) . mkGravis . words + +--- TODO : optimize character formation by factorizing the case expressions + +type GreekChar = Char + +mkGreekWord :: String -> [GreekChar] +mkGreekWord = map (toEnum . mkGreekChar) . mkGreekSpec + +mkGravis :: [String] -> [String] +mkGravis [] = [] +mkGravis [w] = [w] +mkGravis (w1:w2:ws) + | stressed w2 = mkG w1 : mkGravis (w2:ws) + | otherwise = w1 : w2 : mkGravis ws + where + stressed w = any (`elem` "'~`") w + mkG :: String -> String + mkG w = let (w1,w2) = span (/='\'') w in + case w2 of + '\'':v:cs | not (any isVowel cs) -> w1 ++ "`" ++ [v] ++ cs + '\'':'!':v:cs | not (any isVowel cs) -> w1 ++ "`!" ++ [v] ++ cs + _ -> w + isVowel c = elem c "aehiouw" + +mkGreekSpec :: String -> [(Char,Int)] +mkGreekSpec str = case str of + [] -> [] + '(' :'\'': '!' : c : cs -> (c,25) : mkGreekSpec cs + '(' :'~' : '!' : c : cs -> (c,27) : mkGreekSpec cs + '(' :'`' : '!' : c : cs -> (c,23) : mkGreekSpec cs + '(' : '!' : c : cs -> (c,21) : mkGreekSpec cs + ')' :'\'': '!' : c : cs -> (c,24) : mkGreekSpec cs + ')' :'~' : '!' : c : cs -> (c,26) : mkGreekSpec cs + ')' :'`' : '!' : c : cs -> (c,22) : mkGreekSpec cs + ')' : '!' : c : cs -> (c,20) : mkGreekSpec cs + '\'': '!' : c : cs -> (c,30) : mkGreekSpec cs + '~' : '!' : c : cs -> (c,31) : mkGreekSpec cs + '`' : '!' : c : cs -> (c,32) : mkGreekSpec cs + '!' : c : cs -> (c,33) : mkGreekSpec cs + '(' :'\'': c : cs -> (c,5) : mkGreekSpec cs + '(' :'~' : c : cs -> (c,7) : mkGreekSpec cs + '(' :'`' : c : cs -> (c,3) : mkGreekSpec cs + '(' : c : cs -> (c,1) : mkGreekSpec cs + ')' :'\'': c : cs -> (c,4) : mkGreekSpec cs + ')' :'~' : c : cs -> (c,6) : mkGreekSpec cs + ')' :'`' : c : cs -> (c,2) : mkGreekSpec cs + ')' : c : cs -> (c,0) : mkGreekSpec cs + '\'': c : cs -> (c,10) : mkGreekSpec cs + '~' : c : cs -> (c,11) : mkGreekSpec cs + '`' : c : cs -> (c,12) : mkGreekSpec cs + c : cs -> (c,-1) : mkGreekSpec cs + +mkGreekChar (c,-1) = case lookup c cc of Just c' -> c' ; _ -> fromEnum c + where + cc = zip "abgdezhqiklmnxoprjstyfcuw" allGreekMin +mkGreekChar (c,n) = case (c,n) of + ('a',10) -> 0x03ac + ('a',11) -> 0x1fb6 + ('a',12) -> 0x1f70 + ('a',30) -> 0x1fb4 + ('a',31) -> 0x1fb7 + ('a',32) -> 0x1fb2 + ('a',33) -> 0x1fb3 + ('a',n) | n >19 -> 0x1f80 + n - 20 + ('a',n) -> 0x1f00 + n + ('e',10) -> 0x03ad -- ' +-- ('e',11) -> 0x1fb6 -- ~ can't happen + ('e',12) -> 0x1f72 -- ` + ('e',n) -> 0x1f10 + n + ('h',10) -> 0x03ae -- ' + ('h',11) -> 0x1fc6 -- ~ + ('h',12) -> 0x1f74 -- ` + + ('h',30) -> 0x1fc4 + ('h',31) -> 0x1fc7 + ('h',32) -> 0x1fc2 + ('h',33) -> 0x1fc3 + ('h',n) | n >19 -> 0x1f90 + n - 20 + + ('h',n) -> 0x1f20 + n + ('i',10) -> 0x03af -- ' + ('i',11) -> 0x1fd6 -- ~ + ('i',12) -> 0x1f76 -- ` + ('i',n) -> 0x1f30 + n + ('o',10) -> 0x03cc -- ' +-- ('o',11) -> 0x1fb6 -- ~ can't happen + ('o',12) -> 0x1f78 -- ` + ('o',n) -> 0x1f40 + n + ('y',10) -> 0x03cd -- ' + ('y',11) -> 0x1fe6 -- ~ + ('y',12) -> 0x1f7a -- ` + ('y',n) -> 0x1f50 + n + ('w',10) -> 0x03ce -- ' + ('w',11) -> 0x1ff6 -- ~ + ('w',12) -> 0x1f7c -- ` + + ('w',30) -> 0x1ff4 + ('w',31) -> 0x1ff7 + ('w',32) -> 0x1ff2 + ('w',33) -> 0x1ff3 + ('w',n) | n >19 -> 0x1fa0 + n - 20 + + ('w',n) -> 0x1f60 + n + ('r',1) -> 0x1fe5 + _ -> mkGreekChar (c,-1) --- should not happen + +allGreekMin :: [Int] +allGreekMin = [0x03b1 .. 0x03c9] + + +{- +encoding of Greek writing. Those hard to guess are marked with --- + + maj min +A a Alpha 0391 03b1 +B b Beta 0392 03b2 +G g Gamma 0393 03b3 +D d Delta 0394 03b4 +E e Epsilon 0395 03b5 +Z z Zeta 0396 03b6 +H h Eta --- 0397 03b7 +Q q Theta --- 0398 03b8 +I i Iota 0399 03b9 +K k Kappa 039a 03ba +L l Lambda 039b 03bb +M m My 039c 03bc +N n Ny 039d 03bd +X x Xi 039e 03be +O o Omikron 039f 03bf +P p Pi 03a0 03c0 +R r Rho 03a1 03c1 + j Sigma --- 03c2 +S s Sigma 03a3 03c3 +T t Tau 03a4 03c4 +Y y Ypsilon 03a5 03c5 +F f Phi 03a6 03c6 +C c Khi --- 03a7 03c7 +U u Psi 03a8 03c8 +W w Omega --- 03a9 03c9 + +( spiritus asper +) spiritus lenis +! iota subscriptum + +' acutus +~ circumflexus +` gravis + +-} + + + + + diff --git a/src/GF/Text/Hebrew.hs b/src/GF/Text/Hebrew.hs new file mode 100644 index 000000000..ebcc078e3 --- /dev/null +++ b/src/GF/Text/Hebrew.hs @@ -0,0 +1,21 @@ +module Hebrew where + +mkHebrew :: String -> String +mkHebrew = reverse . unwords . (map mkHebrewWord) . words +--- reverse : assumes everything's on same line + +type HebrewChar = Char + +mkHebrewWord :: String -> [HebrewChar] +mkHebrewWord = map mkHebrewChar + +mkHebrewChar c = case lookup c cc of Just c' -> c' ; _ -> c + where + cc = zip allHebrewCodes allHebrew + +allHebrewCodes = "-abgdhwzHTyKklMmNnSoPpCcqrst" + +allHebrew :: String +allHebrew = (map toEnum (0x05be : [0x05d0 .. 0x05ea])) + + diff --git a/src/GF/Text/Russian.hs b/src/GF/Text/Russian.hs new file mode 100644 index 000000000..07605a83a --- /dev/null +++ b/src/GF/Text/Russian.hs @@ -0,0 +1,31 @@ +module Russian where + +-- an ad hoc ASCII encoding. Delimiters: /_ _/ +mkRussian :: String -> String +mkRussian = unwords . (map mkRussianWord) . words + +-- the KOI8 encoding, incomplete. Delimiters: /* */ +mkRusKOI8 :: String -> String +mkRusKOI8 = unwords . (map mkRussianKOI8) . words + +type RussianChar = Char + +mkRussianWord :: String -> [RussianChar] +mkRussianWord = map (mkRussianChar allRussianCodes) + +mkRussianKOI8 :: String -> [RussianChar] +mkRussianKOI8 = map (mkRussianChar allRussianKOI8) + +mkRussianChar chars c = case lookup c cc of Just c' -> c' ; _ -> c + where + cc = zip chars allRussian + +allRussianCodes = + "ÅåABVGDEXZIJKLMNOPRSTUFHCQW£}!*ÖYÄabvgdexzijklmnoprstufhcqw#01'öyä" +allRussianKOI8 = + "^@áâ÷çäåöúéêëìíîïðòóôõæèãþûýøùÿüàñÁÂ×ÇÄÅÖÚÉÊËÌÍÎÏÐÒÓÔÕÆÈÃÞÛÝØÙßÜÀÑ" + +allRussian :: String +allRussian = (map toEnum (0x0401:0x0451:[0x0410 .. 0x044f])) -- Ëë in odd places + + diff --git a/src/GF/Text/Text.hs b/src/GF/Text/Text.hs new file mode 100644 index 000000000..08e897a9b --- /dev/null +++ b/src/GF/Text/Text.hs @@ -0,0 +1,56 @@ +module Text where + +import Operations +import Char + +-- elementary text postprocessing. AR 21/11/2001 +-- This is very primitive indeed. The functions should work on +-- token lists and not on strings. AR 5/12/2002 + + +formatAsTextLit :: String -> String +formatAsTextLit = formatAsText . unwords . map unStringLit . words +--- hope that there will be deforestation... + +formatAsCodeLit :: String -> String +formatAsCodeLit = formatAsCode . unwords . map unStringLit . words + +formatAsText :: String -> String +formatAsText = unwords . format . cap . words where + format ws = case ws of + w : c : ww | major c -> (w ++ c) : format (cap ww) + w : c : ww | minor c -> (w ++ c) : format ww + c : ww | para c -> "\n\n" : format ww + w : ww -> w : format ww + [] -> [] + cap (p:(c:cs):ww) | para p = p : (toUpper c : cs) : ww + cap ((c:cs):ww) = (toUpper c : cs) : ww + cap [] = [] + major = flip elem (map singleton ".!?") + minor = flip elem (map singleton ",:;") + para = (=="<p>") + +formatAsCode :: String -> String +formatAsCode = unwords . format . words where + format ws = case ws of + p : w : ww | parB p -> format ((p ++ w') : ww') where (w':ww') = format (w:ww) + w : p : ww | par p -> format ((w ++ p') : ww') where (p':ww') = format (p:ww) + w : ww -> w : format ww + [] -> [] + parB = flip elem (map singleton "([{") + parE = flip elem (map singleton "}])") + par t = parB t || parE t + +performBinds :: String -> String +performBinds = unwords . format . words where + format ws = case ws of + w : "&+" : u : ws -> format ((w ++ u) : ws) + w : ws -> w : format ws + [] -> [] + +unStringLit :: String -> String +unStringLit s = case s of + c : cs | strlim c && strlim (last cs) -> init cs + _ -> s + where + strlim = (=='\'') diff --git a/src/GF/Text/UTF8.hs b/src/GF/Text/UTF8.hs new file mode 100644 index 000000000..57b711b4b --- /dev/null +++ b/src/GF/Text/UTF8.hs @@ -0,0 +1,35 @@ +module UTF8 where + +-- From the Char module supplied with HBC. +-- code by Thomas Hallgren (Jul 10 1999) + +-- Take a Unicode string and encode it as a string +-- with the UTF8 method. +decodeUTF8 :: String -> String +decodeUTF8 "" = "" +decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs +decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' && + '\x80' <= c' && c' <= '\xbf' = + toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs +decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' && + '\x80' <= c' && c' <= '\xbf' && + '\x80' <= c'' && c'' <= '\xbf' = + toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs +decodeUTF8 _ = error "UniChar.decodeUTF8: bad data" + +encodeUTF8 :: String -> String +encodeUTF8 "" = "" +encodeUTF8 (c:cs) = + if c > '\x0000' && c < '\x0080' then + c : encodeUTF8 cs + else if c < toEnum 0x0800 then + let i = fromEnum c + in toEnum (0xc0 + i `div` 0x40) : + toEnum (0x80 + i `mod` 0x40) : + encodeUTF8 cs + else + let i = fromEnum c + in toEnum (0xe0 + i `div` 0x1000) : + toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) : + toEnum (0x80 + i `mod` 0x40) : + encodeUTF8 cs diff --git a/src/GF/Text/Unicode.hs b/src/GF/Text/Unicode.hs new file mode 100644 index 000000000..78aba0461 --- /dev/null +++ b/src/GF/Text/Unicode.hs @@ -0,0 +1,24 @@ +module Unicode where + +import Greek (mkGreek) +import Arabic (mkArabic) +import Hebrew (mkHebrew) +import Russian (mkRussian, mkRusKOI8) + +-- ad hoc Unicode conversions from different alphabets + +-- AR 12/4/2000, 18/9/2001, 30/5/2002 + +mkUnicode s = case s of + '/':'/':cs -> mkGreek (remClosing cs) + '/':'+':cs -> mkHebrew (remClosing cs) + '/':'-':cs -> mkArabic (remClosing cs) + '/':'_':cs -> mkRussian (remClosing cs) + '/':'*':cs -> mkRusKOI8 (remClosing cs) + _ -> s + +remClosing cs + | lcs > 1 && last cs == '/' = take (lcs-2) cs + | otherwise = cs + where lcs = length cs + |
