From b96b36f43de3e2f8b58d5f539daa6f6d47f25870 Mon Sep 17 00:00:00 2001 From: aarne Date: Wed, 25 Jun 2008 16:43:48 +0000 Subject: removed src for 2.9 --- src/GF/Text/Arabic.hs | 63 ------- src/GF/Text/Devanagari.hs | 97 ----------- src/GF/Text/Ethiopic.hs | 72 -------- src/GF/Text/ExtendedArabic.hs | 99 ----------- src/GF/Text/ExtraDiacritics.hs | 37 ---- src/GF/Text/Greek.hs | 172 ------------------- src/GF/Text/Hebrew.hs | 53 ------ src/GF/Text/Hiragana.hs | 95 ----------- src/GF/Text/LatinASupplement.hs | 69 -------- src/GF/Text/OCSCyrillic.hs | 47 ----- src/GF/Text/Russian.hs | 56 ------ src/GF/Text/Tamil.hs | 77 --------- src/GF/Text/Text.hs | 149 ---------------- src/GF/Text/Thai.hs | 368 ---------------------------------------- src/GF/Text/UTF8.hs | 48 ------ src/GF/Text/Unicode.hs | 69 -------- 16 files changed, 1571 deletions(-) delete mode 100644 src/GF/Text/Arabic.hs delete mode 100644 src/GF/Text/Devanagari.hs delete mode 100644 src/GF/Text/Ethiopic.hs delete mode 100644 src/GF/Text/ExtendedArabic.hs delete mode 100644 src/GF/Text/ExtraDiacritics.hs delete mode 100644 src/GF/Text/Greek.hs delete mode 100644 src/GF/Text/Hebrew.hs delete mode 100644 src/GF/Text/Hiragana.hs delete mode 100644 src/GF/Text/LatinASupplement.hs delete mode 100644 src/GF/Text/OCSCyrillic.hs delete mode 100644 src/GF/Text/Russian.hs delete mode 100644 src/GF/Text/Tamil.hs delete mode 100644 src/GF/Text/Text.hs delete mode 100644 src/GF/Text/Thai.hs delete mode 100644 src/GF/Text/UTF8.hs delete mode 100644 src/GF/Text/Unicode.hs (limited to 'src/GF/Text') diff --git a/src/GF/Text/Arabic.hs b/src/GF/Text/Arabic.hs deleted file mode 100644 index c482b1172..000000000 --- a/src/GF/Text/Arabic.hs +++ /dev/null @@ -1,63 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : Arabic --- Maintainer : (Maintainer) --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:34 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.6 $ --- --- (Description of the module) ------------------------------------------------------------------------------ - -module GF.Text.Arabic (mkArabic) where - -mkArabic :: String -> String -mkArabic = unwords . (map mkArabicWord) . words -----mkArabic = reverse . unwords . (map mkArabicWord) . words ---- reverse : assumes everything's on same line - -type ArabicChar = Char - -mkArabicWord :: String -> [ArabicChar] -mkArabicWord = map mkArabicChar . getLetterPos - -getLetterPos :: String -> [(Char,Int)] -getLetterPos [] = [] -getLetterPos ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80 -getLetterPos ('O':cs) = ('*',8) : getIn cs -- 0xfe8b -getLetterPos ('l':'a':cs) = ('*',5) : getLetterPos cs -- 0xfefb -getLetterPos [c] = [(c,1)] -- 1=isolated -getLetterPos (c:cs) | isReduced c = (c,1) : getLetterPos cs -getLetterPos (c:cs) = (c,3) : getIn cs -- 3=initial - - -getIn [] = [] -getIn ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80 -getIn ('O':cs) = ('*',9) : getIn cs -- 0xfe8c -getIn ('l':'a':cs) = ('*',6) : getLetterPos cs -- 0xfefc -getIn [c] = [(c,2)] -- 2=final -getIn (c:cs) | isReduced c = (c,2) : getLetterPos cs -getIn (c:cs) = (c,4) : getIn cs -- 4=medial - -isReduced :: Char -> Bool -isReduced c = c `elem` "UuWiYOaAdVrzwj" - -mkArabicChar ('*',p) | p > 4 && p < 10 = - (map toEnum [0xfefb,0xfefc,0xfe80,0xfe8b,0xfe8c]) !! (p-5) -mkArabicChar cp@(c,p) = case lookup c cc of Just c' -> (c' !! (p-1)) ; _ -> c - where - cc = mkArabicTab allArabicCodes allArabic - -mkArabicTab (c:cs) as = (c,as1) : mkArabicTab cs as2 where - (as1,as2) = if isReduced c then splitAt 2 as else splitAt 4 as -mkArabicTab [] _ = [] - -allArabicCodes = "UuWiYOabAtvgHCdVrzscSDTZoxfqklmnhwjy" - -allArabic :: String -allArabic = (map toEnum [0xfe81 .. 0xfef4]) -- I=0xfe80 - - diff --git a/src/GF/Text/Devanagari.hs b/src/GF/Text/Devanagari.hs deleted file mode 100644 index bf4343cd0..000000000 --- a/src/GF/Text/Devanagari.hs +++ /dev/null @@ -1,97 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : Devanagari --- Maintainer : (Maintainer) --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:34 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.5 $ --- --- (Description of the module) ------------------------------------------------------------------------------ - -module GF.Text.Devanagari (mkDevanagari) where - -mkDevanagari :: String -> String -mkDevanagari = digraphWordToUnicode . adHocToDigraphWord - -adHocToDigraphWord :: String -> [(Char, Char)] -adHocToDigraphWord str = case str of - [] -> [] - '<' : cs -> ('\\', '<') : spoolMarkup cs - ' ' : cs -> ('\\', ' ') : adHocToDigraphWord cs -- skip space - --- if c1 is a vowel - -- Two of the same vowel => lengthening - c1 : c2 : cs | c1 == c2 && isVowel c1 -> (cap c1, ':') : adHocToDigraphWord cs - -- digraphed or long vowel - c1 : c2 : cs | isVowel c1 && isVowel c2 -> (cap c1, cap c2) : adHocToDigraphWord cs - c1 : cs | isVowel c1 -> (' ', cap c1) : adHocToDigraphWord cs - --- c1 isn't a vowel - -- c1 : 'a' : [] -> [(' ', c1)] -- a inherent - -- c1 : c2 : [] | isVowel c2 -> (' ', c1) : [(' ', c2)] - - -- c1 is aspirated - c1 : 'H' : c2 : c3 : cs | c2 == c3 && isVowel c2 -> - (c1, 'H') : (c2, ':') : adHocToDigraphWord cs - c1 : 'H' : c2 : c3 : cs | isVowel c2 && isVowel c3 -> - (c1, 'H') : (c2, c3) : adHocToDigraphWord cs - c1 : 'H' : 'a' : cs -> (c1, 'H') : adHocToDigraphWord cs -- a inherent - c1 : 'H' : c2 : cs | isVowel c2 -> (c1, 'H') : (' ', c2) : adHocToDigraphWord cs - -- not vowelless at EOW - c1 : 'H' : ' ' : cs -> (c1, 'H') : ('\\', ' ') : adHocToDigraphWord cs - c1 : 'H' : [] -> [(c1, 'H')] - c1 : 'H' : cs -> (c1, 'H') : (' ', '^') : adHocToDigraphWord cs -- vowelless - - -- c1 unasp. - c1 : c2 : c3 : cs | c2 == c3 && isVowel c2 -> (' ', c1) : (c2, ':') : adHocToDigraphWord cs - c1 : c2 : c3 : cs | isVowel c2 && isVowel c3 -> (' ', c1) : (c2, c3) : adHocToDigraphWord cs - c1 : 'a' : cs -> (' ', c1) : adHocToDigraphWord cs -- a inherent - c1 : c2 : cs | isVowel c2 -> (' ', c1) : (' ', c2) : adHocToDigraphWord cs - -- not vowelless at EOW - c1 : ' ' : cs -> (' ', c1) : ('\\', ' '): adHocToDigraphWord cs - c1 : [] -> [(' ', c1)] - 'M' : cs -> (' ', 'M') : adHocToDigraphWord cs -- vowelless but no vowelless sign for anusvara - c1 : cs -> (' ', c1) : (' ', '^') : adHocToDigraphWord cs -- vowelless - -isVowel x = elem x "aeiou:" -cap :: Char -> Char -cap x = case x of - 'a' -> 'A' - 'e' -> 'E' - 'i' -> 'I' - 'o' -> 'O' - 'u' -> 'U' - c -> c - -spoolMarkup :: String -> [(Char, Char)] -spoolMarkup s = case s of - -- [] -> [] -- Shouldn't happen - '>' : cs -> ('\\', '>') : adHocToDigraphWord cs - c1 : cs -> ('\\', c1) : spoolMarkup cs - - -digraphWordToUnicode :: [(Char, Char)] -> String -digraphWordToUnicode = map digraphToUnicode - -digraphToUnicode :: (Char, Char) -> Char -digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2 - where - cc = zip allDevanagariCodes allDevanagari - -digraphedDevanagari = " ~ M ;__ AA: II: UU:RoLoEvE~ EE:AvA~ OAU kkH ggHNG ccH jjH \241 TTH DDH N ttH ddH nn. ppH bbH m y rr. l LL. v \231 S s h____ .-Sa: ii: uu:ror:eve~ eaiava~ oau ^____OM | -dddu______ Q X G zD.RH fy.R:L:mrmR#I#d#0#1#2#3#4#5#6#7#8#9#o" - -allDevanagariCodes :: [(Char, Char)] -allDevanagariCodes = mkPairs digraphedDevanagari - -allDevanagari :: String -allDevanagari = (map toEnum [0x0901 .. 0x0970]) - -mkPairs :: String -> [(Char, Char)] -mkPairs str = case str of - [] -> [] - c1 : c2 : cs -> (c1, c2) : mkPairs cs - diff --git a/src/GF/Text/Ethiopic.hs b/src/GF/Text/Ethiopic.hs deleted file mode 100644 index 81abbf719..000000000 --- a/src/GF/Text/Ethiopic.hs +++ /dev/null @@ -1,72 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : Ethiopic --- Maintainer : HH --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:35 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.6 $ --- --- Ascii-Unicode decoding for Ethiopian. --- Copyright (c) Harald Hammarström 2003 under Gnu General Public License ------------------------------------------------------------------------------ - -module GF.Text.Ethiopic (mkEthiopic) where - -mkEthiopic :: String -> String -mkEthiopic = digraphWordToUnicode . adHocToDigraphWord - --- mkEthiopic :: String -> String --- mkEthiopic = reverse . unwords . (map (digraphWordToUnicode . adHocToDigraphWord)) . words ---- reverse : assumes everything's on same line - -adHocToDigraphWord :: String -> [(Char, Int)] -adHocToDigraphWord str = case str of - [] -> [] - '<' : cs -> ('<', -1) : spoolMarkup cs - c1 : cs | isVowel c1 -> (')', vowelOrder c1) : adHocToDigraphWord cs - -- c1 isn't a vowel - c1 : cs | not (elem c1 allEthiopicCodes) -> (c1, -1) : adHocToDigraphWord cs - c1 : c2 : cs | isVowel c2 -> (c1, vowelOrder c2) : adHocToDigraphWord cs - c1 : cs -> (c1, 5) : adHocToDigraphWord cs - -spoolMarkup :: String -> [(Char, Int)] -spoolMarkup s = case s of - -- [] -> [] -- Shouldn't happen - '>' : cs -> ('>', -1) : adHocToDigraphWord cs - c1 : cs -> (c1, -1) : spoolMarkup cs - -isVowel x = elem x "A\228ui\239aeoI" - -vowelOrder :: Char -> Int -vowelOrder x = case x of - 'A' -> 0 - '\228' -> 0 -- ä - 'u' -> 1 - 'i' -> 2 - 'a' -> 3 - 'e' -> 4 - 'I' -> 5 - '\239' -> 5 -- ï - 'o' -> 6 - c -> 5 -- vowelless - -digraphWordToUnicode = map digraphToUnicode - -digraphToUnicode :: (Char, Int) -> Char --- digraphToUnicode (c1, c2) = c1 - -digraphToUnicode (c1, -1) = c1 -digraphToUnicode (c1, c2) = toEnum (0x1200 + c2 + 8*case lookup c1 cc of Just c' -> c') - where - cc = zip allEthiopicCodes allEthiopic - -allEthiopic :: [Int] -allEthiopic = [0 .. 44] -- x 8 - -allEthiopicCodes = "hlHmLrs$KQ__bBtcxXnN)kW__w(zZyd_jgG_TCPSLfp" - --- Q = kW, X = xW, W = kW, G = gW - diff --git a/src/GF/Text/ExtendedArabic.hs b/src/GF/Text/ExtendedArabic.hs deleted file mode 100644 index d2c5faac5..000000000 --- a/src/GF/Text/ExtendedArabic.hs +++ /dev/null @@ -1,99 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : ExtendedArabic --- Maintainer : (Maintainer) --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:36 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.6 $ --- --- (Description of the module) ------------------------------------------------------------------------------ - -module GF.Text.ExtendedArabic (mkArabic0600, mkExtendedArabic) where - -mkArabic0600 :: String -> String -mkArabic0600 = digraphWordToUnicode . aarnesToDigraphWord - -aarnesToDigraphWord :: String -> [(Char, Char)] -aarnesToDigraphWord str = case str of - [] -> [] - '<' : cs -> ('\\', '<') : spoolMarkup2 cs - - 'v' : cs -> ('T', 'H') : aarnesToDigraphWord cs - 'a' : cs -> (' ', 'A') : aarnesToDigraphWord cs - 'o' : cs -> (' ', '3') : aarnesToDigraphWord cs - 'O' : cs -> ('\'', 'i') : aarnesToDigraphWord cs - - 'u' : cs -> ('\'', 'A') : aarnesToDigraphWord cs - 'C' : cs -> (' ', 'X') : aarnesToDigraphWord cs - - 'U' : cs -> ('~', 'A') : aarnesToDigraphWord cs - 'A' : cs -> ('"', 't') : aarnesToDigraphWord cs - 'c' : cs -> ('s', 'h') : aarnesToDigraphWord cs - c : cs -> (' ', c) : aarnesToDigraphWord cs - -mkExtendedArabic :: String -> String -mkExtendedArabic = digraphWordToUnicode . adHocToDigraphWord - -adHocToDigraphWord :: String -> [(Char, Char)] -adHocToDigraphWord str = case str of - [] -> [] - '<' : cs -> ('\\', '<') : spoolMarkup cs - -- Sorani - 'W' : cs -> (':', 'w') : adHocToDigraphWord cs -- ?? Will do - 'E' : cs -> (' ', 'i') : adHocToDigraphWord cs -- ?? Letter missing! - 'j' : cs -> ('d', 'j') : adHocToDigraphWord cs - 'O' : cs -> ('v', 'w') : adHocToDigraphWord cs - 'F' : cs -> (' ', 'v') : adHocToDigraphWord cs - 'Z' : cs -> ('z', 'h') : adHocToDigraphWord cs - 'I' : cs -> (' ', 'i') : adHocToDigraphWord cs -- ?? Letter missing! - 'C' : cs -> ('c', 'h') : adHocToDigraphWord cs - -- Pashto - 'e' : cs -> (':', 'y') : adHocToDigraphWord cs - '$' : cs -> ('3', 'H') : adHocToDigraphWord cs - 'X' : cs -> ('s', '.') : adHocToDigraphWord cs - 'G' : cs -> ('z', '.') : adHocToDigraphWord cs - 'a' : cs -> (' ', 'A') : adHocToDigraphWord cs - 'P' : cs -> ('\'', 'H') : adHocToDigraphWord cs - 'R' : cs -> ('o', 'r') : adHocToDigraphWord cs - -- Shared - 'A' : cs -> (' ', 'h') : adHocToDigraphWord cs -- ?? Maybe to "t or 0x06d5 - 'c' : cs -> ('s', 'h') : adHocToDigraphWord cs - c : cs -> (' ', c) : adHocToDigraphWord cs - - --- Beginning 0x621 up and including 0x06d1 -digraphedExtendedArabic = " '~A'A'w,A'i A b\"t tTHdj H X dDH r z ssh S D T Z 3GH__________ - f q k l m n h w i y&a&w&i/a/w/i/W/o/~/'/,/|/6/v_____________#0#1#2#3#4#5#6#7#8#9#%#,#'#*>b>q$|> A2'2,3'A'w'w&y'Tb:b:BoT3b p4b4B'H:H2H\"H3Hch4HTdod.dTD:d:D3d3D4dTrvror.rvRz.:rzh4zs.+s*S:S3S3T33>ff.f: v4f.q3q-k~kok.k3k3K gog:g:G3Gvl.l3l3L:n>nTnon3n?h4H't>Y\"Yow-wvwww|w^w:w3w>y/yvy.w:y3y____ -ae" - -digraphWordToUnicode = map digraphToUnicode - -digraphToUnicode :: (Char, Char) -> Char -digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2 - where - cc = zip allExtendedArabicCodes allExtendedArabic - -allExtendedArabicCodes :: [(Char, Char)] -allExtendedArabicCodes = mkPairs digraphedExtendedArabic - -allExtendedArabic :: String -allExtendedArabic = (map toEnum [0x0621 .. 0x06d1]) - -mkPairs :: String -> [(Char, Char)] -mkPairs str = case str of - [] -> [] - c1 : c2 : cs -> (c1, c2) : mkPairs cs - -spoolMarkup :: String -> [(Char, Char)] -spoolMarkup s = case s of - [] -> [] -- Shouldn't happen - '>' : cs -> ('\\', '>') : adHocToDigraphWord cs - c1 : cs -> ('\\', c1) : spoolMarkup cs - -spoolMarkup2 :: String -> [(Char, Char)] -spoolMarkup2 s = case s of - [] -> [] -- Shouldn't happen - '>' : cs -> ('\\', '>') : aarnesToDigraphWord cs - c1 : cs -> ('\\', c1) : spoolMarkup2 cs diff --git a/src/GF/Text/ExtraDiacritics.hs b/src/GF/Text/ExtraDiacritics.hs deleted file mode 100644 index f3d811c2c..000000000 --- a/src/GF/Text/ExtraDiacritics.hs +++ /dev/null @@ -1,37 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : ExtraDiacritics --- Maintainer : (Maintainer) --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:36 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.5 $ --- --- (Description of the module) ------------------------------------------------------------------------------ - -module GF.Text.ExtraDiacritics (mkExtraDiacritics) where - -mkExtraDiacritics :: String -> String -mkExtraDiacritics = mkExtraDiacriticsWord - -mkExtraDiacriticsWord :: String -> String -mkExtraDiacriticsWord str = case str of - [] -> [] - '<' : cs -> '<' : spoolMarkup cs - -- - '/' : cs -> toEnum 0x0301 : mkExtraDiacriticsWord cs - '~' : cs -> toEnum 0x0306 : mkExtraDiacriticsWord cs - ':' : cs -> toEnum 0x0304 : mkExtraDiacriticsWord cs -- some of these could be put in LatinA - '.' : cs -> toEnum 0x0323 : mkExtraDiacriticsWord cs - 'i' : '-' : cs -> toEnum 0x0268 : mkExtraDiacriticsWord cs -- in IPA extensions - -- Default - c : cs -> c : mkExtraDiacriticsWord cs - -spoolMarkup :: String -> String -spoolMarkup s = case s of - [] -> [] -- Shouldn't happen - '>' : cs -> '>' : mkExtraDiacriticsWord cs - c1 : cs -> c1 : spoolMarkup cs diff --git a/src/GF/Text/Greek.hs b/src/GF/Text/Greek.hs deleted file mode 100644 index 6b9361a29..000000000 --- a/src/GF/Text/Greek.hs +++ /dev/null @@ -1,172 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : Greek --- Maintainer : (Maintainer) --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:37 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.5 $ --- --- (Description of the module) ------------------------------------------------------------------------------ - -module GF.Text.Greek (mkGreek) where - -mkGreek :: String -> String -mkGreek = unwords . (map mkGreekWord) . mkGravis . words - ---- TODO : optimize character formation by factorizing the case expressions - -type GreekChar = Char - -mkGreekWord :: String -> [GreekChar] -mkGreekWord = map (toEnum . mkGreekChar) . mkGreekSpec - -mkGravis :: [String] -> [String] -mkGravis [] = [] -mkGravis [w] = [w] -mkGravis (w1:w2:ws) - | stressed w2 = mkG w1 : mkGravis (w2:ws) - | otherwise = w1 : w2 : mkGravis ws - where - stressed w = any (`elem` "'~`") w - mkG :: String -> String - mkG w = let (w1,w2) = span (/='\'') w in - case w2 of - '\'':v:cs | not (any isVowel cs) -> w1 ++ "`" ++ [v] ++ cs - '\'':'!':v:cs | not (any isVowel cs) -> w1 ++ "`!" ++ [v] ++ cs - _ -> w - isVowel c = elem c "aehiouw" - -mkGreekSpec :: String -> [(Char,Int)] -mkGreekSpec str = case str of - [] -> [] - '(' :'\'': '!' : c : cs -> (c,25) : mkGreekSpec cs - '(' :'~' : '!' : c : cs -> (c,27) : mkGreekSpec cs - '(' :'`' : '!' : c : cs -> (c,23) : mkGreekSpec cs - '(' : '!' : c : cs -> (c,21) : mkGreekSpec cs - ')' :'\'': '!' : c : cs -> (c,24) : mkGreekSpec cs - ')' :'~' : '!' : c : cs -> (c,26) : mkGreekSpec cs - ')' :'`' : '!' : c : cs -> (c,22) : mkGreekSpec cs - ')' : '!' : c : cs -> (c,20) : mkGreekSpec cs - '\'': '!' : c : cs -> (c,30) : mkGreekSpec cs - '~' : '!' : c : cs -> (c,31) : mkGreekSpec cs - '`' : '!' : c : cs -> (c,32) : mkGreekSpec cs - '!' : c : cs -> (c,33) : mkGreekSpec cs - '(' :'\'': c : cs -> (c,5) : mkGreekSpec cs - '(' :'~' : c : cs -> (c,7) : mkGreekSpec cs - '(' :'`' : c : cs -> (c,3) : mkGreekSpec cs - '(' : c : cs -> (c,1) : mkGreekSpec cs - ')' :'\'': c : cs -> (c,4) : mkGreekSpec cs - ')' :'~' : c : cs -> (c,6) : mkGreekSpec cs - ')' :'`' : c : cs -> (c,2) : mkGreekSpec cs - ')' : c : cs -> (c,0) : mkGreekSpec cs - '\'': c : cs -> (c,10) : mkGreekSpec cs - '~' : c : cs -> (c,11) : mkGreekSpec cs - '`' : c : cs -> (c,12) : mkGreekSpec cs - c : cs -> (c,-1) : mkGreekSpec cs - -mkGreekChar (c,-1) = case lookup c cc of Just c' -> c' ; _ -> fromEnum c - where - cc = zip "abgdezhqiklmnxoprjstyfcuw" allGreekMin -mkGreekChar (c,n) = case (c,n) of - ('a',10) -> 0x03ac - ('a',11) -> 0x1fb6 - ('a',12) -> 0x1f70 - ('a',30) -> 0x1fb4 - ('a',31) -> 0x1fb7 - ('a',32) -> 0x1fb2 - ('a',33) -> 0x1fb3 - ('a',n) | n >19 -> 0x1f80 + n - 20 - ('a',n) -> 0x1f00 + n - ('e',10) -> 0x03ad -- ' --- ('e',11) -> 0x1fb6 -- ~ can't happen - ('e',12) -> 0x1f72 -- ` - ('e',n) -> 0x1f10 + n - ('h',10) -> 0x03ae -- ' - ('h',11) -> 0x1fc6 -- ~ - ('h',12) -> 0x1f74 -- ` - - ('h',30) -> 0x1fc4 - ('h',31) -> 0x1fc7 - ('h',32) -> 0x1fc2 - ('h',33) -> 0x1fc3 - ('h',n) | n >19 -> 0x1f90 + n - 20 - - ('h',n) -> 0x1f20 + n - ('i',10) -> 0x03af -- ' - ('i',11) -> 0x1fd6 -- ~ - ('i',12) -> 0x1f76 -- ` - ('i',n) -> 0x1f30 + n - ('o',10) -> 0x03cc -- ' --- ('o',11) -> 0x1fb6 -- ~ can't happen - ('o',12) -> 0x1f78 -- ` - ('o',n) -> 0x1f40 + n - ('y',10) -> 0x03cd -- ' - ('y',11) -> 0x1fe6 -- ~ - ('y',12) -> 0x1f7a -- ` - ('y',n) -> 0x1f50 + n - ('w',10) -> 0x03ce -- ' - ('w',11) -> 0x1ff6 -- ~ - ('w',12) -> 0x1f7c -- ` - - ('w',30) -> 0x1ff4 - ('w',31) -> 0x1ff7 - ('w',32) -> 0x1ff2 - ('w',33) -> 0x1ff3 - ('w',n) | n >19 -> 0x1fa0 + n - 20 - - ('w',n) -> 0x1f60 + n - ('r',1) -> 0x1fe5 - _ -> mkGreekChar (c,-1) --- should not happen - -allGreekMin :: [Int] -allGreekMin = [0x03b1 .. 0x03c9] - - -{- -encoding of Greek writing. Those hard to guess are marked with --- - - maj min -A a Alpha 0391 03b1 -B b Beta 0392 03b2 -G g Gamma 0393 03b3 -D d Delta 0394 03b4 -E e Epsilon 0395 03b5 -Z z Zeta 0396 03b6 -H h Eta --- 0397 03b7 -Q q Theta --- 0398 03b8 -I i Iota 0399 03b9 -K k Kappa 039a 03ba -L l Lambda 039b 03bb -M m My 039c 03bc -N n Ny 039d 03bd -X x Xi 039e 03be -O o Omikron 039f 03bf -P p Pi 03a0 03c0 -R r Rho 03a1 03c1 - j Sigma --- 03c2 -S s Sigma 03a3 03c3 -T t Tau 03a4 03c4 -Y y Ypsilon 03a5 03c5 -F f Phi 03a6 03c6 -C c Khi --- 03a7 03c7 -U u Psi 03a8 03c8 -W w Omega --- 03a9 03c9 - -( spiritus asper -) spiritus lenis -! iota subscriptum - -' acutus -~ circumflexus -` gravis - --} - - - - - diff --git a/src/GF/Text/Hebrew.hs b/src/GF/Text/Hebrew.hs deleted file mode 100644 index c7026d8da..000000000 --- a/src/GF/Text/Hebrew.hs +++ /dev/null @@ -1,53 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : Hebrew --- Maintainer : (Maintainer) --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:37 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.8 $ --- --- (Description of the module) ------------------------------------------------------------------------------ - -module GF.Text.Hebrew (mkHebrew) where - -mkHebrew :: String -> String -mkHebrew = mkHebrewWord -----mkHebrew = reverse . mkHebrewWord ---- reverse : assumes everything's on same line - -type HebrewChar = Char - --- HH 031103 added code for spooling the markup --- removed reverse, words, unwords --- (seemed obsolete and come out wrong on the screen) --- AR 26/1/2004 put reverse back - needed in Fudgets (but not in Java?) - -mkHebrewWord :: String -> [HebrewChar] --- mkHebrewWord = map mkHebrewChar - -mkHebrewWord s = case s of - [] -> [] - '<' : cs -> '<' : spoolMarkup cs - ' ' : cs -> ' ' : mkHebrewWord cs - c1 : cs -> mkHebrewChar c1 : mkHebrewWord cs - -spoolMarkup :: String -> String -spoolMarkup s = case s of - [] -> [] -- Shouldn't happen - '>' : cs -> '>' : mkHebrewWord cs - c1 : cs -> c1 : spoolMarkup cs - -mkHebrewChar c = case lookup c cc of Just c' -> c' ; _ -> c - where - cc = zip allHebrewCodes allHebrew - -allHebrewCodes = "-abgdhwzHTyKklMmNnSoPpCcqrst" - -allHebrew :: String -allHebrew = (map toEnum (0x05be : [0x05d0 .. 0x05ea])) - - diff --git a/src/GF/Text/Hiragana.hs b/src/GF/Text/Hiragana.hs deleted file mode 100644 index ba74fc83c..000000000 --- a/src/GF/Text/Hiragana.hs +++ /dev/null @@ -1,95 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : Hiragana --- Maintainer : (Maintainer) --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:38 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.5 $ --- --- (Description of the module) ------------------------------------------------------------------------------ - -module GF.Text.Hiragana (mkJapanese) where - --- long vowel romaaji must be ei, ou not ee, oo - -mkJapanese :: String -> String -mkJapanese = digraphWordToUnicode . romaajiToDigraphWord - -romaajiToDigraphWord :: String -> [(Char, Char)] -romaajiToDigraphWord str = case str of - [] -> [] - '<' : cs -> ('\\', '<') : spoolMarkup cs - ' ' : cs -> ('\\', ' ') : romaajiToDigraphWord cs - - c1 : cs | isVowel c1 -> (' ', cap c1) : romaajiToDigraphWord cs - - -- The combinations - c1 : 'y' : c2 : cs -> (c1, 'i') : ('y', cap c2) : romaajiToDigraphWord cs - - 's' : 'h' : 'a' : cs -> ('S', 'i') : ('y', 'A') : romaajiToDigraphWord cs - 'c' : 'h' : 'a' : cs -> ('C', 'i') : ('y', 'A') : romaajiToDigraphWord cs - 'j' : 'a' : cs -> ('j', 'i') : ('y', 'A') : romaajiToDigraphWord cs - - 's' : 'h' : 'u' : cs -> ('S', 'i') : ('y', 'U') : romaajiToDigraphWord cs - 'c' : 'h' : 'u' : cs -> ('C', 'i') : ('y', 'U') : romaajiToDigraphWord cs - 'j' : 'u' : cs -> ('j', 'i') : ('y', 'U') : romaajiToDigraphWord cs - - 's' : 'h' : 'o' : cs -> ('S', 'i') : ('y', 'O') : romaajiToDigraphWord cs - 'c' : 'h' : 'o' : cs -> ('C', 'i') : ('y', 'O') : romaajiToDigraphWord cs - 'j' : 'o' : cs -> ('j', 'i') : ('y', 'O') : romaajiToDigraphWord cs - - 'd' : 'z' : c3 : cs -> ('D', c3) : romaajiToDigraphWord cs - 't' : 's' : c3 : cs -> ('T', c3) : romaajiToDigraphWord cs - 'c' : 'h' : c3 : cs -> ('C', c3) : romaajiToDigraphWord cs - 's' : 'h' : c3 : cs -> ('S', c3) : romaajiToDigraphWord cs - 'z' : 'h' : c3 : cs -> ('Z', c3) : romaajiToDigraphWord cs - - c1 : ' ' : cs -> (' ', c1) : ('\\', ' ') : romaajiToDigraphWord cs -- n - c1 : [] -> [(' ', c1)] -- n - - c1 : c2 : cs | isVowel c2 -> (c1, c2) : romaajiToDigraphWord cs - c1 : c2 : cs | c1 == c2 -> ('T', 'U') : romaajiToDigraphWord (c2 : cs) -- double cons - c1 : cs -> (' ', c1) : romaajiToDigraphWord cs -- n - -isVowel x = elem x "aeiou" -cap :: Char -> Char -cap x = case x of - 'a' -> 'A' - 'e' -> 'E' - 'i' -> 'I' - 'o' -> 'O' - 'u' -> 'U' - c -> c - -spoolMarkup :: String -> [(Char, Char)] -spoolMarkup s = case s of - -- [] -> [] -- Shouldn't happen - '>' : cs -> ('\\', '>') : romaajiToDigraphWord cs - c1 : cs -> ('\\', c1) : spoolMarkup cs - -digraphWordToUnicode :: [(Char, Char)] -> String -digraphWordToUnicode = map digraphToUnicode - -digraphToUnicode :: (Char, Char) -> Char -digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2 - where - cc = zip allHiraganaCodes allHiragana - -allHiraganaCodes :: [(Char, Char)] -allHiraganaCodes = mkPairs digraphedHiragana - -allHiragana :: String -allHiragana = (map toEnum [0x3041 .. 0x309f]) - -mkPairs :: String -> [(Char, Char)] -mkPairs str = case str of - [] -> [] - c1 : c2 : cs -> (c1, c2) : mkPairs cs - -digraphedHiragana = " a A i I u U e E o OkagakigikugukegekogosazaSiZisuzusezesozotadaCijiTUTuDutedetodonaninunenohabapahibipihubupuhebepehobopomamimumemoyAyayUyuyOyorarirurerowaWawiwewo nvukAkE____<< o>>o >'> b" - - diff --git a/src/GF/Text/LatinASupplement.hs b/src/GF/Text/LatinASupplement.hs deleted file mode 100644 index f42423c91..000000000 --- a/src/GF/Text/LatinASupplement.hs +++ /dev/null @@ -1,69 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : LatinASupplement --- Maintainer : (Maintainer) --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:39 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.5 $ --- --- (Description of the module) ------------------------------------------------------------------------------ - -module GF.Text.LatinASupplement (mkLatinASupplement) where - -mkLatinASupplement :: String -> String -mkLatinASupplement = mkLatinASupplementWord - -mkLatinASupplementWord :: String -> String -mkLatinASupplementWord str = case str of - [] -> [] - '<' : cs -> '<' : spoolMarkup cs - -- Romanian & partly Turkish - 's' : ',' : cs -> toEnum 0x015f : mkLatinASupplementWord cs - 'a' : '%' : cs -> toEnum 0x0103 : mkLatinASupplementWord cs - -- Slavic and more - 'c' : '^' : cs -> toEnum 0x010d : mkLatinASupplementWord cs - 's' : '^' : cs -> toEnum 0x0161 : mkLatinASupplementWord cs - 'c' : '\'' : cs -> toEnum 0x0107 : mkLatinASupplementWord cs - 'z' : '^' : cs -> toEnum 0x017e : mkLatinASupplementWord cs - -- Turkish - 'g' : '%' : cs -> toEnum 0x011f : mkLatinASupplementWord cs - 'I' : cs -> toEnum 0x0131 : mkLatinASupplementWord cs - 'c' : ',' : cs -> toEnum 0x00e7 : mkLatinASupplementWord cs - -- Polish - 'e' : ',' : cs -> toEnum 0x0119 : mkLatinASupplementWord cs - 'a' : ',' : cs -> toEnum 0x0105 : mkLatinASupplementWord cs - 'l' : '/' : cs -> toEnum 0x0142 : mkLatinASupplementWord cs - 'z' : '.' : cs -> toEnum 0x017c : mkLatinASupplementWord cs - 'n' : '\'' : cs -> toEnum 0x0144 : mkLatinASupplementWord cs - 's' : '\'' : cs -> toEnum 0x015b : mkLatinASupplementWord cs --- 'c' : '\'' : cs -> toEnum 0x0107 : mkLatinASupplementWord cs - - -- Hungarian - 'o' : '%' : cs -> toEnum 0x0151 : mkLatinASupplementWord cs - 'u' : '%' : cs -> toEnum 0x0171 : mkLatinASupplementWord cs - - -- Mongolian - 'j' : '^' : cs -> toEnum 0x0135 : mkLatinASupplementWord cs - - -- Khowar (actually in Combining diacritical marks not Latin-A Suppl.) - 'o' : '.' : cs -> 'o' : (toEnum 0x0323 : mkLatinASupplementWord cs) - - -- Length bars over vowels e.g korean - 'a' : ':' : cs -> toEnum 0x0101 : mkLatinASupplementWord cs - 'e' : ':' : cs -> toEnum 0x0113 : mkLatinASupplementWord cs - 'i' : ':' : cs -> toEnum 0x012b : mkLatinASupplementWord cs - 'o' : ':' : cs -> toEnum 0x014d : mkLatinASupplementWord cs - 'u' : ':' : cs -> toEnum 0x016b : mkLatinASupplementWord cs - - -- Default - c : cs -> c : mkLatinASupplementWord cs - -spoolMarkup :: String -> String -spoolMarkup s = case s of - [] -> [] -- Shouldn't happen - '>' : cs -> '>' : mkLatinASupplementWord cs - c1 : cs -> c1 : spoolMarkup cs diff --git a/src/GF/Text/OCSCyrillic.hs b/src/GF/Text/OCSCyrillic.hs deleted file mode 100644 index 0d4696944..000000000 --- a/src/GF/Text/OCSCyrillic.hs +++ /dev/null @@ -1,47 +0,0 @@ ----------------------------------------------------------------------- --- | --- Maintainer : (Maintainer) --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:39 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.6 $ --- --- (Description of the module) ------------------------------------------------------------------------------ - -module GF.Text.OCSCyrillic (mkOCSCyrillic) where - -mkOCSCyrillic :: String -> String -mkOCSCyrillic = mkOCSCyrillicWord - -mkOCSCyrillicWord :: String -> String -mkOCSCyrillicWord str = case str of - [] -> [] - ' ' : cs -> ' ' : mkOCSCyrillicWord cs - '<' : cs -> '<' : spoolMarkup cs - '\228' : cs -> toEnum 0x0463 : mkOCSCyrillicWord cs -- ä - 'j' : 'e' : '~' : cs -> toEnum 0x0469 : mkOCSCyrillicWord cs - 'j' : 'o' : '~' : cs -> toEnum 0x046d : mkOCSCyrillicWord cs - 'j' : 'e' : cs -> toEnum 0x0465 : mkOCSCyrillicWord cs - 'e' : '~' : cs -> toEnum 0x0467 : mkOCSCyrillicWord cs - 'o' : '~' : cs -> toEnum 0x046b : mkOCSCyrillicWord cs - 'j' : 'u' : cs -> toEnum 0x044e : mkOCSCyrillicWord cs - 'j' : 'a' : cs -> toEnum 0x044f : mkOCSCyrillicWord cs - 'u' : cs -> toEnum 0x0479 : mkOCSCyrillicWord cs - c : cs -> (mkOCSCyrillicChar c) : mkOCSCyrillicWord cs - -spoolMarkup :: String -> String -spoolMarkup s = case s of - [] -> [] -- Shouldn't happen - '>' : cs -> '>' : mkOCSCyrillicWord cs - c1 : cs -> c1 : spoolMarkup cs - -mkOCSCyrillicChar :: Char -> Char -mkOCSCyrillicChar c = case lookup c cc of Just c' -> c' ; _ -> c - where - cc = zip "abvgdeZziJklmnoprstYfxCqwWUyIE" allOCSCyrillic - -allOCSCyrillic :: String -allOCSCyrillic = (map toEnum [0x0430 .. 0x044e]) diff --git a/src/GF/Text/Russian.hs b/src/GF/Text/Russian.hs deleted file mode 100644 index c4f1bfd89..000000000 --- a/src/GF/Text/Russian.hs +++ /dev/null @@ -1,56 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : Russian --- Maintainer : (Maintainer) --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:40 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.5 $ --- --- (Description of the module) ------------------------------------------------------------------------------ - -module GF.Text.Russian (mkRussian, mkRusKOI8) where - --- | an ad hoc ASCII encoding. Delimiters: @\/_ _\/@ -mkRussian :: String -> String -mkRussian = unwords . (map mkRussianWord) . words - --- | the KOI8 encoding, incomplete. Delimiters: @\/* *\/@ -mkRusKOI8 :: String -> String -mkRusKOI8 = unwords . (map mkRussianKOI8) . words - -type RussianChar = Char - -mkRussianWord :: String -> [RussianChar] -mkRussianWord = map (mkRussianChar allRussianCodes) - -mkRussianKOI8 :: String -> [RussianChar] -mkRussianKOI8 = map (mkRussianChar allRussianKOI8) - -mkRussianChar chars c = case lookup c cc of Just c' -> c' ; _ -> c - where - cc = zip chars allRussian - -allRussianCodes :: [Char] -allRussianCodes = - -- changed to Ints to work with Haskell compilers e.g. GHC 6.5 CVS - -- which expect source files to be in UTF-8 - -- /bringert 2006-05-19 - -- "ÅåABVGDEXZIJKLMNOPRSTUFHCQW£}!*ÖYÄabvgdexzijklmnoprstufhcqw#01'öyä" - map toEnum [197,229,65,66,86,71,68,69,88,90,73,74,75,76,77,78,79,80,82,83,84,85,70,72,67,81,87,163,125,33,42,214,89,196,97,98,118,103,100,101,120,122,105,106,107,108,109,110,111,112,114,115,116,117,102,104,99,113,119,35,48,49,39,246,121,228] - -allRussianKOI8 :: [Char] -allRussianKOI8 = - -- changed to Ints to work with Haskell compilers e.g. GHC 6.5 CVS - -- which expect source files to be in UTF-8 - -- /bringert 2006-05-19 - -- "^@áâ÷çäåöúéêëìíîïðòóôõæèãþûýøùÿüàñÁÂ×ÇÄÅÖÚÉÊËÌÍÎÏÐÒÓÔÕÆÈÃÞÛÝØÙßÜÀÑ" - map toEnum [94,64,225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,242,243,244,245,230,232,227,254,251,253,248,249,255,252,224,241,193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,210,211,212,213,198,200,195,222,219,221,216,217,223,220,192,209] - -allRussian :: String -allRussian = (map toEnum (0x0401:0x0451:[0x0410 .. 0x044f])) -- Ëë in odd places - - diff --git a/src/GF/Text/Tamil.hs b/src/GF/Text/Tamil.hs deleted file mode 100644 index 8ee171acf..000000000 --- a/src/GF/Text/Tamil.hs +++ /dev/null @@ -1,77 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : Tamil --- Maintainer : (Maintainer) --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:40 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.5 $ --- --- (Description of the module) ------------------------------------------------------------------------------ - -module GF.Text.Tamil (mkTamil) where - -mkTamil :: String -> String -mkTamil = digraphWordToUnicode . adHocToDigraphWord - -adHocToDigraphWord :: String -> [(Char, Char)] -adHocToDigraphWord str = case str of - [] -> [] - '<' : cs -> ('\\', '<') : spoolMarkup cs - ' ' : cs -> ('\\', ' ') : adHocToDigraphWord cs -- skip space - --- if c1 is a vowel - -- Two of the same vowel => lengthening - c1 : c2 : cs | c1 == c2 && isVowel c1 -> (cap c1, ':') : adHocToDigraphWord cs - -- digraphed or long vowel - c1 : c2 : cs | isVowel c1 && isVowel c2 -> (cap c1, cap c2) : adHocToDigraphWord cs - c1 : cs | isVowel c1 -> (' ', cap c1) : adHocToDigraphWord cs - --- c1 isn't a vowel - c1 : c2 : c3 : cs | c2 == c3 && isVowel c2 -> (' ', c1) : (c2, ':') : adHocToDigraphWord cs - c1 : c2 : c3 : cs | isVowel c2 && isVowel c3 -> (' ', c1) : (c2, c3) : adHocToDigraphWord cs - c1 : 'a' : cs -> (' ', c1) : adHocToDigraphWord cs -- a inherent - c1 : c2 : cs | isVowel c2 -> (' ', c1) : (' ', c2) : adHocToDigraphWord cs - - c1 : cs -> (' ', c1) : (' ', '.') : adHocToDigraphWord cs -- vowelless - -isVowel x = elem x "aeiou:" -cap :: Char -> Char -cap x = case x of - 'a' -> 'A' - 'e' -> 'E' - 'i' -> 'I' - 'o' -> 'O' - 'u' -> 'U' - c -> c - -spoolMarkup :: String -> [(Char, Char)] -spoolMarkup s = case s of - -- [] -> [] -- Shouldn't happen - '>' : cs -> ('\\', '>') : adHocToDigraphWord cs - c1 : cs -> ('\\', c1) : spoolMarkup cs - -digraphWordToUnicode :: [(Char, Char)] -> String -digraphWordToUnicode = map digraphToUnicode - -digraphToUnicode :: (Char, Char) -> Char -digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2 - where - cc = zip allTamilCodes allTamil - -mkPairs :: String -> [(Char, Char)] -mkPairs str = case str of - [] -> [] - c1 : c2 : cs -> (c1, c2) : mkPairs cs - -allTamilCodes :: [(Char, Char)] -allTamilCodes = mkPairs digraphedTamil - -allTamil :: String -allTamil = (map toEnum [0x0b85 .. 0x0bfa]) - -digraphedTamil = " AA: II: UU:______ EE:AI__ OO:AU k______ G c__ j__ \241 T______ N t______ V n p______ m y r l L M v__ s S h________a: ii: uu:______ ee:ai__ oo:au .__________________ :______________________________#1#2#3#4#5#6#7#8#9^1^2^3=d=m=y=d=c==ru##" - diff --git a/src/GF/Text/Text.hs b/src/GF/Text/Text.hs deleted file mode 100644 index b55355c20..000000000 --- a/src/GF/Text/Text.hs +++ /dev/null @@ -1,149 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : Text --- Maintainer : AR --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/06/23 14:32:44 $ --- > CVS $Author: aarne $ --- > CVS $Revision: 1.10 $ --- --- elementary text postprocessing. AR 21\/11\/2001. --- --- This is very primitive indeed. The functions should work on --- token lists and not on strings. AR 5\/12\/2002 --- --- XML hack 14\/8\/2004; not in use yet ------------------------------------------------------------------------------ - -module GF.Text.Text (untokWithXML, - exceptXML, - formatAsTextLit, - formatAsCodeLit, - formatAsText, - formatAsHTML, - formatAsLatex, - formatAsCode, - performBinds, - performBindsFinnish, - unStringLit, - concatRemSpace - ) where - -import GF.Data.Operations -import Data.Char - --- | does not apply untokenizer within XML tags --- heuristic "< " --- this function is applied from top level... -untokWithXML :: (String -> String) -> String -> String -untokWithXML unt s = case s of - '<':cs@(c:_) | isAlpha c -> '<':beg ++ ">" ++ unto (drop 1 rest) where - (beg,rest) = span (/='>') cs - '<':cs -> '<':unto cs --- - [] -> [] - _ -> unt beg ++ unto rest where - (beg,rest) = span (/='<') s - where - unto = untokWithXML unt - --- | ... whereas this one is embedded on a branch -exceptXML :: (String -> String) -> String -> String -exceptXML unt s = '<':beg ++ ">" ++ unt (drop 1 rest) where - (beg,rest) = span (/='>') s - -formatAsTextLit :: String -> String -formatAsTextLit = formatAsText . unwords . map unStringLit . words ---- hope that there will be deforestation... - -formatAsCodeLit :: String -> String -formatAsCodeLit = formatAsCode . unwords . map unStringLit . words - -formatAsText,formatAsHTML,formatAsLatex :: String -> String -formatAsText = formatAsTextGen (const False) (=="&-") -formatAsHTML = formatAsTextGen (\s -> take 1 s == "<" || last s == '>') (const False) -formatAsLatex = formatAsTextGen ((=="\\") . take 1) (const False) - -formatAsTextGen :: (String -> Bool) -> (String -> Bool) -> String -> String -formatAsTextGen tag para = unwords . format . cap . words where - format ws = case ws of - w : ww | capit w -> format $ (cap ww) - w : c : ww | major c -> format $ (w ++ c) :(cap ww) - w : c : ww | minor c -> format $ (w ++ c) : ww - p : c : ww | openp p -> format $ (p ++ c) :ww - p : c : ww | spanish p -> format $ (p ++ concat (cap [c])) :ww - c : ww | para c -> "\n\n" : format ww - w : ww -> w : format ww - [] -> [] - cap (p:ww) | tag p = p : cap ww - cap ((c:cs):ww) = (toUpper c : cs) : ww - cap [] = [] - capit = (=="&|") - major = flip elem (map singleton ".!?") - minor = flip elem (map singleton ",:;)") - openp = all (flip elem "(") - spanish = all (flip elem "\161\191") - -formatAsCode :: String -> String -formatAsCode = rend 0 . words where - -- render from BNF Converter - rend i ss = case ss of - "[" :ts -> cons "[" $ rend i ts - "(" :ts -> cons "(" $ rend i ts - "{" :ts -> cons "{" $ new (i+1) $ rend (i+1) ts - "}" : ";":ts -> new (i-1) $ space "}" $ cons ";" $ new (i-1) $ rend (i-1) ts - "}" :ts -> new (i-1) $ cons "}" $ new (i-1) $ rend (i-1) ts - ";" :ts -> cons ";" $ new i $ rend i ts - t : "," :ts -> cons t $ space "," $ rend i ts - t : ")" :ts -> cons t $ cons ")" $ rend i ts - t : "]" :ts -> cons t $ cons "]" $ rend i ts - t :ts -> space t $ rend i ts - _ -> "" - cons s t = s ++ t - new i s = '\n' : replicate (2*i) ' ' ++ dropWhile isSpace s - space t s = if null s then t else t ++ " " ++ s - -performBinds :: String -> String -performBinds = performBindsOpt (\x y -> y) - - --- The function defines an effect of the former on the latter part, --- such as in vowel harmony. It is triggered by the binder token "&*" - -performBindsOpt :: (String -> String -> String) -> String -> String -performBindsOpt harm = unwords . format . words where - format ws = case ws of - w : "&+" : u : ws -> format ((w ++ u) : ws) - w : "&*" : u : ws -> format ((w ++ harm w u) : ws) - w : ws -> w : format ws - [] -> [] - --- unlexer for Finnish particles --- Notice: left associativity crucial for "tie &* ko &* han" --> "tieköhän" - -performBindsFinnish :: String -> String -performBindsFinnish = performBindsOpt vowelHarmony where - vowelHarmony w p = if any (flip elem "aouAOU") w then p else map toFront p - toFront c = case c of - 'A' -> '\196' - 'O' -> '\214' - 'a' -> '\228' - 'o' -> '\246' - _ -> c - -unStringLit :: String -> String -unStringLit s = case s of - c : cs | strlim c && strlim (last cs) -> init cs - _ -> s - where - strlim = (=='\'') - -concatRemSpace :: String -> String -concatRemSpace = concat . words -{- -concatRemSpace s = case s of - '<':cs -> exceptXML concatRemSpace cs - c : cs | isSpace c -> concatRemSpace cs - c :cs -> c : concatRemSpace cs - _ -> s --} diff --git a/src/GF/Text/Thai.hs b/src/GF/Text/Thai.hs deleted file mode 100644 index 1b186cb3a..000000000 --- a/src/GF/Text/Thai.hs +++ /dev/null @@ -1,368 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : Thai --- Maintainer : (Maintainer) --- Stability : (experimental) --- Portability : (portable) --- --- --- Thai transliteration and other alphabet information. ------------------------------------------------------------------------------ - --- AR 27/12/2006. Execute test2 to see the transliteration table. - -module GF.Text.Thai ( - mkThai,mkThaiWord,mkThaiPron,mkThaiFake,thaiFile,thaiPronFile,thaiFakeFile - ) where - -import qualified Data.Map as Map -import Data.Char - --- for testing -import GF.Text.UTF8 -import Data.List - -import Debug.Trace - - -mkThai :: String -> String -mkThai = concat . map mkThaiWord . words -mkThaiPron = unwords . map mkPronSyllable . words -mkThaiFake = unwords . map (fakeEnglish . mkPronSyllable) . words - - -type ThaiChar = Char - -mkThaiWord :: String -> [ThaiChar] -mkThaiWord = map (toEnum . mkThaiChar) . unchar . snd . pronAndOrth - -mkThaiChar :: String -> Int -mkThaiChar c = maybe 0 id $ Map.lookup c thaiMap - -thaiMap :: Map.Map String Int -thaiMap = Map.fromList $ zip allThaiTrans allThaiCodes - --- convert all string literals in a text - -thaiStrings :: String -> String -thaiStrings = convStrings mkThai - -thaiPronStrings :: String -> String -thaiPronStrings = convStrings mkThaiPron - -convStrings conv s = case s of - '"':cs -> let (t,_:r) = span (/='"') cs in - '"': conv t ++ "\"" ++ convStrings conv r - c:cs -> c : convStrings conv cs - _ -> s - - --- each character is either [letter] or [letter+nonletter] - -unchar :: String -> [String] -unchar s = case s of - c:d:cs - | isAlpha d -> [c] : unchar (d:cs) - | d == '?' -> unchar cs -- use "o?" to represent implicit 'o' - | otherwise -> [c,d] : unchar cs - [_] -> [s] - _ -> [] - --- you can prefix transliteration by irregular phonology in [] - -pronAndOrth :: String -> (Maybe String, String) -pronAndOrth s = case s of - '[':cs -> case span (/=']') cs of - (p,_:o) -> (Just p,o) - _ -> (Nothing,s) - _ -> (Nothing,s) - -allThaiTrans :: [String] -allThaiTrans = words $ - "- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t' " ++ - "t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f' " ++ - "p3 m y r - l - w s- s. s h l' O h' - " ++ - "a. a a: a+ i i: v v: u u: - - - - - - " ++ - "e e' o: a% a& L R S T1 T2 T3 T4 K - - - " ++ - "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - - " - -allThaiCodes :: [Int] -allThaiCodes = [0x0e00 .. 0x0e7f] - - ---------------------- --- heuristic pronunciation of codes ---------------------- - --- fake English for TTS, a la Teach Yourself Thai - -fakeEnglish :: String -> String -fakeEnglish s = case s of - 'a':'a':cs -> "ah" ++ fakeEnglish cs - 'a':'y':cs -> "ai" ++ fakeEnglish cs - 'a' :cs -> "ah" ++ fakeEnglish cs - 'c':'h':cs -> "ch" ++ fakeEnglish cs - 'c' :cs -> "j" ++ fakeEnglish cs - 'e':'e':cs -> "aih" ++ fakeEnglish cs - 'g' :cs -> "ng" ++ fakeEnglish cs - 'i':'i':cs -> "ee" ++ fakeEnglish cs - 'k':'h':cs -> "k" ++ fakeEnglish cs - 'k' :cs -> "g" ++ fakeEnglish cs - 'O':'O':cs -> "or" ++ fakeEnglish cs - 'O' :cs -> "or" ++ fakeEnglish cs - 'o':'o':cs -> "or" ++ fakeEnglish cs - 'p':'h':cs -> "p" ++ fakeEnglish cs - 'p' :cs -> "b" ++ fakeEnglish cs - 't':'h':cs -> "t" ++ fakeEnglish cs - 't' :cs -> "d" ++ fakeEnglish cs - 'u':'u':cs -> "oo" ++ fakeEnglish cs - 'u' :cs -> "oo" ++ fakeEnglish cs - 'v':'v':cs -> "eu" ++ fakeEnglish cs - 'v' :cs -> "eu" ++ fakeEnglish cs - '\228':'\228':cs -> "air" ++ fakeEnglish cs - '\228' :cs -> "a" ++ fakeEnglish cs - '\246':'\246':cs -> "er" ++ fakeEnglish cs - '\246' :cs -> "er" ++ fakeEnglish cs - c:cs | isTone c -> fakeEnglish cs - c:cs -> c : fakeEnglish cs - _ -> s - where - isTone = flip elem "'`^~" - - --- this works for one syllable - -mkPronSyllable s = case fst $ pronAndOrth s of - Just p -> p - _ -> pronSyllable $ getSyllable $ map mkThaiChar $ unchar s - -data Syllable = Syll { - initv :: [Int], - initc :: [Int], - midv :: [Int], - finalc :: [Int], - finalv :: [Int], - tone :: [Int], - shorten :: Bool, - kill :: Bool - } - deriving Show - -data Tone = TMid | TLow | THigh | TRise | TFall - deriving Show - -data CClass = CLow | CMid | CHigh - deriving Show - -pronSyllable :: Syllable -> String -pronSyllable s = - initCons ++ tonem ++ vowel ++ finalCons - where - - vowel = case (initv s, midv s, finalv s, finalc s, shorten s, tone s) of - ([0x0e40],[0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:y - ([0x0e40],[0x0e2d,0x0e35],_,_,_,_) -> "va" -- e-i:O - ([0x0e40],[0x0e30,0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:ya. - ([0x0e40],[0x0e30,0x0e2d],_,_,_,_) -> "\246" -- e-Oa. - ([0x0e40],[0x0e30,0x0e32],_,_,_,_) -> "O" -- e-a:a. -- open o - ([0x0e40],[0x0e2d],_,_,_,_) -> "\246\246" -- e-O - ([0x0e40],[0x0e34],_,_,_,_) -> "\246\246" -- e-i - ([0x0e40],[0x0e30],_,_,_,_) -> "e" -- e-a. - ([0x0e40],[0x0e32],_,_,_,_) -> "aw" -- e-a: - ([0x0e40],[],[],[0x0e22],_,_) -> "\246\246y" -- e-y - ([0x0e40],[],[],_,True,_) -> "e" - - ([0x0e41],[0x0e30],_,_,_,_) -> "\228" -- ä-a. - ([0x0e41],[],[],_,True,_) -> "\228" - - ([0x0e42],[0x0e30],_,_,_,_) -> "o" -- o:-a. - - ([],[0x0e2d],_,[0x0e22],_,_) -> "OOy" -- Oy - ([],[0x0e2d],_,_,_,_) -> "OO" -- O - - ([],[],[],_,_,_) -> "o" - - (i,m,f,_,_,_) -> concatMap pronThaiChar (reverse $ f ++ m ++ i) ---- - - initCons = concatMap pronThaiChar $ case (reverse $ initc s) of - 0x0e2b:cs@(_:_) -> cs -- high h - 0x0e2d:cs@(_:_) -> cs -- O - cs -> cs - - finalCons = - let (c,cs) = splitAt 1 $ finalc s - in - case c of - [] -> [] - [0x0e22] -> [] --- y - [k] -> concatMap pronThaiChar (reverse cs) ++ finalThai k - - iclass = case take 1 (reverse $ initc s) of - [c] -> classThai c - [] -> CMid -- O - - isLong = not (shorten s) && case vowel of - _:_:_ -> True ---- - _ -> False - - isLive = case finalCons of - c | elem c ["n","m","g"] -> True - "" -> isLong - _ -> False - - tonem = case (iclass,isLive,isLong,tone s) of - (_,_,_, [0x0e4a]) -> tHigh - (_,_,_, [0x0e4b]) -> tRise - (CLow,_,_,[0x0e49]) -> tRise - (_,_,_, [0x0e49]) -> tFall - (CLow,_,_,[0x0e48]) -> tFall - (_, _,_,[0x0e48]) -> tLow - (CHigh,True,_,_) -> tRise - (_, True,_,_) -> tMid - (CLow,False,False,_) -> tHigh - (CLow,False,_,_) -> tFall - _ -> tLow - -(tMid,tHigh,tLow,tRise,tFall) = ("-","'","`","~","^") - -isVowel c = 0x0e30 <= c && c <= 0x0e44 ---- -isCons c = 0x0e01 <= c && c <= 0x0e2f ---- -isTone c = 0x0e48 <= c && c <= 0x0e4b - -getSyllable :: [Int] -> Syllable -getSyllable = foldl get (Syll [] [] [] [] [] [] False False) where - get syll c = case c of - 0x0e47 -> syll {shorten = True} - 0x0e4c -> syll {kill = True, finalc = tail (finalc syll)} --- always last - 0x0e2d - | null (initc syll) -> syll {initc = [c]} -- "O" - | otherwise -> syll {midv = c : midv syll} - _ - | isVowel c -> if null (initc syll) - then syll {initv = c : initv syll} - else syll {midv = c : midv syll} - | isCons c -> if null (initc syll) || - (null (midv syll) && isCluster (initc syll) c) - then syll {initc = c : initc syll} - else syll {finalc = c : finalc syll} - | isTone c -> syll {tone = [c]} - _ -> syll ---- check this - - isCluster s c = length s == 1 && (c == 0x0e23 || s == [0x0e2b]) - --- to test - -test1 = testThai "k2wa:mrak" -test2 = putStrLn $ thaiTable -test3 = do - writeFile "thai.txt" "Thai Character Coding in GF\nAR 2007\n" - appendFile "thai.txt" thaiTable -test4 = do - writeFile "alphthai.txt" "Thai Characters by Pronunciation\nAR 2007\n" - appendFile "alphthai.txt" thaiTableAlph - - -testThai :: String -> IO () -testThai s = do - putStrLn $ encodeUTF8 $ mkThai s - putStrLn $ unwords $ map mkPronSyllable $ words s - -testSyllable s = - let y = getSyllable $ map mkThaiChar $ unchar s - in - putStrLn $ pronSyllable $ trace (show y) y - -thaiFile :: FilePath -> Maybe FilePath -> IO () -thaiFile f mo = do - s <- readFile f - let put = maybe putStr writeFile mo - put $ encodeUTF8 $ thaiStrings s - -thaiPronFile :: FilePath -> Maybe FilePath -> IO () -thaiPronFile f mo = do - s <- readFile f - let put = maybe putStr writeFile mo - put $ encodeUTF8 $ thaiPronStrings s - -thaiFakeFile :: FilePath -> Maybe FilePath -> IO () -thaiFakeFile f mo = do - s <- readFile f - let put = maybe putStr writeFile mo - put $ encodeUTF8 $ (convStrings mkThaiFake) s - -finalThai c = maybe "" return (Map.lookup c thaiFinalMap) -thaiFinalMap = Map.fromList $ zip allThaiCodes finals - -classThai c = maybe CLow readClass (Map.lookup c thaiClassMap) -thaiClassMap = Map.fromList $ zip allThaiCodes heights - -readClass s = case s of - 'L' -> CLow - 'M' -> CMid - 'H' -> CHigh - - -thaiTable :: String -thaiTable = unlines $ ("\n|| hex | thai | trans | pron | fin | class |" ) : [ - "| " ++ - hex c ++ " | " ++ - encodeUTF8 (showThai s) ++ " | " ++ - s ++ " | " ++ - pronThai s ++ " | " ++ - [f] ++ " | " ++ - [q] ++ " | " - | - (c,q,f,s) <- zip4 allThaiCodes heights finals allThaiTrans - ] - -thaiTableAlph :: String -thaiTableAlph = unlines $ ("\n|| pron | thai | trans |" ) : [ - "| " ++ a ++ - " | " ++ unwords (map (encodeUTF8 . showThai) ss) ++ - " | " ++ unwords ss ++ - " |" - | - (a,ss) <- allProns - ] - where - prons = sort $ nub - [p | s <- allThaiTrans, let p = pronThai s, not (null p),isAlpha (head p)] - allProns = - [(a,[s | s <- allThaiTrans, pronThai s == a]) | a <- prons] - -showThai s = case s of - "-" -> "-" ---- v:_ | elem v "ivu" -> map (toEnum . mkThaiChar) ["O",s] - _ -> [toEnum $ mkThaiChar s] - - -pronThaiChar = pronThai . recodeThai - -recodeThai c = allThaiTrans !! (c - 0x0e00) - -pronThai s = case s of - [c,p] - | c == 'N' && isDigit p -> [p] - | c == 'T' && isDigit p -> ['\'',p] - | isDigit p -> c:"h" - | p==':' -> c:[c] - | elem p "%&" -> c:"y" - | p=='+' -> c:"m" - | s == "e'" -> "\228\228" - | otherwise -> [c] - "O" -> "O" - "e" -> "ee" - [c] | isUpper c -> "" - _ -> s - -hex = map hx . reverse . digs where - digs 0 = [0] - digs n = n `mod` 16 : digs (n `div` 16) - hx d = "0123456789ABCDEF" !! d - -heights :: String -finals :: String -heights = - " MHHLLLLMHLLLLMMHLLLMMHLLLMMHHLLLLLL-L-LHHHHLML" ++ replicate 99 ' ' -finals = - " kkkkkkgt-tt-ntttttntttttnpp--pppmyn-n-wttt-n--" ++ replicate 99 ' ' diff --git a/src/GF/Text/UTF8.hs b/src/GF/Text/UTF8.hs deleted file mode 100644 index 5e9687684..000000000 --- a/src/GF/Text/UTF8.hs +++ /dev/null @@ -1,48 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : UTF8 --- Maintainer : AR --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:42 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.5 $ --- --- From the Char module supplied with HBC. --- code by Thomas Hallgren (Jul 10 1999) ------------------------------------------------------------------------------ - -module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where - --- | Take a Unicode string and encode it as a string --- with the UTF8 method. -decodeUTF8 :: String -> String -decodeUTF8 "" = "" -decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs -decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' && - '\x80' <= c' && c' <= '\xbf' = - toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs -decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' && - '\x80' <= c' && c' <= '\xbf' && - '\x80' <= c'' && c'' <= '\xbf' = - toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs -decodeUTF8 s = s ---- AR workaround 22/6/2006 -----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data" - -encodeUTF8 :: String -> String -encodeUTF8 "" = "" -encodeUTF8 (c:cs) = - if c > '\x0000' && c < '\x0080' then - c : encodeUTF8 cs - else if c < toEnum 0x0800 then - let i = fromEnum c - in toEnum (0xc0 + i `div` 0x40) : - toEnum (0x80 + i `mod` 0x40) : - encodeUTF8 cs - else - let i = fromEnum c - in toEnum (0xe0 + i `div` 0x1000) : - toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) : - toEnum (0x80 + i `mod` 0x40) : - encodeUTF8 cs diff --git a/src/GF/Text/Unicode.hs b/src/GF/Text/Unicode.hs deleted file mode 100644 index 9d0b9d1a8..000000000 --- a/src/GF/Text/Unicode.hs +++ /dev/null @@ -1,69 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : Unicode --- Maintainer : AR --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:42 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.12 $ --- --- ad hoc Unicode conversions from different alphabets. --- AR 12\/4\/2000, 18\/9\/2001, 30\/5\/2002, 26\/1\/2004 ------------------------------------------------------------------------------ - -module GF.Text.Unicode (mkUnicode, treat) where - -import GF.Text.Greek (mkGreek) -import GF.Text.Arabic (mkArabic) -import GF.Text.Hebrew (mkHebrew) -import GF.Text.Russian (mkRussian, mkRusKOI8) -import GF.Text.Ethiopic (mkEthiopic) -import GF.Text.Tamil (mkTamil) -import GF.Text.OCSCyrillic (mkOCSCyrillic) -import GF.Text.LatinASupplement (mkLatinASupplement) -import GF.Text.Devanagari (mkDevanagari) -import GF.Text.Hiragana (mkJapanese) -import GF.Text.ExtendedArabic (mkArabic0600) -import GF.Text.ExtendedArabic (mkExtendedArabic) -import GF.Text.ExtraDiacritics (mkExtraDiacritics) - -import Data.Char - -mkUnicode :: String -> String -mkUnicode s = case s of - '/':'/':cs -> treat [] mkGreek unic ++ mkUnicode rest - '/':'+':cs -> mkHebrew unic ++ mkUnicode rest - '/':'-':cs -> mkArabic unic ++ mkUnicode rest - '/':'_':cs -> treat [] mkRussian unic ++ mkUnicode rest - '/':'*':cs -> mkRusKOI8 unic ++ mkUnicode rest - '/':'E':cs -> mkEthiopic unic ++ mkUnicode rest - '/':'T':cs -> mkTamil unic ++ mkUnicode rest - '/':'C':cs -> mkOCSCyrillic unic ++ mkUnicode rest - '/':'&':cs -> mkDevanagari unic ++ mkUnicode rest - '/':'L':cs -> mkLatinASupplement unic ++ mkUnicode rest - '/':'J':cs -> mkJapanese unic ++ mkUnicode rest - '/':'6':cs -> mkArabic0600 unic ++ mkUnicode rest - '/':'A':cs -> mkExtendedArabic unic ++ mkUnicode rest - '/':'X':cs -> mkExtraDiacritics unic ++ mkUnicode rest - c:cs -> c:mkUnicode cs - _ -> s - where - (unic,rest) = remClosing [] $ dropWhile isSpace $ drop 2 s - remClosing u s = case s of - c:'/':s | elem c "/+-_*ETC&LJ6AX" -> (reverse u, s) --- end need not match - c:cs -> remClosing (c:u) cs - _ -> (reverse u,[]) -- forgiving missing end - --- | don't convert XML tags --- assumes \<\> always means XML tags -treat :: String -> (String -> String) -> String -> String -treat old mk s = case s of - '<':cs -> mk (reverse old) ++ '<':noTreat cs - c:cs -> treat (c:old) mk cs - _ -> mk (reverse old) - where - noTreat s = case s of - '>':cs -> '>' : treat [] mk cs - c:cs -> c : noTreat cs - _ -> s -- cgit v1.2.3