diff options
| author | aarne <aarne@cs.chalmers.se> | 2008-05-21 09:26:44 +0000 |
|---|---|---|
| committer | aarne <aarne@cs.chalmers.se> | 2008-05-21 09:26:44 +0000 |
| commit | 055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch) | |
| tree | 0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/Text | |
| parent | 915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff) | |
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
Diffstat (limited to 'src-3.0/GF/Text')
| -rw-r--r-- | src-3.0/GF/Text/Arabic.hs | 63 | ||||
| -rw-r--r-- | src-3.0/GF/Text/Devanagari.hs | 97 | ||||
| -rw-r--r-- | src-3.0/GF/Text/Ethiopic.hs | 72 | ||||
| -rw-r--r-- | src-3.0/GF/Text/ExtendedArabic.hs | 99 | ||||
| -rw-r--r-- | src-3.0/GF/Text/ExtraDiacritics.hs | 37 | ||||
| -rw-r--r-- | src-3.0/GF/Text/Greek.hs | 172 | ||||
| -rw-r--r-- | src-3.0/GF/Text/Hebrew.hs | 53 | ||||
| -rw-r--r-- | src-3.0/GF/Text/Hiragana.hs | 95 | ||||
| -rw-r--r-- | src-3.0/GF/Text/LatinASupplement.hs | 69 | ||||
| -rw-r--r-- | src-3.0/GF/Text/OCSCyrillic.hs | 47 | ||||
| -rw-r--r-- | src-3.0/GF/Text/Russian.hs | 56 | ||||
| -rw-r--r-- | src-3.0/GF/Text/Tamil.hs | 77 | ||||
| -rw-r--r-- | src-3.0/GF/Text/Text.hs | 149 | ||||
| -rw-r--r-- | src-3.0/GF/Text/Thai.hs | 368 | ||||
| -rw-r--r-- | src-3.0/GF/Text/UTF8.hs | 48 | ||||
| -rw-r--r-- | src-3.0/GF/Text/Unicode.hs | 69 |
16 files changed, 1571 insertions, 0 deletions
diff --git a/src-3.0/GF/Text/Arabic.hs b/src-3.0/GF/Text/Arabic.hs new file mode 100644 index 000000000..c482b1172 --- /dev/null +++ b/src-3.0/GF/Text/Arabic.hs @@ -0,0 +1,63 @@ +---------------------------------------------------------------------- +-- | +-- Module : Arabic +-- Maintainer : (Maintainer) +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:34 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.6 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.Text.Arabic (mkArabic) where + +mkArabic :: String -> String +mkArabic = unwords . (map mkArabicWord) . words +----mkArabic = reverse . unwords . (map mkArabicWord) . words +--- reverse : assumes everything's on same line + +type ArabicChar = Char + +mkArabicWord :: String -> [ArabicChar] +mkArabicWord = map mkArabicChar . getLetterPos + +getLetterPos :: String -> [(Char,Int)] +getLetterPos [] = [] +getLetterPos ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80 +getLetterPos ('O':cs) = ('*',8) : getIn cs -- 0xfe8b +getLetterPos ('l':'a':cs) = ('*',5) : getLetterPos cs -- 0xfefb +getLetterPos [c] = [(c,1)] -- 1=isolated +getLetterPos (c:cs) | isReduced c = (c,1) : getLetterPos cs +getLetterPos (c:cs) = (c,3) : getIn cs -- 3=initial + + +getIn [] = [] +getIn ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80 +getIn ('O':cs) = ('*',9) : getIn cs -- 0xfe8c +getIn ('l':'a':cs) = ('*',6) : getLetterPos cs -- 0xfefc +getIn [c] = [(c,2)] -- 2=final +getIn (c:cs) | isReduced c = (c,2) : getLetterPos cs +getIn (c:cs) = (c,4) : getIn cs -- 4=medial + +isReduced :: Char -> Bool +isReduced c = c `elem` "UuWiYOaAdVrzwj" + +mkArabicChar ('*',p) | p > 4 && p < 10 = + (map toEnum [0xfefb,0xfefc,0xfe80,0xfe8b,0xfe8c]) !! (p-5) +mkArabicChar cp@(c,p) = case lookup c cc of Just c' -> (c' !! (p-1)) ; _ -> c + where + cc = mkArabicTab allArabicCodes allArabic + +mkArabicTab (c:cs) as = (c,as1) : mkArabicTab cs as2 where + (as1,as2) = if isReduced c then splitAt 2 as else splitAt 4 as +mkArabicTab [] _ = [] + +allArabicCodes = "UuWiYOabAtvgHCdVrzscSDTZoxfqklmnhwjy" + +allArabic :: String +allArabic = (map toEnum [0xfe81 .. 0xfef4]) -- I=0xfe80 + + diff --git a/src-3.0/GF/Text/Devanagari.hs b/src-3.0/GF/Text/Devanagari.hs new file mode 100644 index 000000000..bf4343cd0 --- /dev/null +++ b/src-3.0/GF/Text/Devanagari.hs @@ -0,0 +1,97 @@ +---------------------------------------------------------------------- +-- | +-- Module : Devanagari +-- Maintainer : (Maintainer) +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:34 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.5 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.Text.Devanagari (mkDevanagari) where + +mkDevanagari :: String -> String +mkDevanagari = digraphWordToUnicode . adHocToDigraphWord + +adHocToDigraphWord :: String -> [(Char, Char)] +adHocToDigraphWord str = case str of + [] -> [] + '<' : cs -> ('\\', '<') : spoolMarkup cs + ' ' : cs -> ('\\', ' ') : adHocToDigraphWord cs -- skip space + +-- if c1 is a vowel + -- Two of the same vowel => lengthening + c1 : c2 : cs | c1 == c2 && isVowel c1 -> (cap c1, ':') : adHocToDigraphWord cs + -- digraphed or long vowel + c1 : c2 : cs | isVowel c1 && isVowel c2 -> (cap c1, cap c2) : adHocToDigraphWord cs + c1 : cs | isVowel c1 -> (' ', cap c1) : adHocToDigraphWord cs + +-- c1 isn't a vowel + -- c1 : 'a' : [] -> [(' ', c1)] -- a inherent + -- c1 : c2 : [] | isVowel c2 -> (' ', c1) : [(' ', c2)] + + -- c1 is aspirated + c1 : 'H' : c2 : c3 : cs | c2 == c3 && isVowel c2 -> + (c1, 'H') : (c2, ':') : adHocToDigraphWord cs + c1 : 'H' : c2 : c3 : cs | isVowel c2 && isVowel c3 -> + (c1, 'H') : (c2, c3) : adHocToDigraphWord cs + c1 : 'H' : 'a' : cs -> (c1, 'H') : adHocToDigraphWord cs -- a inherent + c1 : 'H' : c2 : cs | isVowel c2 -> (c1, 'H') : (' ', c2) : adHocToDigraphWord cs + -- not vowelless at EOW + c1 : 'H' : ' ' : cs -> (c1, 'H') : ('\\', ' ') : adHocToDigraphWord cs + c1 : 'H' : [] -> [(c1, 'H')] + c1 : 'H' : cs -> (c1, 'H') : (' ', '^') : adHocToDigraphWord cs -- vowelless + + -- c1 unasp. + c1 : c2 : c3 : cs | c2 == c3 && isVowel c2 -> (' ', c1) : (c2, ':') : adHocToDigraphWord cs + c1 : c2 : c3 : cs | isVowel c2 && isVowel c3 -> (' ', c1) : (c2, c3) : adHocToDigraphWord cs + c1 : 'a' : cs -> (' ', c1) : adHocToDigraphWord cs -- a inherent + c1 : c2 : cs | isVowel c2 -> (' ', c1) : (' ', c2) : adHocToDigraphWord cs + -- not vowelless at EOW + c1 : ' ' : cs -> (' ', c1) : ('\\', ' '): adHocToDigraphWord cs + c1 : [] -> [(' ', c1)] + 'M' : cs -> (' ', 'M') : adHocToDigraphWord cs -- vowelless but no vowelless sign for anusvara + c1 : cs -> (' ', c1) : (' ', '^') : adHocToDigraphWord cs -- vowelless + +isVowel x = elem x "aeiou:" +cap :: Char -> Char +cap x = case x of + 'a' -> 'A' + 'e' -> 'E' + 'i' -> 'I' + 'o' -> 'O' + 'u' -> 'U' + c -> c + +spoolMarkup :: String -> [(Char, Char)] +spoolMarkup s = case s of + -- [] -> [] -- Shouldn't happen + '>' : cs -> ('\\', '>') : adHocToDigraphWord cs + c1 : cs -> ('\\', c1) : spoolMarkup cs + + +digraphWordToUnicode :: [(Char, Char)] -> String +digraphWordToUnicode = map digraphToUnicode + +digraphToUnicode :: (Char, Char) -> Char +digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2 + where + cc = zip allDevanagariCodes allDevanagari + +digraphedDevanagari = " ~ M ;__ AA: II: UU:RoLoEvE~ EE:AvA~ OAU kkH ggHNG ccH jjH \241 TTH DDH N ttH ddH nn. ppH bbH m y rr. l LL. v \231 S s h____ .-Sa: ii: uu:ror:eve~ eaiava~ oau ^____OM | -dddu______ Q X G zD.RH fy.R:L:mrmR#I#d#0#1#2#3#4#5#6#7#8#9#o" + +allDevanagariCodes :: [(Char, Char)] +allDevanagariCodes = mkPairs digraphedDevanagari + +allDevanagari :: String +allDevanagari = (map toEnum [0x0901 .. 0x0970]) + +mkPairs :: String -> [(Char, Char)] +mkPairs str = case str of + [] -> [] + c1 : c2 : cs -> (c1, c2) : mkPairs cs + diff --git a/src-3.0/GF/Text/Ethiopic.hs b/src-3.0/GF/Text/Ethiopic.hs new file mode 100644 index 000000000..81abbf719 --- /dev/null +++ b/src-3.0/GF/Text/Ethiopic.hs @@ -0,0 +1,72 @@ +---------------------------------------------------------------------- +-- | +-- Module : Ethiopic +-- Maintainer : HH +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:35 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.6 $ +-- +-- Ascii-Unicode decoding for Ethiopian. +-- Copyright (c) Harald Hammarström 2003 under Gnu General Public License +----------------------------------------------------------------------------- + +module GF.Text.Ethiopic (mkEthiopic) where + +mkEthiopic :: String -> String +mkEthiopic = digraphWordToUnicode . adHocToDigraphWord + +-- mkEthiopic :: String -> String +-- mkEthiopic = reverse . unwords . (map (digraphWordToUnicode . adHocToDigraphWord)) . words +--- reverse : assumes everything's on same line + +adHocToDigraphWord :: String -> [(Char, Int)] +adHocToDigraphWord str = case str of + [] -> [] + '<' : cs -> ('<', -1) : spoolMarkup cs + c1 : cs | isVowel c1 -> (')', vowelOrder c1) : adHocToDigraphWord cs + -- c1 isn't a vowel + c1 : cs | not (elem c1 allEthiopicCodes) -> (c1, -1) : adHocToDigraphWord cs + c1 : c2 : cs | isVowel c2 -> (c1, vowelOrder c2) : adHocToDigraphWord cs + c1 : cs -> (c1, 5) : adHocToDigraphWord cs + +spoolMarkup :: String -> [(Char, Int)] +spoolMarkup s = case s of + -- [] -> [] -- Shouldn't happen + '>' : cs -> ('>', -1) : adHocToDigraphWord cs + c1 : cs -> (c1, -1) : spoolMarkup cs + +isVowel x = elem x "A\228ui\239aeoI" + +vowelOrder :: Char -> Int +vowelOrder x = case x of + 'A' -> 0 + '\228' -> 0 -- ä + 'u' -> 1 + 'i' -> 2 + 'a' -> 3 + 'e' -> 4 + 'I' -> 5 + '\239' -> 5 -- ï + 'o' -> 6 + c -> 5 -- vowelless + +digraphWordToUnicode = map digraphToUnicode + +digraphToUnicode :: (Char, Int) -> Char +-- digraphToUnicode (c1, c2) = c1 + +digraphToUnicode (c1, -1) = c1 +digraphToUnicode (c1, c2) = toEnum (0x1200 + c2 + 8*case lookup c1 cc of Just c' -> c') + where + cc = zip allEthiopicCodes allEthiopic + +allEthiopic :: [Int] +allEthiopic = [0 .. 44] -- x 8 + +allEthiopicCodes = "hlHmLrs$KQ__bBtcxXnN)kW__w(zZyd_jgG_TCPSLfp" + +-- Q = kW, X = xW, W = kW, G = gW + diff --git a/src-3.0/GF/Text/ExtendedArabic.hs b/src-3.0/GF/Text/ExtendedArabic.hs new file mode 100644 index 000000000..d2c5faac5 --- /dev/null +++ b/src-3.0/GF/Text/ExtendedArabic.hs @@ -0,0 +1,99 @@ +---------------------------------------------------------------------- +-- | +-- Module : ExtendedArabic +-- Maintainer : (Maintainer) +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:36 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.6 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.Text.ExtendedArabic (mkArabic0600, mkExtendedArabic) where + +mkArabic0600 :: String -> String +mkArabic0600 = digraphWordToUnicode . aarnesToDigraphWord + +aarnesToDigraphWord :: String -> [(Char, Char)] +aarnesToDigraphWord str = case str of + [] -> [] + '<' : cs -> ('\\', '<') : spoolMarkup2 cs + + 'v' : cs -> ('T', 'H') : aarnesToDigraphWord cs + 'a' : cs -> (' ', 'A') : aarnesToDigraphWord cs + 'o' : cs -> (' ', '3') : aarnesToDigraphWord cs + 'O' : cs -> ('\'', 'i') : aarnesToDigraphWord cs + + 'u' : cs -> ('\'', 'A') : aarnesToDigraphWord cs + 'C' : cs -> (' ', 'X') : aarnesToDigraphWord cs + + 'U' : cs -> ('~', 'A') : aarnesToDigraphWord cs + 'A' : cs -> ('"', 't') : aarnesToDigraphWord cs + 'c' : cs -> ('s', 'h') : aarnesToDigraphWord cs + c : cs -> (' ', c) : aarnesToDigraphWord cs + +mkExtendedArabic :: String -> String +mkExtendedArabic = digraphWordToUnicode . adHocToDigraphWord + +adHocToDigraphWord :: String -> [(Char, Char)] +adHocToDigraphWord str = case str of + [] -> [] + '<' : cs -> ('\\', '<') : spoolMarkup cs + -- Sorani + 'W' : cs -> (':', 'w') : adHocToDigraphWord cs -- ?? Will do + 'E' : cs -> (' ', 'i') : adHocToDigraphWord cs -- ?? Letter missing! + 'j' : cs -> ('d', 'j') : adHocToDigraphWord cs + 'O' : cs -> ('v', 'w') : adHocToDigraphWord cs + 'F' : cs -> (' ', 'v') : adHocToDigraphWord cs + 'Z' : cs -> ('z', 'h') : adHocToDigraphWord cs + 'I' : cs -> (' ', 'i') : adHocToDigraphWord cs -- ?? Letter missing! + 'C' : cs -> ('c', 'h') : adHocToDigraphWord cs + -- Pashto + 'e' : cs -> (':', 'y') : adHocToDigraphWord cs + '$' : cs -> ('3', 'H') : adHocToDigraphWord cs + 'X' : cs -> ('s', '.') : adHocToDigraphWord cs + 'G' : cs -> ('z', '.') : adHocToDigraphWord cs + 'a' : cs -> (' ', 'A') : adHocToDigraphWord cs + 'P' : cs -> ('\'', 'H') : adHocToDigraphWord cs + 'R' : cs -> ('o', 'r') : adHocToDigraphWord cs + -- Shared + 'A' : cs -> (' ', 'h') : adHocToDigraphWord cs -- ?? Maybe to "t or 0x06d5 + 'c' : cs -> ('s', 'h') : adHocToDigraphWord cs + c : cs -> (' ', c) : adHocToDigraphWord cs + + +-- Beginning 0x621 up and including 0x06d1 +digraphedExtendedArabic = " '~A'A'w,A'i A b\"t tTHdj H X dDH r z ssh S D T Z 3GH__________ - f q k l m n h w i y&a&w&i/a/w/i/W/o/~/'/,/|/6/v_____________#0#1#2#3#4#5#6#7#8#9#%#,#'#*>b>q$|> A2'2,3'A'w'w&y'Tb:b:BoT3b p4b4B'H:H2H\"H3Hch4HTdod.dTD:d:D3d3D4dTrvror.rvRz.:rzh4zs.+s*S:S3S3T33>ff.f: v4f.q3q-k~kok.k3k3K gog:g:G3Gvl.l3l3L:n>nTnon3n?h4H't>Y\"Yow-wvwww|w^w:w3w>y/yvy.w:y3y____ -ae" + +digraphWordToUnicode = map digraphToUnicode + +digraphToUnicode :: (Char, Char) -> Char +digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2 + where + cc = zip allExtendedArabicCodes allExtendedArabic + +allExtendedArabicCodes :: [(Char, Char)] +allExtendedArabicCodes = mkPairs digraphedExtendedArabic + +allExtendedArabic :: String +allExtendedArabic = (map toEnum [0x0621 .. 0x06d1]) + +mkPairs :: String -> [(Char, Char)] +mkPairs str = case str of + [] -> [] + c1 : c2 : cs -> (c1, c2) : mkPairs cs + +spoolMarkup :: String -> [(Char, Char)] +spoolMarkup s = case s of + [] -> [] -- Shouldn't happen + '>' : cs -> ('\\', '>') : adHocToDigraphWord cs + c1 : cs -> ('\\', c1) : spoolMarkup cs + +spoolMarkup2 :: String -> [(Char, Char)] +spoolMarkup2 s = case s of + [] -> [] -- Shouldn't happen + '>' : cs -> ('\\', '>') : aarnesToDigraphWord cs + c1 : cs -> ('\\', c1) : spoolMarkup2 cs diff --git a/src-3.0/GF/Text/ExtraDiacritics.hs b/src-3.0/GF/Text/ExtraDiacritics.hs new file mode 100644 index 000000000..f3d811c2c --- /dev/null +++ b/src-3.0/GF/Text/ExtraDiacritics.hs @@ -0,0 +1,37 @@ +---------------------------------------------------------------------- +-- | +-- Module : ExtraDiacritics +-- Maintainer : (Maintainer) +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:36 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.5 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.Text.ExtraDiacritics (mkExtraDiacritics) where + +mkExtraDiacritics :: String -> String +mkExtraDiacritics = mkExtraDiacriticsWord + +mkExtraDiacriticsWord :: String -> String +mkExtraDiacriticsWord str = case str of + [] -> [] + '<' : cs -> '<' : spoolMarkup cs + -- + '/' : cs -> toEnum 0x0301 : mkExtraDiacriticsWord cs + '~' : cs -> toEnum 0x0306 : mkExtraDiacriticsWord cs + ':' : cs -> toEnum 0x0304 : mkExtraDiacriticsWord cs -- some of these could be put in LatinA + '.' : cs -> toEnum 0x0323 : mkExtraDiacriticsWord cs + 'i' : '-' : cs -> toEnum 0x0268 : mkExtraDiacriticsWord cs -- in IPA extensions + -- Default + c : cs -> c : mkExtraDiacriticsWord cs + +spoolMarkup :: String -> String +spoolMarkup s = case s of + [] -> [] -- Shouldn't happen + '>' : cs -> '>' : mkExtraDiacriticsWord cs + c1 : cs -> c1 : spoolMarkup cs diff --git a/src-3.0/GF/Text/Greek.hs b/src-3.0/GF/Text/Greek.hs new file mode 100644 index 000000000..6b9361a29 --- /dev/null +++ b/src-3.0/GF/Text/Greek.hs @@ -0,0 +1,172 @@ +---------------------------------------------------------------------- +-- | +-- Module : Greek +-- Maintainer : (Maintainer) +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:37 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.5 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.Text.Greek (mkGreek) where + +mkGreek :: String -> String +mkGreek = unwords . (map mkGreekWord) . mkGravis . words + +--- TODO : optimize character formation by factorizing the case expressions + +type GreekChar = Char + +mkGreekWord :: String -> [GreekChar] +mkGreekWord = map (toEnum . mkGreekChar) . mkGreekSpec + +mkGravis :: [String] -> [String] +mkGravis [] = [] +mkGravis [w] = [w] +mkGravis (w1:w2:ws) + | stressed w2 = mkG w1 : mkGravis (w2:ws) + | otherwise = w1 : w2 : mkGravis ws + where + stressed w = any (`elem` "'~`") w + mkG :: String -> String + mkG w = let (w1,w2) = span (/='\'') w in + case w2 of + '\'':v:cs | not (any isVowel cs) -> w1 ++ "`" ++ [v] ++ cs + '\'':'!':v:cs | not (any isVowel cs) -> w1 ++ "`!" ++ [v] ++ cs + _ -> w + isVowel c = elem c "aehiouw" + +mkGreekSpec :: String -> [(Char,Int)] +mkGreekSpec str = case str of + [] -> [] + '(' :'\'': '!' : c : cs -> (c,25) : mkGreekSpec cs + '(' :'~' : '!' : c : cs -> (c,27) : mkGreekSpec cs + '(' :'`' : '!' : c : cs -> (c,23) : mkGreekSpec cs + '(' : '!' : c : cs -> (c,21) : mkGreekSpec cs + ')' :'\'': '!' : c : cs -> (c,24) : mkGreekSpec cs + ')' :'~' : '!' : c : cs -> (c,26) : mkGreekSpec cs + ')' :'`' : '!' : c : cs -> (c,22) : mkGreekSpec cs + ')' : '!' : c : cs -> (c,20) : mkGreekSpec cs + '\'': '!' : c : cs -> (c,30) : mkGreekSpec cs + '~' : '!' : c : cs -> (c,31) : mkGreekSpec cs + '`' : '!' : c : cs -> (c,32) : mkGreekSpec cs + '!' : c : cs -> (c,33) : mkGreekSpec cs + '(' :'\'': c : cs -> (c,5) : mkGreekSpec cs + '(' :'~' : c : cs -> (c,7) : mkGreekSpec cs + '(' :'`' : c : cs -> (c,3) : mkGreekSpec cs + '(' : c : cs -> (c,1) : mkGreekSpec cs + ')' :'\'': c : cs -> (c,4) : mkGreekSpec cs + ')' :'~' : c : cs -> (c,6) : mkGreekSpec cs + ')' :'`' : c : cs -> (c,2) : mkGreekSpec cs + ')' : c : cs -> (c,0) : mkGreekSpec cs + '\'': c : cs -> (c,10) : mkGreekSpec cs + '~' : c : cs -> (c,11) : mkGreekSpec cs + '`' : c : cs -> (c,12) : mkGreekSpec cs + c : cs -> (c,-1) : mkGreekSpec cs + +mkGreekChar (c,-1) = case lookup c cc of Just c' -> c' ; _ -> fromEnum c + where + cc = zip "abgdezhqiklmnxoprjstyfcuw" allGreekMin +mkGreekChar (c,n) = case (c,n) of + ('a',10) -> 0x03ac + ('a',11) -> 0x1fb6 + ('a',12) -> 0x1f70 + ('a',30) -> 0x1fb4 + ('a',31) -> 0x1fb7 + ('a',32) -> 0x1fb2 + ('a',33) -> 0x1fb3 + ('a',n) | n >19 -> 0x1f80 + n - 20 + ('a',n) -> 0x1f00 + n + ('e',10) -> 0x03ad -- ' +-- ('e',11) -> 0x1fb6 -- ~ can't happen + ('e',12) -> 0x1f72 -- ` + ('e',n) -> 0x1f10 + n + ('h',10) -> 0x03ae -- ' + ('h',11) -> 0x1fc6 -- ~ + ('h',12) -> 0x1f74 -- ` + + ('h',30) -> 0x1fc4 + ('h',31) -> 0x1fc7 + ('h',32) -> 0x1fc2 + ('h',33) -> 0x1fc3 + ('h',n) | n >19 -> 0x1f90 + n - 20 + + ('h',n) -> 0x1f20 + n + ('i',10) -> 0x03af -- ' + ('i',11) -> 0x1fd6 -- ~ + ('i',12) -> 0x1f76 -- ` + ('i',n) -> 0x1f30 + n + ('o',10) -> 0x03cc -- ' +-- ('o',11) -> 0x1fb6 -- ~ can't happen + ('o',12) -> 0x1f78 -- ` + ('o',n) -> 0x1f40 + n + ('y',10) -> 0x03cd -- ' + ('y',11) -> 0x1fe6 -- ~ + ('y',12) -> 0x1f7a -- ` + ('y',n) -> 0x1f50 + n + ('w',10) -> 0x03ce -- ' + ('w',11) -> 0x1ff6 -- ~ + ('w',12) -> 0x1f7c -- ` + + ('w',30) -> 0x1ff4 + ('w',31) -> 0x1ff7 + ('w',32) -> 0x1ff2 + ('w',33) -> 0x1ff3 + ('w',n) | n >19 -> 0x1fa0 + n - 20 + + ('w',n) -> 0x1f60 + n + ('r',1) -> 0x1fe5 + _ -> mkGreekChar (c,-1) --- should not happen + +allGreekMin :: [Int] +allGreekMin = [0x03b1 .. 0x03c9] + + +{- +encoding of Greek writing. Those hard to guess are marked with --- + + maj min +A a Alpha 0391 03b1 +B b Beta 0392 03b2 +G g Gamma 0393 03b3 +D d Delta 0394 03b4 +E e Epsilon 0395 03b5 +Z z Zeta 0396 03b6 +H h Eta --- 0397 03b7 +Q q Theta --- 0398 03b8 +I i Iota 0399 03b9 +K k Kappa 039a 03ba +L l Lambda 039b 03bb +M m My 039c 03bc +N n Ny 039d 03bd +X x Xi 039e 03be +O o Omikron 039f 03bf +P p Pi 03a0 03c0 +R r Rho 03a1 03c1 + j Sigma --- 03c2 +S s Sigma 03a3 03c3 +T t Tau 03a4 03c4 +Y y Ypsilon 03a5 03c5 +F f Phi 03a6 03c6 +C c Khi --- 03a7 03c7 +U u Psi 03a8 03c8 +W w Omega --- 03a9 03c9 + +( spiritus asper +) spiritus lenis +! iota subscriptum + +' acutus +~ circumflexus +` gravis + +-} + + + + + diff --git a/src-3.0/GF/Text/Hebrew.hs b/src-3.0/GF/Text/Hebrew.hs new file mode 100644 index 000000000..c7026d8da --- /dev/null +++ b/src-3.0/GF/Text/Hebrew.hs @@ -0,0 +1,53 @@ +---------------------------------------------------------------------- +-- | +-- Module : Hebrew +-- Maintainer : (Maintainer) +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:37 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.8 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.Text.Hebrew (mkHebrew) where + +mkHebrew :: String -> String +mkHebrew = mkHebrewWord +----mkHebrew = reverse . mkHebrewWord +--- reverse : assumes everything's on same line + +type HebrewChar = Char + +-- HH 031103 added code for spooling the markup +-- removed reverse, words, unwords +-- (seemed obsolete and come out wrong on the screen) +-- AR 26/1/2004 put reverse back - needed in Fudgets (but not in Java?) + +mkHebrewWord :: String -> [HebrewChar] +-- mkHebrewWord = map mkHebrewChar + +mkHebrewWord s = case s of + [] -> [] + '<' : cs -> '<' : spoolMarkup cs + ' ' : cs -> ' ' : mkHebrewWord cs + c1 : cs -> mkHebrewChar c1 : mkHebrewWord cs + +spoolMarkup :: String -> String +spoolMarkup s = case s of + [] -> [] -- Shouldn't happen + '>' : cs -> '>' : mkHebrewWord cs + c1 : cs -> c1 : spoolMarkup cs + +mkHebrewChar c = case lookup c cc of Just c' -> c' ; _ -> c + where + cc = zip allHebrewCodes allHebrew + +allHebrewCodes = "-abgdhwzHTyKklMmNnSoPpCcqrst" + +allHebrew :: String +allHebrew = (map toEnum (0x05be : [0x05d0 .. 0x05ea])) + + diff --git a/src-3.0/GF/Text/Hiragana.hs b/src-3.0/GF/Text/Hiragana.hs new file mode 100644 index 000000000..ba74fc83c --- /dev/null +++ b/src-3.0/GF/Text/Hiragana.hs @@ -0,0 +1,95 @@ +---------------------------------------------------------------------- +-- | +-- Module : Hiragana +-- Maintainer : (Maintainer) +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:38 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.5 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.Text.Hiragana (mkJapanese) where + +-- long vowel romaaji must be ei, ou not ee, oo + +mkJapanese :: String -> String +mkJapanese = digraphWordToUnicode . romaajiToDigraphWord + +romaajiToDigraphWord :: String -> [(Char, Char)] +romaajiToDigraphWord str = case str of + [] -> [] + '<' : cs -> ('\\', '<') : spoolMarkup cs + ' ' : cs -> ('\\', ' ') : romaajiToDigraphWord cs + + c1 : cs | isVowel c1 -> (' ', cap c1) : romaajiToDigraphWord cs + + -- The combinations + c1 : 'y' : c2 : cs -> (c1, 'i') : ('y', cap c2) : romaajiToDigraphWord cs + + 's' : 'h' : 'a' : cs -> ('S', 'i') : ('y', 'A') : romaajiToDigraphWord cs + 'c' : 'h' : 'a' : cs -> ('C', 'i') : ('y', 'A') : romaajiToDigraphWord cs + 'j' : 'a' : cs -> ('j', 'i') : ('y', 'A') : romaajiToDigraphWord cs + + 's' : 'h' : 'u' : cs -> ('S', 'i') : ('y', 'U') : romaajiToDigraphWord cs + 'c' : 'h' : 'u' : cs -> ('C', 'i') : ('y', 'U') : romaajiToDigraphWord cs + 'j' : 'u' : cs -> ('j', 'i') : ('y', 'U') : romaajiToDigraphWord cs + + 's' : 'h' : 'o' : cs -> ('S', 'i') : ('y', 'O') : romaajiToDigraphWord cs + 'c' : 'h' : 'o' : cs -> ('C', 'i') : ('y', 'O') : romaajiToDigraphWord cs + 'j' : 'o' : cs -> ('j', 'i') : ('y', 'O') : romaajiToDigraphWord cs + + 'd' : 'z' : c3 : cs -> ('D', c3) : romaajiToDigraphWord cs + 't' : 's' : c3 : cs -> ('T', c3) : romaajiToDigraphWord cs + 'c' : 'h' : c3 : cs -> ('C', c3) : romaajiToDigraphWord cs + 's' : 'h' : c3 : cs -> ('S', c3) : romaajiToDigraphWord cs + 'z' : 'h' : c3 : cs -> ('Z', c3) : romaajiToDigraphWord cs + + c1 : ' ' : cs -> (' ', c1) : ('\\', ' ') : romaajiToDigraphWord cs -- n + c1 : [] -> [(' ', c1)] -- n + + c1 : c2 : cs | isVowel c2 -> (c1, c2) : romaajiToDigraphWord cs + c1 : c2 : cs | c1 == c2 -> ('T', 'U') : romaajiToDigraphWord (c2 : cs) -- double cons + c1 : cs -> (' ', c1) : romaajiToDigraphWord cs -- n + +isVowel x = elem x "aeiou" +cap :: Char -> Char +cap x = case x of + 'a' -> 'A' + 'e' -> 'E' + 'i' -> 'I' + 'o' -> 'O' + 'u' -> 'U' + c -> c + +spoolMarkup :: String -> [(Char, Char)] +spoolMarkup s = case s of + -- [] -> [] -- Shouldn't happen + '>' : cs -> ('\\', '>') : romaajiToDigraphWord cs + c1 : cs -> ('\\', c1) : spoolMarkup cs + +digraphWordToUnicode :: [(Char, Char)] -> String +digraphWordToUnicode = map digraphToUnicode + +digraphToUnicode :: (Char, Char) -> Char +digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2 + where + cc = zip allHiraganaCodes allHiragana + +allHiraganaCodes :: [(Char, Char)] +allHiraganaCodes = mkPairs digraphedHiragana + +allHiragana :: String +allHiragana = (map toEnum [0x3041 .. 0x309f]) + +mkPairs :: String -> [(Char, Char)] +mkPairs str = case str of + [] -> [] + c1 : c2 : cs -> (c1, c2) : mkPairs cs + +digraphedHiragana = " a A i I u U e E o OkagakigikugukegekogosazaSiZisuzusezesozotadaCijiTUTuDutedetodonaninunenohabapahibipihubupuhebepehobopomamimumemoyAyayUyuyOyorarirurerowaWawiwewo nvukAkE____<< o>>o >'> b" + + diff --git a/src-3.0/GF/Text/LatinASupplement.hs b/src-3.0/GF/Text/LatinASupplement.hs new file mode 100644 index 000000000..f42423c91 --- /dev/null +++ b/src-3.0/GF/Text/LatinASupplement.hs @@ -0,0 +1,69 @@ +---------------------------------------------------------------------- +-- | +-- Module : LatinASupplement +-- Maintainer : (Maintainer) +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:39 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.5 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.Text.LatinASupplement (mkLatinASupplement) where + +mkLatinASupplement :: String -> String +mkLatinASupplement = mkLatinASupplementWord + +mkLatinASupplementWord :: String -> String +mkLatinASupplementWord str = case str of + [] -> [] + '<' : cs -> '<' : spoolMarkup cs + -- Romanian & partly Turkish + 's' : ',' : cs -> toEnum 0x015f : mkLatinASupplementWord cs + 'a' : '%' : cs -> toEnum 0x0103 : mkLatinASupplementWord cs + -- Slavic and more + 'c' : '^' : cs -> toEnum 0x010d : mkLatinASupplementWord cs + 's' : '^' : cs -> toEnum 0x0161 : mkLatinASupplementWord cs + 'c' : '\'' : cs -> toEnum 0x0107 : mkLatinASupplementWord cs + 'z' : '^' : cs -> toEnum 0x017e : mkLatinASupplementWord cs + -- Turkish + 'g' : '%' : cs -> toEnum 0x011f : mkLatinASupplementWord cs + 'I' : cs -> toEnum 0x0131 : mkLatinASupplementWord cs + 'c' : ',' : cs -> toEnum 0x00e7 : mkLatinASupplementWord cs + -- Polish + 'e' : ',' : cs -> toEnum 0x0119 : mkLatinASupplementWord cs + 'a' : ',' : cs -> toEnum 0x0105 : mkLatinASupplementWord cs + 'l' : '/' : cs -> toEnum 0x0142 : mkLatinASupplementWord cs + 'z' : '.' : cs -> toEnum 0x017c : mkLatinASupplementWord cs + 'n' : '\'' : cs -> toEnum 0x0144 : mkLatinASupplementWord cs + 's' : '\'' : cs -> toEnum 0x015b : mkLatinASupplementWord cs +-- 'c' : '\'' : cs -> toEnum 0x0107 : mkLatinASupplementWord cs + + -- Hungarian + 'o' : '%' : cs -> toEnum 0x0151 : mkLatinASupplementWord cs + 'u' : '%' : cs -> toEnum 0x0171 : mkLatinASupplementWord cs + + -- Mongolian + 'j' : '^' : cs -> toEnum 0x0135 : mkLatinASupplementWord cs + + -- Khowar (actually in Combining diacritical marks not Latin-A Suppl.) + 'o' : '.' : cs -> 'o' : (toEnum 0x0323 : mkLatinASupplementWord cs) + + -- Length bars over vowels e.g korean + 'a' : ':' : cs -> toEnum 0x0101 : mkLatinASupplementWord cs + 'e' : ':' : cs -> toEnum 0x0113 : mkLatinASupplementWord cs + 'i' : ':' : cs -> toEnum 0x012b : mkLatinASupplementWord cs + 'o' : ':' : cs -> toEnum 0x014d : mkLatinASupplementWord cs + 'u' : ':' : cs -> toEnum 0x016b : mkLatinASupplementWord cs + + -- Default + c : cs -> c : mkLatinASupplementWord cs + +spoolMarkup :: String -> String +spoolMarkup s = case s of + [] -> [] -- Shouldn't happen + '>' : cs -> '>' : mkLatinASupplementWord cs + c1 : cs -> c1 : spoolMarkup cs diff --git a/src-3.0/GF/Text/OCSCyrillic.hs b/src-3.0/GF/Text/OCSCyrillic.hs new file mode 100644 index 000000000..0d4696944 --- /dev/null +++ b/src-3.0/GF/Text/OCSCyrillic.hs @@ -0,0 +1,47 @@ +---------------------------------------------------------------------- +-- | +-- Maintainer : (Maintainer) +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:39 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.6 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.Text.OCSCyrillic (mkOCSCyrillic) where + +mkOCSCyrillic :: String -> String +mkOCSCyrillic = mkOCSCyrillicWord + +mkOCSCyrillicWord :: String -> String +mkOCSCyrillicWord str = case str of + [] -> [] + ' ' : cs -> ' ' : mkOCSCyrillicWord cs + '<' : cs -> '<' : spoolMarkup cs + '\228' : cs -> toEnum 0x0463 : mkOCSCyrillicWord cs -- ä + 'j' : 'e' : '~' : cs -> toEnum 0x0469 : mkOCSCyrillicWord cs + 'j' : 'o' : '~' : cs -> toEnum 0x046d : mkOCSCyrillicWord cs + 'j' : 'e' : cs -> toEnum 0x0465 : mkOCSCyrillicWord cs + 'e' : '~' : cs -> toEnum 0x0467 : mkOCSCyrillicWord cs + 'o' : '~' : cs -> toEnum 0x046b : mkOCSCyrillicWord cs + 'j' : 'u' : cs -> toEnum 0x044e : mkOCSCyrillicWord cs + 'j' : 'a' : cs -> toEnum 0x044f : mkOCSCyrillicWord cs + 'u' : cs -> toEnum 0x0479 : mkOCSCyrillicWord cs + c : cs -> (mkOCSCyrillicChar c) : mkOCSCyrillicWord cs + +spoolMarkup :: String -> String +spoolMarkup s = case s of + [] -> [] -- Shouldn't happen + '>' : cs -> '>' : mkOCSCyrillicWord cs + c1 : cs -> c1 : spoolMarkup cs + +mkOCSCyrillicChar :: Char -> Char +mkOCSCyrillicChar c = case lookup c cc of Just c' -> c' ; _ -> c + where + cc = zip "abvgdeZziJklmnoprstYfxCqwWUyIE" allOCSCyrillic + +allOCSCyrillic :: String +allOCSCyrillic = (map toEnum [0x0430 .. 0x044e]) diff --git a/src-3.0/GF/Text/Russian.hs b/src-3.0/GF/Text/Russian.hs new file mode 100644 index 000000000..c4f1bfd89 --- /dev/null +++ b/src-3.0/GF/Text/Russian.hs @@ -0,0 +1,56 @@ +---------------------------------------------------------------------- +-- | +-- Module : Russian +-- Maintainer : (Maintainer) +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:40 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.5 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.Text.Russian (mkRussian, mkRusKOI8) where + +-- | an ad hoc ASCII encoding. Delimiters: @\/_ _\/@ +mkRussian :: String -> String +mkRussian = unwords . (map mkRussianWord) . words + +-- | the KOI8 encoding, incomplete. Delimiters: @\/* *\/@ +mkRusKOI8 :: String -> String +mkRusKOI8 = unwords . (map mkRussianKOI8) . words + +type RussianChar = Char + +mkRussianWord :: String -> [RussianChar] +mkRussianWord = map (mkRussianChar allRussianCodes) + +mkRussianKOI8 :: String -> [RussianChar] +mkRussianKOI8 = map (mkRussianChar allRussianKOI8) + +mkRussianChar chars c = case lookup c cc of Just c' -> c' ; _ -> c + where + cc = zip chars allRussian + +allRussianCodes :: [Char] +allRussianCodes = + -- changed to Ints to work with Haskell compilers e.g. GHC 6.5 CVS + -- which expect source files to be in UTF-8 + -- /bringert 2006-05-19 + -- "ÅåABVGDEXZIJKLMNOPRSTUFHCQW£}!*ÖYÄabvgdexzijklmnoprstufhcqw#01'öyä" + map toEnum [197,229,65,66,86,71,68,69,88,90,73,74,75,76,77,78,79,80,82,83,84,85,70,72,67,81,87,163,125,33,42,214,89,196,97,98,118,103,100,101,120,122,105,106,107,108,109,110,111,112,114,115,116,117,102,104,99,113,119,35,48,49,39,246,121,228] + +allRussianKOI8 :: [Char] +allRussianKOI8 = + -- changed to Ints to work with Haskell compilers e.g. GHC 6.5 CVS + -- which expect source files to be in UTF-8 + -- /bringert 2006-05-19 + -- "^@áâ÷çäåöúéêëìíîïðòóôõæèãþûýøùÿüàñÁÂ×ÇÄÅÖÚÉÊËÌÍÎÏÐÒÓÔÕÆÈÃÞÛÝØÙßÜÀÑ" + map toEnum [94,64,225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,242,243,244,245,230,232,227,254,251,253,248,249,255,252,224,241,193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,210,211,212,213,198,200,195,222,219,221,216,217,223,220,192,209] + +allRussian :: String +allRussian = (map toEnum (0x0401:0x0451:[0x0410 .. 0x044f])) -- Ëë in odd places + + diff --git a/src-3.0/GF/Text/Tamil.hs b/src-3.0/GF/Text/Tamil.hs new file mode 100644 index 000000000..8ee171acf --- /dev/null +++ b/src-3.0/GF/Text/Tamil.hs @@ -0,0 +1,77 @@ +---------------------------------------------------------------------- +-- | +-- Module : Tamil +-- Maintainer : (Maintainer) +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:40 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.5 $ +-- +-- (Description of the module) +----------------------------------------------------------------------------- + +module GF.Text.Tamil (mkTamil) where + +mkTamil :: String -> String +mkTamil = digraphWordToUnicode . adHocToDigraphWord + +adHocToDigraphWord :: String -> [(Char, Char)] +adHocToDigraphWord str = case str of + [] -> [] + '<' : cs -> ('\\', '<') : spoolMarkup cs + ' ' : cs -> ('\\', ' ') : adHocToDigraphWord cs -- skip space + +-- if c1 is a vowel + -- Two of the same vowel => lengthening + c1 : c2 : cs | c1 == c2 && isVowel c1 -> (cap c1, ':') : adHocToDigraphWord cs + -- digraphed or long vowel + c1 : c2 : cs | isVowel c1 && isVowel c2 -> (cap c1, cap c2) : adHocToDigraphWord cs + c1 : cs | isVowel c1 -> (' ', cap c1) : adHocToDigraphWord cs + +-- c1 isn't a vowel + c1 : c2 : c3 : cs | c2 == c3 && isVowel c2 -> (' ', c1) : (c2, ':') : adHocToDigraphWord cs + c1 : c2 : c3 : cs | isVowel c2 && isVowel c3 -> (' ', c1) : (c2, c3) : adHocToDigraphWord cs + c1 : 'a' : cs -> (' ', c1) : adHocToDigraphWord cs -- a inherent + c1 : c2 : cs | isVowel c2 -> (' ', c1) : (' ', c2) : adHocToDigraphWord cs + + c1 : cs -> (' ', c1) : (' ', '.') : adHocToDigraphWord cs -- vowelless + +isVowel x = elem x "aeiou:" +cap :: Char -> Char +cap x = case x of + 'a' -> 'A' + 'e' -> 'E' + 'i' -> 'I' + 'o' -> 'O' + 'u' -> 'U' + c -> c + +spoolMarkup :: String -> [(Char, Char)] +spoolMarkup s = case s of + -- [] -> [] -- Shouldn't happen + '>' : cs -> ('\\', '>') : adHocToDigraphWord cs + c1 : cs -> ('\\', c1) : spoolMarkup cs + +digraphWordToUnicode :: [(Char, Char)] -> String +digraphWordToUnicode = map digraphToUnicode + +digraphToUnicode :: (Char, Char) -> Char +digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2 + where + cc = zip allTamilCodes allTamil + +mkPairs :: String -> [(Char, Char)] +mkPairs str = case str of + [] -> [] + c1 : c2 : cs -> (c1, c2) : mkPairs cs + +allTamilCodes :: [(Char, Char)] +allTamilCodes = mkPairs digraphedTamil + +allTamil :: String +allTamil = (map toEnum [0x0b85 .. 0x0bfa]) + +digraphedTamil = " AA: II: UU:______ EE:AI__ OO:AU k______ G c__ j__ \241 T______ N t______ V n p______ m y r l L M v__ s S h________a: ii: uu:______ ee:ai__ oo:au .__________________ :______________________________#1#2#3#4#5#6#7#8#9^1^2^3=d=m=y=d=c==ru##" + diff --git a/src-3.0/GF/Text/Text.hs b/src-3.0/GF/Text/Text.hs new file mode 100644 index 000000000..b55355c20 --- /dev/null +++ b/src-3.0/GF/Text/Text.hs @@ -0,0 +1,149 @@ +---------------------------------------------------------------------- +-- | +-- Module : Text +-- Maintainer : AR +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/06/23 14:32:44 $ +-- > CVS $Author: aarne $ +-- > CVS $Revision: 1.10 $ +-- +-- elementary text postprocessing. AR 21\/11\/2001. +-- +-- This is very primitive indeed. The functions should work on +-- token lists and not on strings. AR 5\/12\/2002 +-- +-- XML hack 14\/8\/2004; not in use yet +----------------------------------------------------------------------------- + +module GF.Text.Text (untokWithXML, + exceptXML, + formatAsTextLit, + formatAsCodeLit, + formatAsText, + formatAsHTML, + formatAsLatex, + formatAsCode, + performBinds, + performBindsFinnish, + unStringLit, + concatRemSpace + ) where + +import GF.Data.Operations +import Data.Char + +-- | does not apply untokenizer within XML tags --- heuristic "< " +-- this function is applied from top level... +untokWithXML :: (String -> String) -> String -> String +untokWithXML unt s = case s of + '<':cs@(c:_) | isAlpha c -> '<':beg ++ ">" ++ unto (drop 1 rest) where + (beg,rest) = span (/='>') cs + '<':cs -> '<':unto cs --- + [] -> [] + _ -> unt beg ++ unto rest where + (beg,rest) = span (/='<') s + where + unto = untokWithXML unt + +-- | ... whereas this one is embedded on a branch +exceptXML :: (String -> String) -> String -> String +exceptXML unt s = '<':beg ++ ">" ++ unt (drop 1 rest) where + (beg,rest) = span (/='>') s + +formatAsTextLit :: String -> String +formatAsTextLit = formatAsText . unwords . map unStringLit . words +--- hope that there will be deforestation... + +formatAsCodeLit :: String -> String +formatAsCodeLit = formatAsCode . unwords . map unStringLit . words + +formatAsText,formatAsHTML,formatAsLatex :: String -> String +formatAsText = formatAsTextGen (const False) (=="&-") +formatAsHTML = formatAsTextGen (\s -> take 1 s == "<" || last s == '>') (const False) +formatAsLatex = formatAsTextGen ((=="\\") . take 1) (const False) + +formatAsTextGen :: (String -> Bool) -> (String -> Bool) -> String -> String +formatAsTextGen tag para = unwords . format . cap . words where + format ws = case ws of + w : ww | capit w -> format $ (cap ww) + w : c : ww | major c -> format $ (w ++ c) :(cap ww) + w : c : ww | minor c -> format $ (w ++ c) : ww + p : c : ww | openp p -> format $ (p ++ c) :ww + p : c : ww | spanish p -> format $ (p ++ concat (cap [c])) :ww + c : ww | para c -> "\n\n" : format ww + w : ww -> w : format ww + [] -> [] + cap (p:ww) | tag p = p : cap ww + cap ((c:cs):ww) = (toUpper c : cs) : ww + cap [] = [] + capit = (=="&|") + major = flip elem (map singleton ".!?") + minor = flip elem (map singleton ",:;)") + openp = all (flip elem "(") + spanish = all (flip elem "\161\191") + +formatAsCode :: String -> String +formatAsCode = rend 0 . words where + -- render from BNF Converter + rend i ss = case ss of + "[" :ts -> cons "[" $ rend i ts + "(" :ts -> cons "(" $ rend i ts + "{" :ts -> cons "{" $ new (i+1) $ rend (i+1) ts + "}" : ";":ts -> new (i-1) $ space "}" $ cons ";" $ new (i-1) $ rend (i-1) ts + "}" :ts -> new (i-1) $ cons "}" $ new (i-1) $ rend (i-1) ts + ";" :ts -> cons ";" $ new i $ rend i ts + t : "," :ts -> cons t $ space "," $ rend i ts + t : ")" :ts -> cons t $ cons ")" $ rend i ts + t : "]" :ts -> cons t $ cons "]" $ rend i ts + t :ts -> space t $ rend i ts + _ -> "" + cons s t = s ++ t + new i s = '\n' : replicate (2*i) ' ' ++ dropWhile isSpace s + space t s = if null s then t else t ++ " " ++ s + +performBinds :: String -> String +performBinds = performBindsOpt (\x y -> y) + + +-- The function defines an effect of the former on the latter part, +-- such as in vowel harmony. It is triggered by the binder token "&*" + +performBindsOpt :: (String -> String -> String) -> String -> String +performBindsOpt harm = unwords . format . words where + format ws = case ws of + w : "&+" : u : ws -> format ((w ++ u) : ws) + w : "&*" : u : ws -> format ((w ++ harm w u) : ws) + w : ws -> w : format ws + [] -> [] + +-- unlexer for Finnish particles +-- Notice: left associativity crucial for "tie &* ko &* han" --> "tieköhän" + +performBindsFinnish :: String -> String +performBindsFinnish = performBindsOpt vowelHarmony where + vowelHarmony w p = if any (flip elem "aouAOU") w then p else map toFront p + toFront c = case c of + 'A' -> '\196' + 'O' -> '\214' + 'a' -> '\228' + 'o' -> '\246' + _ -> c + +unStringLit :: String -> String +unStringLit s = case s of + c : cs | strlim c && strlim (last cs) -> init cs + _ -> s + where + strlim = (=='\'') + +concatRemSpace :: String -> String +concatRemSpace = concat . words +{- +concatRemSpace s = case s of + '<':cs -> exceptXML concatRemSpace cs + c : cs | isSpace c -> concatRemSpace cs + c :cs -> c : concatRemSpace cs + _ -> s +-} diff --git a/src-3.0/GF/Text/Thai.hs b/src-3.0/GF/Text/Thai.hs new file mode 100644 index 000000000..1b186cb3a --- /dev/null +++ b/src-3.0/GF/Text/Thai.hs @@ -0,0 +1,368 @@ +---------------------------------------------------------------------- +-- | +-- Module : Thai +-- Maintainer : (Maintainer) +-- Stability : (experimental) +-- Portability : (portable) +-- +-- +-- Thai transliteration and other alphabet information. +----------------------------------------------------------------------------- + +-- AR 27/12/2006. Execute test2 to see the transliteration table. + +module GF.Text.Thai ( + mkThai,mkThaiWord,mkThaiPron,mkThaiFake,thaiFile,thaiPronFile,thaiFakeFile + ) where + +import qualified Data.Map as Map +import Data.Char + +-- for testing +import GF.Text.UTF8 +import Data.List + +import Debug.Trace + + +mkThai :: String -> String +mkThai = concat . map mkThaiWord . words +mkThaiPron = unwords . map mkPronSyllable . words +mkThaiFake = unwords . map (fakeEnglish . mkPronSyllable) . words + + +type ThaiChar = Char + +mkThaiWord :: String -> [ThaiChar] +mkThaiWord = map (toEnum . mkThaiChar) . unchar . snd . pronAndOrth + +mkThaiChar :: String -> Int +mkThaiChar c = maybe 0 id $ Map.lookup c thaiMap + +thaiMap :: Map.Map String Int +thaiMap = Map.fromList $ zip allThaiTrans allThaiCodes + +-- convert all string literals in a text + +thaiStrings :: String -> String +thaiStrings = convStrings mkThai + +thaiPronStrings :: String -> String +thaiPronStrings = convStrings mkThaiPron + +convStrings conv s = case s of + '"':cs -> let (t,_:r) = span (/='"') cs in + '"': conv t ++ "\"" ++ convStrings conv r + c:cs -> c : convStrings conv cs + _ -> s + + +-- each character is either [letter] or [letter+nonletter] + +unchar :: String -> [String] +unchar s = case s of + c:d:cs + | isAlpha d -> [c] : unchar (d:cs) + | d == '?' -> unchar cs -- use "o?" to represent implicit 'o' + | otherwise -> [c,d] : unchar cs + [_] -> [s] + _ -> [] + +-- you can prefix transliteration by irregular phonology in [] + +pronAndOrth :: String -> (Maybe String, String) +pronAndOrth s = case s of + '[':cs -> case span (/=']') cs of + (p,_:o) -> (Just p,o) + _ -> (Nothing,s) + _ -> (Nothing,s) + +allThaiTrans :: [String] +allThaiTrans = words $ + "- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t' " ++ + "t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f' " ++ + "p3 m y r - l - w s- s. s h l' O h' - " ++ + "a. a a: a+ i i: v v: u u: - - - - - - " ++ + "e e' o: a% a& L R S T1 T2 T3 T4 K - - - " ++ + "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - - " + +allThaiCodes :: [Int] +allThaiCodes = [0x0e00 .. 0x0e7f] + + +--------------------- +-- heuristic pronunciation of codes +--------------------- + +-- fake English for TTS, a la Teach Yourself Thai + +fakeEnglish :: String -> String +fakeEnglish s = case s of + 'a':'a':cs -> "ah" ++ fakeEnglish cs + 'a':'y':cs -> "ai" ++ fakeEnglish cs + 'a' :cs -> "ah" ++ fakeEnglish cs + 'c':'h':cs -> "ch" ++ fakeEnglish cs + 'c' :cs -> "j" ++ fakeEnglish cs + 'e':'e':cs -> "aih" ++ fakeEnglish cs + 'g' :cs -> "ng" ++ fakeEnglish cs + 'i':'i':cs -> "ee" ++ fakeEnglish cs + 'k':'h':cs -> "k" ++ fakeEnglish cs + 'k' :cs -> "g" ++ fakeEnglish cs + 'O':'O':cs -> "or" ++ fakeEnglish cs + 'O' :cs -> "or" ++ fakeEnglish cs + 'o':'o':cs -> "or" ++ fakeEnglish cs + 'p':'h':cs -> "p" ++ fakeEnglish cs + 'p' :cs -> "b" ++ fakeEnglish cs + 't':'h':cs -> "t" ++ fakeEnglish cs + 't' :cs -> "d" ++ fakeEnglish cs + 'u':'u':cs -> "oo" ++ fakeEnglish cs + 'u' :cs -> "oo" ++ fakeEnglish cs + 'v':'v':cs -> "eu" ++ fakeEnglish cs + 'v' :cs -> "eu" ++ fakeEnglish cs + '\228':'\228':cs -> "air" ++ fakeEnglish cs + '\228' :cs -> "a" ++ fakeEnglish cs + '\246':'\246':cs -> "er" ++ fakeEnglish cs + '\246' :cs -> "er" ++ fakeEnglish cs + c:cs | isTone c -> fakeEnglish cs + c:cs -> c : fakeEnglish cs + _ -> s + where + isTone = flip elem "'`^~" + + +-- this works for one syllable + +mkPronSyllable s = case fst $ pronAndOrth s of + Just p -> p + _ -> pronSyllable $ getSyllable $ map mkThaiChar $ unchar s + +data Syllable = Syll { + initv :: [Int], + initc :: [Int], + midv :: [Int], + finalc :: [Int], + finalv :: [Int], + tone :: [Int], + shorten :: Bool, + kill :: Bool + } + deriving Show + +data Tone = TMid | TLow | THigh | TRise | TFall + deriving Show + +data CClass = CLow | CMid | CHigh + deriving Show + +pronSyllable :: Syllable -> String +pronSyllable s = + initCons ++ tonem ++ vowel ++ finalCons + where + + vowel = case (initv s, midv s, finalv s, finalc s, shorten s, tone s) of + ([0x0e40],[0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:y + ([0x0e40],[0x0e2d,0x0e35],_,_,_,_) -> "va" -- e-i:O + ([0x0e40],[0x0e30,0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:ya. + ([0x0e40],[0x0e30,0x0e2d],_,_,_,_) -> "\246" -- e-Oa. + ([0x0e40],[0x0e30,0x0e32],_,_,_,_) -> "O" -- e-a:a. -- open o + ([0x0e40],[0x0e2d],_,_,_,_) -> "\246\246" -- e-O + ([0x0e40],[0x0e34],_,_,_,_) -> "\246\246" -- e-i + ([0x0e40],[0x0e30],_,_,_,_) -> "e" -- e-a. + ([0x0e40],[0x0e32],_,_,_,_) -> "aw" -- e-a: + ([0x0e40],[],[],[0x0e22],_,_) -> "\246\246y" -- e-y + ([0x0e40],[],[],_,True,_) -> "e" + + ([0x0e41],[0x0e30],_,_,_,_) -> "\228" -- ä-a. + ([0x0e41],[],[],_,True,_) -> "\228" + + ([0x0e42],[0x0e30],_,_,_,_) -> "o" -- o:-a. + + ([],[0x0e2d],_,[0x0e22],_,_) -> "OOy" -- Oy + ([],[0x0e2d],_,_,_,_) -> "OO" -- O + + ([],[],[],_,_,_) -> "o" + + (i,m,f,_,_,_) -> concatMap pronThaiChar (reverse $ f ++ m ++ i) ---- + + initCons = concatMap pronThaiChar $ case (reverse $ initc s) of + 0x0e2b:cs@(_:_) -> cs -- high h + 0x0e2d:cs@(_:_) -> cs -- O + cs -> cs + + finalCons = + let (c,cs) = splitAt 1 $ finalc s + in + case c of + [] -> [] + [0x0e22] -> [] --- y + [k] -> concatMap pronThaiChar (reverse cs) ++ finalThai k + + iclass = case take 1 (reverse $ initc s) of + [c] -> classThai c + [] -> CMid -- O + + isLong = not (shorten s) && case vowel of + _:_:_ -> True ---- + _ -> False + + isLive = case finalCons of + c | elem c ["n","m","g"] -> True + "" -> isLong + _ -> False + + tonem = case (iclass,isLive,isLong,tone s) of + (_,_,_, [0x0e4a]) -> tHigh + (_,_,_, [0x0e4b]) -> tRise + (CLow,_,_,[0x0e49]) -> tRise + (_,_,_, [0x0e49]) -> tFall + (CLow,_,_,[0x0e48]) -> tFall + (_, _,_,[0x0e48]) -> tLow + (CHigh,True,_,_) -> tRise + (_, True,_,_) -> tMid + (CLow,False,False,_) -> tHigh + (CLow,False,_,_) -> tFall + _ -> tLow + +(tMid,tHigh,tLow,tRise,tFall) = ("-","'","`","~","^") + +isVowel c = 0x0e30 <= c && c <= 0x0e44 ---- +isCons c = 0x0e01 <= c && c <= 0x0e2f ---- +isTone c = 0x0e48 <= c && c <= 0x0e4b + +getSyllable :: [Int] -> Syllable +getSyllable = foldl get (Syll [] [] [] [] [] [] False False) where + get syll c = case c of + 0x0e47 -> syll {shorten = True} + 0x0e4c -> syll {kill = True, finalc = tail (finalc syll)} --- always last + 0x0e2d + | null (initc syll) -> syll {initc = [c]} -- "O" + | otherwise -> syll {midv = c : midv syll} + _ + | isVowel c -> if null (initc syll) + then syll {initv = c : initv syll} + else syll {midv = c : midv syll} + | isCons c -> if null (initc syll) || + (null (midv syll) && isCluster (initc syll) c) + then syll {initc = c : initc syll} + else syll {finalc = c : finalc syll} + | isTone c -> syll {tone = [c]} + _ -> syll ---- check this + + isCluster s c = length s == 1 && (c == 0x0e23 || s == [0x0e2b]) + +-- to test + +test1 = testThai "k2wa:mrak" +test2 = putStrLn $ thaiTable +test3 = do + writeFile "thai.txt" "Thai Character Coding in GF\nAR 2007\n" + appendFile "thai.txt" thaiTable +test4 = do + writeFile "alphthai.txt" "Thai Characters by Pronunciation\nAR 2007\n" + appendFile "alphthai.txt" thaiTableAlph + + +testThai :: String -> IO () +testThai s = do + putStrLn $ encodeUTF8 $ mkThai s + putStrLn $ unwords $ map mkPronSyllable $ words s + +testSyllable s = + let y = getSyllable $ map mkThaiChar $ unchar s + in + putStrLn $ pronSyllable $ trace (show y) y + +thaiFile :: FilePath -> Maybe FilePath -> IO () +thaiFile f mo = do + s <- readFile f + let put = maybe putStr writeFile mo + put $ encodeUTF8 $ thaiStrings s + +thaiPronFile :: FilePath -> Maybe FilePath -> IO () +thaiPronFile f mo = do + s <- readFile f + let put = maybe putStr writeFile mo + put $ encodeUTF8 $ thaiPronStrings s + +thaiFakeFile :: FilePath -> Maybe FilePath -> IO () +thaiFakeFile f mo = do + s <- readFile f + let put = maybe putStr writeFile mo + put $ encodeUTF8 $ (convStrings mkThaiFake) s + +finalThai c = maybe "" return (Map.lookup c thaiFinalMap) +thaiFinalMap = Map.fromList $ zip allThaiCodes finals + +classThai c = maybe CLow readClass (Map.lookup c thaiClassMap) +thaiClassMap = Map.fromList $ zip allThaiCodes heights + +readClass s = case s of + 'L' -> CLow + 'M' -> CMid + 'H' -> CHigh + + +thaiTable :: String +thaiTable = unlines $ ("\n|| hex | thai | trans | pron | fin | class |" ) : [ + "| " ++ + hex c ++ " | " ++ + encodeUTF8 (showThai s) ++ " | " ++ + s ++ " | " ++ + pronThai s ++ " | " ++ + [f] ++ " | " ++ + [q] ++ " | " + | + (c,q,f,s) <- zip4 allThaiCodes heights finals allThaiTrans + ] + +thaiTableAlph :: String +thaiTableAlph = unlines $ ("\n|| pron | thai | trans |" ) : [ + "| " ++ a ++ + " | " ++ unwords (map (encodeUTF8 . showThai) ss) ++ + " | " ++ unwords ss ++ + " |" + | + (a,ss) <- allProns + ] + where + prons = sort $ nub + [p | s <- allThaiTrans, let p = pronThai s, not (null p),isAlpha (head p)] + allProns = + [(a,[s | s <- allThaiTrans, pronThai s == a]) | a <- prons] + +showThai s = case s of + "-" -> "-" +--- v:_ | elem v "ivu" -> map (toEnum . mkThaiChar) ["O",s] + _ -> [toEnum $ mkThaiChar s] + + +pronThaiChar = pronThai . recodeThai + +recodeThai c = allThaiTrans !! (c - 0x0e00) + +pronThai s = case s of + [c,p] + | c == 'N' && isDigit p -> [p] + | c == 'T' && isDigit p -> ['\'',p] + | isDigit p -> c:"h" + | p==':' -> c:[c] + | elem p "%&" -> c:"y" + | p=='+' -> c:"m" + | s == "e'" -> "\228\228" + | otherwise -> [c] + "O" -> "O" + "e" -> "ee" + [c] | isUpper c -> "" + _ -> s + +hex = map hx . reverse . digs where + digs 0 = [0] + digs n = n `mod` 16 : digs (n `div` 16) + hx d = "0123456789ABCDEF" !! d + +heights :: String +finals :: String +heights = + " MHHLLLLMHLLLLMMHLLLMMHLLLMMHHLLLLLL-L-LHHHHLML" ++ replicate 99 ' ' +finals = + " kkkkkkgt-tt-ntttttntttttnpp--pppmyn-n-wttt-n--" ++ replicate 99 ' ' diff --git a/src-3.0/GF/Text/UTF8.hs b/src-3.0/GF/Text/UTF8.hs new file mode 100644 index 000000000..5e9687684 --- /dev/null +++ b/src-3.0/GF/Text/UTF8.hs @@ -0,0 +1,48 @@ +---------------------------------------------------------------------- +-- | +-- Module : UTF8 +-- Maintainer : AR +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:42 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.5 $ +-- +-- From the Char module supplied with HBC. +-- code by Thomas Hallgren (Jul 10 1999) +----------------------------------------------------------------------------- + +module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where + +-- | Take a Unicode string and encode it as a string +-- with the UTF8 method. +decodeUTF8 :: String -> String +decodeUTF8 "" = "" +decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs +decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' && + '\x80' <= c' && c' <= '\xbf' = + toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs +decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' && + '\x80' <= c' && c' <= '\xbf' && + '\x80' <= c'' && c'' <= '\xbf' = + toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs +decodeUTF8 s = s ---- AR workaround 22/6/2006 +----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data" + +encodeUTF8 :: String -> String +encodeUTF8 "" = "" +encodeUTF8 (c:cs) = + if c > '\x0000' && c < '\x0080' then + c : encodeUTF8 cs + else if c < toEnum 0x0800 then + let i = fromEnum c + in toEnum (0xc0 + i `div` 0x40) : + toEnum (0x80 + i `mod` 0x40) : + encodeUTF8 cs + else + let i = fromEnum c + in toEnum (0xe0 + i `div` 0x1000) : + toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) : + toEnum (0x80 + i `mod` 0x40) : + encodeUTF8 cs diff --git a/src-3.0/GF/Text/Unicode.hs b/src-3.0/GF/Text/Unicode.hs new file mode 100644 index 000000000..9d0b9d1a8 --- /dev/null +++ b/src-3.0/GF/Text/Unicode.hs @@ -0,0 +1,69 @@ +---------------------------------------------------------------------- +-- | +-- Module : Unicode +-- Maintainer : AR +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/04/21 16:23:42 $ +-- > CVS $Author: bringert $ +-- > CVS $Revision: 1.12 $ +-- +-- ad hoc Unicode conversions from different alphabets. +-- AR 12\/4\/2000, 18\/9\/2001, 30\/5\/2002, 26\/1\/2004 +----------------------------------------------------------------------------- + +module GF.Text.Unicode (mkUnicode, treat) where + +import GF.Text.Greek (mkGreek) +import GF.Text.Arabic (mkArabic) +import GF.Text.Hebrew (mkHebrew) +import GF.Text.Russian (mkRussian, mkRusKOI8) +import GF.Text.Ethiopic (mkEthiopic) +import GF.Text.Tamil (mkTamil) +import GF.Text.OCSCyrillic (mkOCSCyrillic) +import GF.Text.LatinASupplement (mkLatinASupplement) +import GF.Text.Devanagari (mkDevanagari) +import GF.Text.Hiragana (mkJapanese) +import GF.Text.ExtendedArabic (mkArabic0600) +import GF.Text.ExtendedArabic (mkExtendedArabic) +import GF.Text.ExtraDiacritics (mkExtraDiacritics) + +import Data.Char + +mkUnicode :: String -> String +mkUnicode s = case s of + '/':'/':cs -> treat [] mkGreek unic ++ mkUnicode rest + '/':'+':cs -> mkHebrew unic ++ mkUnicode rest + '/':'-':cs -> mkArabic unic ++ mkUnicode rest + '/':'_':cs -> treat [] mkRussian unic ++ mkUnicode rest + '/':'*':cs -> mkRusKOI8 unic ++ mkUnicode rest + '/':'E':cs -> mkEthiopic unic ++ mkUnicode rest + '/':'T':cs -> mkTamil unic ++ mkUnicode rest + '/':'C':cs -> mkOCSCyrillic unic ++ mkUnicode rest + '/':'&':cs -> mkDevanagari unic ++ mkUnicode rest + '/':'L':cs -> mkLatinASupplement unic ++ mkUnicode rest + '/':'J':cs -> mkJapanese unic ++ mkUnicode rest + '/':'6':cs -> mkArabic0600 unic ++ mkUnicode rest + '/':'A':cs -> mkExtendedArabic unic ++ mkUnicode rest + '/':'X':cs -> mkExtraDiacritics unic ++ mkUnicode rest + c:cs -> c:mkUnicode cs + _ -> s + where + (unic,rest) = remClosing [] $ dropWhile isSpace $ drop 2 s + remClosing u s = case s of + c:'/':s | elem c "/+-_*ETC&LJ6AX" -> (reverse u, s) --- end need not match + c:cs -> remClosing (c:u) cs + _ -> (reverse u,[]) -- forgiving missing end + +-- | don't convert XML tags --- assumes \<\> always means XML tags +treat :: String -> (String -> String) -> String -> String +treat old mk s = case s of + '<':cs -> mk (reverse old) ++ '<':noTreat cs + c:cs -> treat (c:old) mk cs + _ -> mk (reverse old) + where + noTreat s = case s of + '>':cs -> '>' : treat [] mk cs + c:cs -> c : noTreat cs + _ -> s |
