GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3

author: aarne <aarne@cs.chalmers.se> 2008-05-21 09:26:44 +0000
committer: aarne <aarne@cs.chalmers.se> 2008-05-21 09:26:44 +0000
commit: 055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch)
tree: 0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/Text
parent: 915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff)
16 files changed, 1571 insertions, 0 deletions
diff --git a/src-3.0/GF/Text/Arabic.hs b/src-3.0/GF/Text/Arabic.hs
new file mode 100644
index 000000000..c482b1172
--- /dev/null
+++ b/src-3.0/GF/Text/Arabic.hs
@@ -0,0 +1,63 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Arabic
+-- Maintainer  : (Maintainer)
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:34 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.6 $
+--
+-- (Description of the module)
+-----------------------------------------------------------------------------
+
+module GF.Text.Arabic (mkArabic) where
+
+mkArabic :: String -> String
+mkArabic = unwords . (map mkArabicWord) . words
+----mkArabic = reverse . unwords . (map mkArabicWord) . words
+--- reverse : assumes everything's on same line
+
+type ArabicChar = Char
+
+mkArabicWord :: String -> [ArabicChar]
+mkArabicWord = map mkArabicChar . getLetterPos
+
+getLetterPos :: String -> [(Char,Int)]
+getLetterPos []     = []
+getLetterPos ('I':cs)  = ('*',7) : getLetterPos cs           -- 0xfe80
+getLetterPos ('O':cs)  = ('*',8) : getIn cs                  -- 0xfe8b
+getLetterPos ('l':'a':cs) = ('*',5) : getLetterPos cs        -- 0xfefb
+getLetterPos [c]    = [(c,1)]                                -- 1=isolated
+getLetterPos (c:cs) | isReduced c = (c,1) : getLetterPos cs
+getLetterPos (c:cs) = (c,3) : getIn cs                       -- 3=initial
+
+  
+getIn []     = []
+getIn ('I':cs)  = ('*',7) : getLetterPos cs                -- 0xfe80
+getIn ('O':cs)  = ('*',9) : getIn cs                       -- 0xfe8c
+getIn ('l':'a':cs) = ('*',6) : getLetterPos cs             -- 0xfefc
+getIn [c]    = [(c,2)]                                     -- 2=final
+getIn (c:cs) | isReduced c = (c,2) : getLetterPos cs
+getIn (c:cs) = (c,4) : getIn cs                            -- 4=medial
+
+isReduced :: Char -> Bool
+isReduced c = c `elem` "UuWiYOaAdVrzwj"
+
+mkArabicChar ('*',p) | p > 4 && p < 10 = 
+  (map toEnum [0xfefb,0xfefc,0xfe80,0xfe8b,0xfe8c]) !! (p-5)  
+mkArabicChar cp@(c,p) = case lookup c cc of Just c' -> (c' !! (p-1)) ; _ -> c 
+ where 
+   cc = mkArabicTab allArabicCodes allArabic
+
+mkArabicTab (c:cs) as = (c,as1) : mkArabicTab cs as2 where
+  (as1,as2) = if isReduced c then splitAt 2 as else splitAt 4 as
+mkArabicTab [] _ = []
+
+allArabicCodes = "UuWiYOabAtvgHCdVrzscSDTZoxfqklmnhwjy" 
+
+allArabic :: String
+allArabic = (map toEnum [0xfe81 .. 0xfef4]) -- I=0xfe80
+
+
diff --git a/src-3.0/GF/Text/Devanagari.hs b/src-3.0/GF/Text/Devanagari.hs
new file mode 100644
index 000000000..bf4343cd0
--- /dev/null
+++ b/src-3.0/GF/Text/Devanagari.hs
@@ -0,0 +1,97 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Devanagari
+-- Maintainer  : (Maintainer)
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:34 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.5 $
+--
+-- (Description of the module)
+-----------------------------------------------------------------------------
+
+module GF.Text.Devanagari (mkDevanagari) where
+
+mkDevanagari :: String -> String
+mkDevanagari = digraphWordToUnicode . adHocToDigraphWord
+
+adHocToDigraphWord :: String -> [(Char, Char)]
+adHocToDigraphWord str = case str of
+  [] -> []
+  '<' : cs -> ('\\', '<') : spoolMarkup cs 
+  ' ' : cs -> ('\\', ' ') : adHocToDigraphWord cs -- skip space
+
+-- if c1 is a vowel
+  -- Two of the same vowel => lengthening 
+  c1 : c2 : cs | c1 == c2 && isVowel c1 -> (cap c1, ':') : adHocToDigraphWord cs
+  -- digraphed or long vowel
+  c1 : c2 : cs | isVowel c1 && isVowel c2 -> (cap c1, cap c2) : adHocToDigraphWord cs
+  c1 : cs | isVowel c1 -> (' ', cap c1) : adHocToDigraphWord cs
+
+-- c1 isn't a vowel
+  -- c1 : 'a' : [] -> [(' ', c1)] -- a inherent
+  -- c1 : c2 : [] | isVowel c2 -> (' ', c1) : [(' ', c2)]
+
+  -- c1 is aspirated
+  c1 : 'H' : c2 : c3 : cs | c2 == c3 && isVowel c2 -> 
+    (c1, 'H') : (c2, ':') : adHocToDigraphWord cs
+  c1 : 'H' : c2 : c3 : cs | isVowel c2 && isVowel c3 -> 
+    (c1, 'H') : (c2, c3) : adHocToDigraphWord cs
+  c1 : 'H' : 'a' : cs -> (c1, 'H') : adHocToDigraphWord cs -- a inherent
+  c1 : 'H' : c2 : cs | isVowel c2 -> (c1, 'H') : (' ', c2) : adHocToDigraphWord cs  
+  -- not vowelless at EOW
+  c1 : 'H' : ' ' : cs -> (c1, 'H') : ('\\', ' ')  : adHocToDigraphWord cs
+  c1 : 'H' : [] -> [(c1, 'H')]
+  c1 : 'H' : cs -> (c1, 'H') : (' ', '^') : adHocToDigraphWord cs -- vowelless
+
+  -- c1 unasp.
+  c1 : c2 : c3 : cs | c2 == c3 && isVowel c2 -> (' ', c1) : (c2, ':') : adHocToDigraphWord cs
+  c1 : c2 : c3 : cs | isVowel c2 && isVowel c3 -> (' ', c1) : (c2, c3) : adHocToDigraphWord cs
+  c1 : 'a' : cs -> (' ', c1) : adHocToDigraphWord cs -- a inherent
+  c1 : c2 : cs | isVowel c2 -> (' ', c1) : (' ', c2) : adHocToDigraphWord cs  
+  -- not vowelless at EOW
+  c1 : ' ' : cs -> (' ', c1) : ('\\', ' '): adHocToDigraphWord cs
+  c1 : [] -> [(' ', c1)]
+  'M' : cs -> (' ', 'M') : adHocToDigraphWord cs -- vowelless but no vowelless sign for anusvara
+  c1 : cs -> (' ', c1) : (' ', '^') : adHocToDigraphWord cs -- vowelless  
+
+isVowel x = elem x "aeiou:"
+cap :: Char -> Char
+cap x = case x of 
+  'a' -> 'A'
+  'e' -> 'E'
+  'i' -> 'I'
+  'o' -> 'O'
+  'u' -> 'U'
+  c   -> c 
+
+spoolMarkup :: String -> [(Char, Char)]
+spoolMarkup s = case s of
+  -- [] -> [] -- Shouldn't happen
+  '>' : cs -> ('\\', '>') : adHocToDigraphWord cs  
+  c1 : cs -> ('\\', c1) : spoolMarkup cs
+
+
+digraphWordToUnicode :: [(Char, Char)] -> String
+digraphWordToUnicode = map digraphToUnicode
+
+digraphToUnicode :: (Char, Char) -> Char
+digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2 
+ where 
+   cc = zip allDevanagariCodes allDevanagari
+
+digraphedDevanagari = " ~ M ;__ AA: II: UU:RoLoEvE~ EE:AvA~ OAU kkH ggHNG ccH jjH \241 TTH DDH N ttH ddH nn. ppH bbH m y rr. l LL. v \231 S s h____ .-Sa: ii: uu:ror:eve~ eaiava~ oau ^____OM | -dddu______ Q X G zD.RH fy.R:L:mrmR#I#d#0#1#2#3#4#5#6#7#8#9#o" 
+
+allDevanagariCodes :: [(Char, Char)]
+allDevanagariCodes = mkPairs digraphedDevanagari
+
+allDevanagari :: String
+allDevanagari = (map toEnum [0x0901 .. 0x0970]) 
+
+mkPairs :: String -> [(Char, Char)]
+mkPairs str = case str of
+  [] -> []
+  c1 : c2 : cs -> (c1, c2) : mkPairs cs
+
diff --git a/src-3.0/GF/Text/Ethiopic.hs b/src-3.0/GF/Text/Ethiopic.hs
new file mode 100644
index 000000000..81abbf719
--- /dev/null
+++ b/src-3.0/GF/Text/Ethiopic.hs
@@ -0,0 +1,72 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Ethiopic
+-- Maintainer  : HH
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:35 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.6 $
+--
+-- Ascii-Unicode decoding for Ethiopian.
+-- Copyright (c) Harald Hammarstr�m 2003 under Gnu General Public License
+-----------------------------------------------------------------------------
+
+module GF.Text.Ethiopic (mkEthiopic) where
+
+mkEthiopic :: String -> String
+mkEthiopic = digraphWordToUnicode . adHocToDigraphWord
+
+-- mkEthiopic :: String -> String
+-- mkEthiopic = reverse . unwords . (map (digraphWordToUnicode . adHocToDigraphWord)) . words
+--- reverse : assumes everything's on same line
+
+adHocToDigraphWord :: String -> [(Char, Int)]
+adHocToDigraphWord str = case str of
+  [] -> []
+  '<' : cs -> ('<', -1) : spoolMarkup cs
+  c1 : cs | isVowel c1 -> (')', vowelOrder c1) : adHocToDigraphWord cs
+  -- c1 isn't a vowel
+  c1 : cs | not (elem c1 allEthiopicCodes) -> (c1, -1) : adHocToDigraphWord cs
+  c1 : c2 : cs | isVowel c2 -> (c1, vowelOrder c2) : adHocToDigraphWord cs
+  c1 : cs -> (c1, 5) : adHocToDigraphWord cs 
+  
+spoolMarkup :: String -> [(Char, Int)]
+spoolMarkup s = case s of
+  -- [] -> [] -- Shouldn't happen
+  '>' : cs -> ('>', -1) : adHocToDigraphWord cs  
+  c1 : cs -> (c1, -1) : spoolMarkup cs
+    
+isVowel x = elem x "A\228ui\239aeoI"
+
+vowelOrder :: Char -> Int
+vowelOrder x = case x of 
+  'A' -> 0
+  '\228' -> 0 -- �
+  'u' -> 1
+  'i' -> 2
+  'a' -> 3
+  'e' -> 4
+  'I' -> 5
+  '\239' -> 5 -- �
+  'o' -> 6
+  c   -> 5 -- vowelless 
+
+digraphWordToUnicode = map digraphToUnicode
+
+digraphToUnicode :: (Char, Int) -> Char
+-- digraphToUnicode (c1, c2) = c1
+ 
+digraphToUnicode (c1, -1) = c1 
+digraphToUnicode (c1, c2) = toEnum (0x1200 + c2 + 8*case lookup c1 cc of Just c' -> c') 
+    where 
+      cc = zip allEthiopicCodes allEthiopic
+
+allEthiopic :: [Int]
+allEthiopic = [0 .. 44] -- x 8
+
+allEthiopicCodes = "hlHmLrs$KQ__bBtcxXnN)kW__w(zZyd_jgG_TCPSLfp"
+
+-- Q = kW, X = xW, W = kW, G = gW  
+
diff --git a/src-3.0/GF/Text/ExtendedArabic.hs b/src-3.0/GF/Text/ExtendedArabic.hs
new file mode 100644
index 000000000..d2c5faac5
--- /dev/null
+++ b/src-3.0/GF/Text/ExtendedArabic.hs
@@ -0,0 +1,99 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : ExtendedArabic
+-- Maintainer  : (Maintainer)
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:36 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.6 $
+--
+-- (Description of the module)
+-----------------------------------------------------------------------------
+
+module GF.Text.ExtendedArabic (mkArabic0600, mkExtendedArabic) where
+
+mkArabic0600 :: String -> String
+mkArabic0600 = digraphWordToUnicode . aarnesToDigraphWord
+
+aarnesToDigraphWord :: String -> [(Char, Char)]
+aarnesToDigraphWord str = case str of
+  [] -> []
+  '<' : cs -> ('\\', '<') : spoolMarkup2 cs
+
+  'v' : cs -> ('T', 'H') : aarnesToDigraphWord cs
+  'a' : cs -> (' ', 'A') : aarnesToDigraphWord cs
+  'o' : cs -> (' ', '3') : aarnesToDigraphWord cs
+  'O' : cs -> ('\'', 'i') : aarnesToDigraphWord cs
+
+  'u' : cs -> ('\'', 'A') : aarnesToDigraphWord cs
+  'C' : cs -> (' ', 'X') : aarnesToDigraphWord cs
+
+  'U' : cs -> ('~', 'A') : aarnesToDigraphWord cs
+  'A' : cs -> ('"', 't') : aarnesToDigraphWord cs
+  'c' : cs -> ('s', 'h') : aarnesToDigraphWord cs
+  c : cs -> (' ', c) : aarnesToDigraphWord cs
+
+mkExtendedArabic :: String -> String
+mkExtendedArabic = digraphWordToUnicode . adHocToDigraphWord
+
+adHocToDigraphWord :: String -> [(Char, Char)]
+adHocToDigraphWord str = case str of
+  [] -> []
+  '<' : cs -> ('\\', '<') : spoolMarkup cs
+  -- Sorani 
+  'W' : cs -> (':', 'w') : adHocToDigraphWord cs -- ?? Will do
+  'E' : cs -> (' ', 'i') : adHocToDigraphWord cs -- ?? Letter missing!
+  'j' : cs -> ('d', 'j') : adHocToDigraphWord cs
+  'O' : cs -> ('v', 'w') : adHocToDigraphWord cs
+  'F' : cs -> (' ', 'v') : adHocToDigraphWord cs
+  'Z' : cs -> ('z', 'h') : adHocToDigraphWord cs
+  'I' : cs -> (' ', 'i') : adHocToDigraphWord cs -- ?? Letter missing!
+  'C' : cs -> ('c', 'h') : adHocToDigraphWord cs
+  -- Pashto
+  'e' : cs -> (':', 'y') : adHocToDigraphWord cs 
+  '$' : cs -> ('3', 'H') : adHocToDigraphWord cs 
+  'X' : cs -> ('s', '.') : adHocToDigraphWord cs
+  'G' : cs -> ('z', '.') : adHocToDigraphWord cs
+  'a' : cs -> (' ', 'A') : adHocToDigraphWord cs
+  'P' : cs -> ('\'', 'H') : adHocToDigraphWord cs
+  'R' : cs -> ('o', 'r') : adHocToDigraphWord cs 
+   -- Shared
+  'A' : cs -> (' ', 'h') : adHocToDigraphWord cs -- ?? Maybe to "t or 0x06d5
+  'c' : cs -> ('s', 'h') : adHocToDigraphWord cs
+  c : cs -> (' ', c) : adHocToDigraphWord cs
+
+
+-- Beginning 0x621 up and including 0x06d1
+digraphedExtendedArabic = " '~A'A'w,A'i A b\"t tTHdj H X dDH r z ssh S D T Z 3GH__________ - f q k l m n h w i y&a&w&i/a/w/i/W/o/~/'/,/|/6/v_____________#0#1#2#3#4#5#6#7#8#9#%#,#'#*>b>q$|> A2'2,3'A'w'w&y'Tb:b:BoT3b p4b4B'H:H2H\"H3Hch4HTdod.dTD:d:D3d3D4dTrvror.rvRz.:rzh4zs.+s*S:S3S3T33>ff.f: v4f.q3q-k~kok.k3k3K gog:g:G3Gvl.l3l3L:n>nTnon3n?h4H't>Y\"Yow-wvwww|w^w:w3w>y/yvy.w:y3y____ -ae"  
+
+digraphWordToUnicode = map digraphToUnicode
+
+digraphToUnicode :: (Char, Char) -> Char
+digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2  
+ where 
+   cc = zip allExtendedArabicCodes allExtendedArabic
+
+allExtendedArabicCodes :: [(Char, Char)]
+allExtendedArabicCodes = mkPairs digraphedExtendedArabic
+
+allExtendedArabic :: String
+allExtendedArabic = (map toEnum [0x0621 .. 0x06d1]) 
+
+mkPairs :: String -> [(Char, Char)]
+mkPairs str = case str of
+  [] -> []
+  c1 : c2 : cs -> (c1, c2) : mkPairs cs
+
+spoolMarkup :: String -> [(Char, Char)]
+spoolMarkup s = case s of
+   [] -> [] -- Shouldn't happen
+   '>' : cs -> ('\\', '>') : adHocToDigraphWord cs
+   c1 : cs -> ('\\', c1) : spoolMarkup cs
+
+spoolMarkup2 :: String -> [(Char, Char)]
+spoolMarkup2 s = case s of
+   [] -> [] -- Shouldn't happen
+   '>' : cs -> ('\\', '>') : aarnesToDigraphWord cs
+   c1 : cs -> ('\\', c1) : spoolMarkup2 cs
diff --git a/src-3.0/GF/Text/ExtraDiacritics.hs b/src-3.0/GF/Text/ExtraDiacritics.hs
new file mode 100644
index 000000000..f3d811c2c
--- /dev/null
+++ b/src-3.0/GF/Text/ExtraDiacritics.hs
@@ -0,0 +1,37 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : ExtraDiacritics
+-- Maintainer  : (Maintainer)
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:36 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.5 $
+--
+-- (Description of the module)
+-----------------------------------------------------------------------------
+
+module GF.Text.ExtraDiacritics (mkExtraDiacritics) where
+
+mkExtraDiacritics :: String -> String
+mkExtraDiacritics = mkExtraDiacriticsWord
+
+mkExtraDiacriticsWord :: String -> String
+mkExtraDiacriticsWord str = case str of
+  [] -> []
+  '<' : cs -> '<' : spoolMarkup cs
+  -- 
+  '/' : cs -> toEnum 0x0301 : mkExtraDiacriticsWord cs
+  '~' : cs -> toEnum 0x0306 : mkExtraDiacriticsWord cs
+  ':' : cs -> toEnum 0x0304 : mkExtraDiacriticsWord cs -- some of these could be put in LatinA
+  '.' : cs -> toEnum 0x0323 : mkExtraDiacriticsWord cs
+  'i' : '-' : cs -> toEnum 0x0268 : mkExtraDiacriticsWord cs -- in IPA extensions
+  -- Default 
+  c : cs -> c : mkExtraDiacriticsWord cs
+
+spoolMarkup :: String -> String
+spoolMarkup s = case s of
+   [] -> [] -- Shouldn't happen
+   '>' : cs -> '>' : mkExtraDiacriticsWord cs
+   c1 : cs -> c1 : spoolMarkup cs
diff --git a/src-3.0/GF/Text/Greek.hs b/src-3.0/GF/Text/Greek.hs
new file mode 100644
index 000000000..6b9361a29
--- /dev/null
+++ b/src-3.0/GF/Text/Greek.hs
@@ -0,0 +1,172 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Greek
+-- Maintainer  : (Maintainer)
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:37 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.5 $
+--
+-- (Description of the module)
+-----------------------------------------------------------------------------
+
+module GF.Text.Greek (mkGreek) where
+
+mkGreek :: String -> String
+mkGreek = unwords . (map mkGreekWord) . mkGravis . words
+
+--- TODO : optimize character formation by factorizing the case expressions
+
+type GreekChar = Char
+
+mkGreekWord :: String -> [GreekChar]
+mkGreekWord = map (toEnum . mkGreekChar) . mkGreekSpec
+
+mkGravis :: [String] -> [String]
+mkGravis [] = []
+mkGravis [w] = [w]
+mkGravis (w1:w2:ws)
+ | stressed w2 = mkG w1 : mkGravis (w2:ws)
+ | otherwise = w1 : w2 : mkGravis ws
+  where
+    stressed w = any (`elem`  "'~`") w
+    mkG :: String -> String
+    mkG w = let (w1,w2) = span (/='\'') w in
+               case w2 of
+                 '\'':v:cs | not (any isVowel cs) -> w1 ++ "`" ++ [v] ++ cs
+                 '\'':'!':v:cs | not (any isVowel cs) -> w1 ++ "`!" ++ [v] ++ cs
+                 _ -> w
+    isVowel c = elem c "aehiouw"
+
+mkGreekSpec :: String -> [(Char,Int)]
+mkGreekSpec str = case str of
+  [] -> []
+  '(' :'\'': '!' : c : cs -> (c,25)  : mkGreekSpec cs
+  '(' :'~' : '!' : c : cs -> (c,27)  : mkGreekSpec cs
+  '(' :'`' : '!' : c : cs -> (c,23)  : mkGreekSpec cs
+  '(' :      '!' : c : cs -> (c,21)  : mkGreekSpec cs
+  ')' :'\'': '!' : c : cs -> (c,24)  : mkGreekSpec cs
+  ')' :'~' : '!' : c : cs -> (c,26)  : mkGreekSpec cs
+  ')' :'`' : '!' : c : cs -> (c,22)  : mkGreekSpec cs
+  ')' :      '!' : c : cs -> (c,20)  : mkGreekSpec cs
+  '\'':      '!' : c : cs -> (c,30)  : mkGreekSpec cs
+  '~' :      '!' : c : cs -> (c,31)  : mkGreekSpec cs
+  '`' :      '!' : c : cs -> (c,32)  : mkGreekSpec cs
+  '!' :            c : cs -> (c,33)  : mkGreekSpec cs
+  '(' :'\'': c : cs -> (c,5)  : mkGreekSpec cs
+  '(' :'~' : c : cs -> (c,7)  : mkGreekSpec cs
+  '(' :'`' : c : cs -> (c,3)  : mkGreekSpec cs
+  '(' :      c : cs -> (c,1)  : mkGreekSpec cs
+  ')' :'\'': c : cs -> (c,4)  : mkGreekSpec cs
+  ')' :'~' : c : cs -> (c,6)  : mkGreekSpec cs
+  ')' :'`' : c : cs -> (c,2)  : mkGreekSpec cs
+  ')' :      c : cs -> (c,0)  : mkGreekSpec cs
+  '\'':      c : cs -> (c,10) : mkGreekSpec cs
+  '~' :      c : cs -> (c,11) : mkGreekSpec cs
+  '`' :      c : cs -> (c,12) : mkGreekSpec cs
+  c   :          cs -> (c,-1) : mkGreekSpec cs
+
+mkGreekChar (c,-1) = case lookup c cc of Just c' -> c' ; _ -> fromEnum c 
+ where 
+   cc = zip "abgdezhqiklmnxoprjstyfcuw" allGreekMin
+mkGreekChar (c,n) = case (c,n) of
+  ('a',10)        -> 0x03ac 
+  ('a',11)        -> 0x1fb6
+  ('a',12)        -> 0x1f70
+  ('a',30)        -> 0x1fb4
+  ('a',31)        -> 0x1fb7
+  ('a',32)        -> 0x1fb2
+  ('a',33)        -> 0x1fb3
+  ('a',n) | n >19 -> 0x1f80 + n - 20
+  ('a',n)         -> 0x1f00 + n
+  ('e',10)        -> 0x03ad    -- ' 
+--  ('e',11)        -> 0x1fb6    -- ~ can't happen
+  ('e',12)        -> 0x1f72    -- `
+  ('e',n)         -> 0x1f10 + n
+  ('h',10)        -> 0x03ae    -- ' 
+  ('h',11)        -> 0x1fc6    -- ~
+  ('h',12)        -> 0x1f74    -- `
+
+  ('h',30)        -> 0x1fc4
+  ('h',31)        -> 0x1fc7
+  ('h',32)        -> 0x1fc2
+  ('h',33)        -> 0x1fc3
+  ('h',n) | n >19 -> 0x1f90 + n - 20
+
+  ('h',n)         -> 0x1f20 + n
+  ('i',10)        -> 0x03af    -- ' 
+  ('i',11)        -> 0x1fd6    -- ~
+  ('i',12)        -> 0x1f76    -- `
+  ('i',n)         -> 0x1f30 + n
+  ('o',10)        -> 0x03cc    -- ' 
+--  ('o',11)        -> 0x1fb6    -- ~ can't happen
+  ('o',12)        -> 0x1f78    -- `
+  ('o',n)         -> 0x1f40 + n
+  ('y',10)        -> 0x03cd    -- ' 
+  ('y',11)        -> 0x1fe6    -- ~
+  ('y',12)        -> 0x1f7a    -- `
+  ('y',n)         -> 0x1f50 + n
+  ('w',10)        -> 0x03ce    -- ' 
+  ('w',11)        -> 0x1ff6    -- ~
+  ('w',12)        -> 0x1f7c    -- `
+
+  ('w',30)        -> 0x1ff4
+  ('w',31)        -> 0x1ff7
+  ('w',32)        -> 0x1ff2
+  ('w',33)        -> 0x1ff3
+  ('w',n) | n >19 -> 0x1fa0 + n - 20
+
+  ('w',n)         -> 0x1f60 + n
+  ('r',1)         -> 0x1fe5
+  _               -> mkGreekChar (c,-1) --- should not happen
+
+allGreekMin :: [Int]
+allGreekMin = [0x03b1 .. 0x03c9]
+
+
+{-
+encoding of Greek writing. Those hard to guess are marked with ---
+
+               maj   min
+A a Alpha     0391  03b1  
+B b Beta      0392  03b2
+G g Gamma     0393  03b3
+D d Delta     0394  03b4
+E e Epsilon   0395  03b5
+Z z Zeta      0396  03b6
+H h Eta ---   0397  03b7
+Q q Theta --- 0398  03b8
+I i Iota      0399  03b9
+K k Kappa     039a  03ba
+L l Lambda    039b  03bb
+M m My        039c  03bc
+N n Ny        039d  03bd
+X x Xi        039e  03be
+O o Omikron   039f  03bf
+P p Pi        03a0  03c0
+R r Rho       03a1  03c1
+  j Sigma ---       03c2
+S s Sigma     03a3  03c3
+T t Tau       03a4  03c4
+Y y Ypsilon   03a5  03c5
+F f Phi       03a6  03c6
+C c Khi ---   03a7  03c7
+U u Psi       03a8  03c8
+W w Omega --- 03a9  03c9
+
+( spiritus asper
+) spiritus lenis
+! iota subscriptum
+
+' acutus
+~ circumflexus
+` gravis
+
+-}
+
+
+
+
+
diff --git a/src-3.0/GF/Text/Hebrew.hs b/src-3.0/GF/Text/Hebrew.hs
new file mode 100644
index 000000000..c7026d8da
--- /dev/null
+++ b/src-3.0/GF/Text/Hebrew.hs
@@ -0,0 +1,53 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Hebrew
+-- Maintainer  : (Maintainer)
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:37 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.8 $
+--
+-- (Description of the module)
+-----------------------------------------------------------------------------
+
+module GF.Text.Hebrew (mkHebrew) where
+
+mkHebrew :: String -> String
+mkHebrew = mkHebrewWord
+----mkHebrew = reverse . mkHebrewWord
+--- reverse : assumes everything's on same line
+
+type HebrewChar = Char
+
+-- HH 031103 added code for spooling the markup
+-- removed reverse, words, unwords 
+-- (seemed obsolete and come out wrong on the screen)
+-- AR 26/1/2004 put reverse back - needed in Fudgets (but not in Java?)
+
+mkHebrewWord :: String -> [HebrewChar]
+-- mkHebrewWord = map mkHebrewChar
+
+mkHebrewWord s = case s of 
+  [] -> [] 
+  '<' : cs -> '<' : spoolMarkup cs
+  ' ' : cs -> ' ' : mkHebrewWord cs
+  c1 : cs -> mkHebrewChar c1 : mkHebrewWord cs
+
+spoolMarkup :: String -> String
+spoolMarkup s = case s of
+  [] -> [] -- Shouldn't happen
+  '>' : cs -> '>' : mkHebrewWord cs  
+  c1 : cs -> c1 : spoolMarkup cs
+
+mkHebrewChar c = case lookup c cc of Just c' -> c' ; _ -> c 
+ where 
+   cc = zip allHebrewCodes allHebrew
+
+allHebrewCodes = "-abgdhwzHTyKklMmNnSoPpCcqrst"
+
+allHebrew :: String
+allHebrew = (map toEnum (0x05be : [0x05d0 .. 0x05ea]))
+
+
diff --git a/src-3.0/GF/Text/Hiragana.hs b/src-3.0/GF/Text/Hiragana.hs
new file mode 100644
index 000000000..ba74fc83c
--- /dev/null
+++ b/src-3.0/GF/Text/Hiragana.hs
@@ -0,0 +1,95 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Hiragana
+-- Maintainer  : (Maintainer)
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:38 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.5 $
+--
+-- (Description of the module)
+-----------------------------------------------------------------------------
+
+module GF.Text.Hiragana (mkJapanese) where
+
+-- long vowel romaaji must be ei, ou not ee, oo  
+
+mkJapanese :: String -> String
+mkJapanese = digraphWordToUnicode . romaajiToDigraphWord
+
+romaajiToDigraphWord :: String -> [(Char, Char)]
+romaajiToDigraphWord str = case str of
+  [] -> []
+  '<' : cs -> ('\\', '<') : spoolMarkup cs 
+  ' ' : cs -> ('\\', ' ') : romaajiToDigraphWord cs
+  
+  c1 : cs | isVowel c1 -> (' ', cap c1) : romaajiToDigraphWord cs
+
+  -- The combinations
+  c1 : 'y' : c2 : cs -> (c1, 'i') : ('y', cap c2) : romaajiToDigraphWord cs 
+
+  's' : 'h' : 'a' : cs -> ('S', 'i') : ('y', 'A') : romaajiToDigraphWord cs
+  'c' : 'h' : 'a' : cs -> ('C', 'i') : ('y', 'A') : romaajiToDigraphWord cs
+  'j' : 'a' : cs -> ('j', 'i') : ('y', 'A') : romaajiToDigraphWord cs
+
+  's' : 'h' : 'u' : cs -> ('S', 'i') : ('y', 'U') : romaajiToDigraphWord cs
+  'c' : 'h' : 'u' : cs -> ('C', 'i') : ('y', 'U') : romaajiToDigraphWord cs
+  'j' : 'u' : cs -> ('j', 'i') : ('y', 'U') : romaajiToDigraphWord cs
+
+  's' : 'h' : 'o' : cs -> ('S', 'i') : ('y', 'O') : romaajiToDigraphWord cs
+  'c' : 'h' : 'o' : cs -> ('C', 'i') : ('y', 'O') : romaajiToDigraphWord cs
+  'j' : 'o' : cs -> ('j', 'i') : ('y', 'O') : romaajiToDigraphWord cs
+
+  'd' : 'z' : c3 : cs -> ('D', c3) : romaajiToDigraphWord cs
+  't' : 's' : c3 : cs -> ('T', c3) : romaajiToDigraphWord cs
+  'c' : 'h' : c3 : cs -> ('C', c3) : romaajiToDigraphWord cs
+  's' : 'h' : c3 : cs -> ('S', c3) : romaajiToDigraphWord cs
+  'z' : 'h' : c3 : cs -> ('Z', c3) : romaajiToDigraphWord cs
+
+  c1 : ' ' : cs -> (' ', c1) : ('\\', ' ') : romaajiToDigraphWord cs -- n
+  c1 : [] -> [(' ', c1)] -- n
+   
+  c1 : c2 : cs | isVowel c2 -> (c1, c2) : romaajiToDigraphWord cs
+  c1 : c2 : cs | c1 == c2 -> ('T', 'U') : romaajiToDigraphWord (c2 : cs) -- double cons
+  c1 : cs -> (' ', c1) : romaajiToDigraphWord cs -- n
+  
+isVowel x = elem x "aeiou"
+cap :: Char -> Char
+cap x = case x of 
+  'a' -> 'A'
+  'e' -> 'E'
+  'i' -> 'I'
+  'o' -> 'O'
+  'u' -> 'U'
+  c   -> c 
+
+spoolMarkup :: String -> [(Char, Char)]
+spoolMarkup s = case s of
+  -- [] -> [] -- Shouldn't happen
+  '>' : cs -> ('\\', '>') : romaajiToDigraphWord cs  
+  c1 : cs -> ('\\', c1) : spoolMarkup cs
+
+digraphWordToUnicode :: [(Char, Char)] -> String
+digraphWordToUnicode = map digraphToUnicode
+
+digraphToUnicode :: (Char, Char) -> Char
+digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2 
+ where 
+   cc = zip allHiraganaCodes allHiragana
+
+allHiraganaCodes :: [(Char, Char)]
+allHiraganaCodes = mkPairs digraphedHiragana
+
+allHiragana :: String
+allHiragana = (map toEnum [0x3041 .. 0x309f]) 
+
+mkPairs :: String -> [(Char, Char)]
+mkPairs str = case str of
+  [] -> []
+  c1 : c2 : cs -> (c1, c2) : mkPairs cs
+
+digraphedHiragana = " a A i I u U e E o OkagakigikugukegekogosazaSiZisuzusezesozotadaCijiTUTuDutedetodonaninunenohabapahibipihubupuhebepehobopomamimumemoyAyayUyuyOyorarirurerowaWawiwewo nvukAkE____<< o>>o  >'> b"
+
+
diff --git a/src-3.0/GF/Text/LatinASupplement.hs b/src-3.0/GF/Text/LatinASupplement.hs
new file mode 100644
index 000000000..f42423c91
--- /dev/null
+++ b/src-3.0/GF/Text/LatinASupplement.hs
@@ -0,0 +1,69 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : LatinASupplement
+-- Maintainer  : (Maintainer)
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:39 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.5 $
+--
+-- (Description of the module)
+-----------------------------------------------------------------------------
+
+module GF.Text.LatinASupplement (mkLatinASupplement) where
+
+mkLatinASupplement :: String -> String
+mkLatinASupplement = mkLatinASupplementWord
+
+mkLatinASupplementWord :: String -> String
+mkLatinASupplementWord str = case str of
+  [] -> []
+  '<' : cs -> '<' : spoolMarkup cs
+  -- Romanian & partly Turkish
+  's' : ',' : cs -> toEnum 0x015f : mkLatinASupplementWord cs
+  'a' : '%' : cs -> toEnum 0x0103 : mkLatinASupplementWord cs
+  -- Slavic and more
+  'c' : '^' : cs -> toEnum 0x010d : mkLatinASupplementWord cs
+  's' : '^' : cs -> toEnum 0x0161 : mkLatinASupplementWord cs
+  'c' : '\'' : cs -> toEnum 0x0107 : mkLatinASupplementWord cs
+  'z' : '^' : cs -> toEnum 0x017e : mkLatinASupplementWord cs
+  -- Turkish
+  'g' : '%' : cs -> toEnum 0x011f : mkLatinASupplementWord cs
+  'I' : cs -> toEnum 0x0131 : mkLatinASupplementWord cs
+  'c' : ',' : cs -> toEnum 0x00e7 : mkLatinASupplementWord cs
+  -- Polish
+  'e' : ',' : cs -> toEnum 0x0119 : mkLatinASupplementWord cs
+  'a' : ',' : cs -> toEnum 0x0105 : mkLatinASupplementWord cs
+  'l' : '/' : cs -> toEnum 0x0142 : mkLatinASupplementWord cs
+  'z' : '.' : cs -> toEnum 0x017c : mkLatinASupplementWord cs
+  'n' : '\'' : cs -> toEnum 0x0144 : mkLatinASupplementWord cs
+  's' : '\'' : cs -> toEnum 0x015b : mkLatinASupplementWord cs
+-- 'c' : '\'' : cs -> toEnum 0x0107 : mkLatinASupplementWord cs
+
+  -- Hungarian 
+  'o' : '%' : cs -> toEnum 0x0151 : mkLatinASupplementWord cs
+  'u' : '%' : cs -> toEnum 0x0171 : mkLatinASupplementWord cs
+
+  -- Mongolian
+  'j' : '^' : cs -> toEnum 0x0135 : mkLatinASupplementWord cs
+
+  -- Khowar (actually in Combining diacritical marks not Latin-A Suppl.)
+  'o' : '.' : cs -> 'o' : (toEnum 0x0323 : mkLatinASupplementWord cs)
+
+  -- Length bars over vowels e.g korean
+  'a' : ':' : cs -> toEnum 0x0101 : mkLatinASupplementWord cs
+  'e' : ':' : cs -> toEnum 0x0113 : mkLatinASupplementWord cs
+  'i' : ':' : cs -> toEnum 0x012b : mkLatinASupplementWord cs
+  'o' : ':' : cs -> toEnum 0x014d : mkLatinASupplementWord cs
+  'u' : ':' : cs -> toEnum 0x016b : mkLatinASupplementWord cs
+
+  -- Default 
+  c : cs -> c : mkLatinASupplementWord cs
+
+spoolMarkup :: String -> String
+spoolMarkup s = case s of
+   [] -> [] -- Shouldn't happen
+   '>' : cs -> '>' : mkLatinASupplementWord cs
+   c1 : cs -> c1 : spoolMarkup cs
diff --git a/src-3.0/GF/Text/OCSCyrillic.hs b/src-3.0/GF/Text/OCSCyrillic.hs
new file mode 100644
index 000000000..0d4696944
--- /dev/null
+++ b/src-3.0/GF/Text/OCSCyrillic.hs
@@ -0,0 +1,47 @@
+----------------------------------------------------------------------
+-- |
+-- Maintainer  : (Maintainer)
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:39 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.6 $
+--
+-- (Description of the module)
+-----------------------------------------------------------------------------
+
+module GF.Text.OCSCyrillic (mkOCSCyrillic) where
+
+mkOCSCyrillic :: String -> String
+mkOCSCyrillic = mkOCSCyrillicWord
+
+mkOCSCyrillicWord :: String -> String
+mkOCSCyrillicWord str = case str of
+  [] -> []
+  ' ' : cs -> ' ' : mkOCSCyrillicWord cs
+  '<' : cs -> '<' : spoolMarkup cs  
+  '\228' : cs -> toEnum 0x0463 : mkOCSCyrillicWord cs -- �
+  'j' : 'e' : '~' : cs -> toEnum 0x0469 : mkOCSCyrillicWord cs 
+  'j' : 'o' : '~' : cs -> toEnum 0x046d : mkOCSCyrillicWord cs
+  'j' : 'e' : cs -> toEnum 0x0465 : mkOCSCyrillicWord cs
+  'e' : '~' : cs -> toEnum 0x0467 : mkOCSCyrillicWord cs
+  'o' : '~' : cs -> toEnum 0x046b : mkOCSCyrillicWord cs
+  'j' : 'u' : cs -> toEnum 0x044e : mkOCSCyrillicWord cs
+  'j' : 'a' : cs -> toEnum 0x044f : mkOCSCyrillicWord cs 
+  'u' : cs -> toEnum 0x0479 : mkOCSCyrillicWord cs
+  c : cs -> (mkOCSCyrillicChar c) : mkOCSCyrillicWord cs   
+
+spoolMarkup :: String -> String
+spoolMarkup s = case s of
+   [] -> [] -- Shouldn't happen
+   '>' : cs -> '>' : mkOCSCyrillicWord cs
+   c1 : cs -> c1 : spoolMarkup cs
+
+mkOCSCyrillicChar :: Char -> Char
+mkOCSCyrillicChar c = case lookup c cc of Just c' -> c' ; _ -> c 
+ where 
+   cc = zip "abvgdeZziJklmnoprstYfxCqwWUyIE" allOCSCyrillic
+
+allOCSCyrillic :: String
+allOCSCyrillic = (map toEnum [0x0430 .. 0x044e])
diff --git a/src-3.0/GF/Text/Russian.hs b/src-3.0/GF/Text/Russian.hs
new file mode 100644
index 000000000..c4f1bfd89
--- /dev/null
+++ b/src-3.0/GF/Text/Russian.hs
@@ -0,0 +1,56 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Russian
+-- Maintainer  : (Maintainer)
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:40 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.5 $
+--
+-- (Description of the module)
+-----------------------------------------------------------------------------
+
+module GF.Text.Russian (mkRussian, mkRusKOI8) where
+
+-- | an ad hoc ASCII encoding. Delimiters: @\/_ _\/@
+mkRussian :: String -> String
+mkRussian = unwords . (map mkRussianWord) . words
+
+-- | the KOI8 encoding, incomplete. Delimiters: @\/* *\/@
+mkRusKOI8 :: String -> String
+mkRusKOI8 = unwords . (map mkRussianKOI8) . words
+
+type RussianChar = Char
+
+mkRussianWord :: String -> [RussianChar]
+mkRussianWord = map (mkRussianChar allRussianCodes)
+
+mkRussianKOI8 :: String -> [RussianChar]
+mkRussianKOI8 = map (mkRussianChar allRussianKOI8)
+
+mkRussianChar chars c = case lookup c cc of Just c' -> c' ; _ -> c 
+ where 
+   cc = zip chars allRussian
+
+allRussianCodes :: [Char]
+allRussianCodes = 
+ -- changed to Ints to work with Haskell compilers e.g. GHC 6.5 CVS
+ -- which expect source files to be in UTF-8
+ -- /bringert 2006-05-19
+ -- "��ABVGDEXZIJKLMNOPRSTUFHCQW�}!*�Y�abvgdexzijklmnoprstufhcqw#01'�y�"
+ map toEnum [197,229,65,66,86,71,68,69,88,90,73,74,75,76,77,78,79,80,82,83,84,85,70,72,67,81,87,163,125,33,42,214,89,196,97,98,118,103,100,101,120,122,105,106,107,108,109,110,111,112,114,115,116,117,102,104,99,113,119,35,48,49,39,246,121,228]
+
+allRussianKOI8 :: [Char]
+allRussianKOI8 =
+ -- changed to Ints to work with Haskell compilers e.g. GHC 6.5 CVS
+ -- which expect source files to be in UTF-8
+ -- /bringert 2006-05-19
+ -- "^@����������������������������������������������������������������"
+ map toEnum [94,64,225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,242,243,244,245,230,232,227,254,251,253,248,249,255,252,224,241,193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,210,211,212,213,198,200,195,222,219,221,216,217,223,220,192,209]
+
+allRussian :: String
+allRussian = (map toEnum (0x0401:0x0451:[0x0410 .. 0x044f])) -- �� in odd places
+
+
diff --git a/src-3.0/GF/Text/Tamil.hs b/src-3.0/GF/Text/Tamil.hs
new file mode 100644
index 000000000..8ee171acf
--- /dev/null
+++ b/src-3.0/GF/Text/Tamil.hs
@@ -0,0 +1,77 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Tamil
+-- Maintainer  : (Maintainer)
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:40 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.5 $
+--
+-- (Description of the module)
+-----------------------------------------------------------------------------
+
+module GF.Text.Tamil (mkTamil) where
+
+mkTamil :: String -> String
+mkTamil = digraphWordToUnicode . adHocToDigraphWord
+
+adHocToDigraphWord :: String -> [(Char, Char)]
+adHocToDigraphWord str = case str of
+  [] -> []
+  '<' : cs -> ('\\', '<') : spoolMarkup cs 
+  ' ' : cs -> ('\\', ' ') : adHocToDigraphWord cs -- skip space
+
+-- if c1 is a vowel
+  -- Two of the same vowel => lengthening 
+  c1 : c2 : cs | c1 == c2 && isVowel c1 -> (cap c1, ':') : adHocToDigraphWord cs  
+  -- digraphed or long vowel
+  c1 : c2 : cs | isVowel c1 && isVowel c2 -> (cap c1, cap c2) : adHocToDigraphWord cs
+  c1 : cs | isVowel c1 -> (' ', cap c1) : adHocToDigraphWord cs
+
+-- c1 isn't a vowel
+  c1 : c2 : c3 : cs | c2 == c3 && isVowel c2 -> (' ', c1) : (c2, ':') : adHocToDigraphWord cs
+  c1 : c2 : c3 : cs | isVowel c2 && isVowel c3 -> (' ', c1) : (c2, c3) : adHocToDigraphWord cs
+  c1 : 'a' : cs -> (' ', c1) : adHocToDigraphWord cs -- a inherent
+  c1 : c2 : cs | isVowel c2 -> (' ', c1) : (' ', c2) : adHocToDigraphWord cs  
+
+  c1 : cs -> (' ', c1) : (' ', '.') : adHocToDigraphWord cs -- vowelless  
+
+isVowel x = elem x "aeiou:"
+cap :: Char -> Char
+cap x = case x of 
+  'a' -> 'A'
+  'e' -> 'E'
+  'i' -> 'I'
+  'o' -> 'O'
+  'u' -> 'U'
+  c   -> c 
+
+spoolMarkup :: String -> [(Char, Char)]
+spoolMarkup s = case s of
+  -- [] -> [] -- Shouldn't happen
+  '>' : cs -> ('\\', '>') : adHocToDigraphWord cs  
+  c1 : cs -> ('\\', c1) : spoolMarkup cs
+
+digraphWordToUnicode :: [(Char, Char)] -> String
+digraphWordToUnicode = map digraphToUnicode
+
+digraphToUnicode :: (Char, Char) -> Char
+digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2 
+ where 
+   cc = zip allTamilCodes allTamil
+
+mkPairs :: String -> [(Char, Char)]
+mkPairs str = case str of
+  [] -> []
+  c1 : c2 : cs -> (c1, c2) : mkPairs cs
+
+allTamilCodes :: [(Char, Char)]
+allTamilCodes = mkPairs digraphedTamil
+
+allTamil :: String
+allTamil = (map toEnum [0x0b85 .. 0x0bfa]) 
+
+digraphedTamil = " AA: II: UU:______ EE:AI__ OO:AU k______ G c__ j__ \241 T______ N t______ V n p______ m y r l L M v__ s S h________a: ii: uu:______ ee:ai__ oo:au .__________________ :______________________________#1#2#3#4#5#6#7#8#9^1^2^3=d=m=y=d=c==ru##"
+
diff --git a/src-3.0/GF/Text/Text.hs b/src-3.0/GF/Text/Text.hs
new file mode 100644
index 000000000..b55355c20
--- /dev/null
+++ b/src-3.0/GF/Text/Text.hs
@@ -0,0 +1,149 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Text
+-- Maintainer  : AR
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/06/23 14:32:44 $ 
+-- > CVS $Author: aarne $
+-- > CVS $Revision: 1.10 $
+--
+-- elementary text postprocessing. AR 21\/11\/2001.
+--
+-- This is very primitive indeed. The functions should work on
+-- token lists and not on strings. AR 5\/12\/2002
+--
+-- XML hack 14\/8\/2004; not in use yet
+-----------------------------------------------------------------------------
+
+module GF.Text.Text (untokWithXML,
+	     exceptXML,
+	     formatAsTextLit,
+	     formatAsCodeLit,
+	     formatAsText,
+	     formatAsHTML,
+	     formatAsLatex,
+	     formatAsCode,
+	     performBinds,
+	     performBindsFinnish,
+	     unStringLit,
+	     concatRemSpace
+	    ) where
+
+import GF.Data.Operations
+import Data.Char
+
+-- | does not apply untokenizer within XML tags --- heuristic "< "
+-- this function is applied from top level...
+untokWithXML :: (String -> String) -> String -> String
+untokWithXML unt s = case s of
+  '<':cs@(c:_) | isAlpha c -> '<':beg ++ ">" ++ unto (drop 1 rest) where 
+                  (beg,rest) = span (/='>') cs
+  '<':cs -> '<':unto cs ---
+  [] -> []
+  _ -> unt beg ++ unto rest where
+               (beg,rest) = span (/='<') s
+ where
+   unto = untokWithXML unt
+
+-- | ... whereas this one is embedded on a branch
+exceptXML :: (String -> String) -> String -> String
+exceptXML unt s = '<':beg ++ ">" ++ unt (drop 1 rest) where 
+  (beg,rest) = span (/='>') s
+
+formatAsTextLit :: String -> String
+formatAsTextLit = formatAsText . unwords . map unStringLit . words 
+--- hope that there will be deforestation...
+
+formatAsCodeLit :: String -> String
+formatAsCodeLit = formatAsCode . unwords . map unStringLit . words 
+
+formatAsText,formatAsHTML,formatAsLatex :: String -> String
+formatAsText  = formatAsTextGen (const False) (=="&-") 
+formatAsHTML  = formatAsTextGen (\s -> take 1 s == "<" || last s == '>') (const False) 
+formatAsLatex = formatAsTextGen ((=="\\") . take 1)  (const False) 
+
+formatAsTextGen :: (String -> Bool) -> (String -> Bool) -> String -> String
+formatAsTextGen tag para = unwords . format . cap . words where
+  format ws = case ws of
+    w :     ww | capit w -> format $ (cap ww)
+    w : c : ww | major c -> format $ (w ++ c) :(cap ww)
+    w : c : ww | minor c -> format $ (w ++ c) :    ww
+    p : c : ww | openp p -> format $ (p ++ c) :ww
+    p : c : ww | spanish p -> format $ (p ++ concat (cap [c])) :ww
+    c     : ww | para  c -> "\n\n"        : format ww
+    w     : ww           -> w             : format ww
+    [] -> []
+  cap (p:ww) | tag p = p : cap ww
+  cap ((c:cs):ww) = (toUpper c : cs) : ww
+  cap [] = []
+  capit = (=="&|")
+  major = flip elem (map singleton ".!?") 
+  minor = flip elem (map singleton ",:;)")
+  openp = all (flip elem "(")
+  spanish = all (flip elem "\161\191")
+
+formatAsCode :: String -> String
+formatAsCode = rend 0 . words where
+  -- render from BNF Converter
+  rend i ss = case ss of
+    "["      :ts -> cons "["  $ rend i ts
+    "("      :ts -> cons "("  $ rend i ts
+    "{"      :ts -> cons "{"  $ new (i+1) $ rend (i+1) ts
+    "}" : ";":ts -> new (i-1) $ space "}" $ cons ";" $ new (i-1) $ rend (i-1) ts
+    "}"      :ts -> new (i-1) $ cons "}" $ new (i-1) $ rend (i-1) ts
+    ";"      :ts -> cons ";"  $ new i $ rend i ts
+    t  : "," :ts -> cons t    $ space "," $ rend i ts
+    t  : ")" :ts -> cons t    $ cons ")"  $ rend i ts
+    t  : "]" :ts -> cons t    $ cons "]"  $ rend i ts
+    t        :ts -> space t   $ rend i ts
+    _            -> ""
+  cons s t  = s ++ t
+  new i s   = '\n' : replicate (2*i) ' ' ++ dropWhile isSpace s
+  space t s = if null s then t else t ++ " " ++ s
+
+performBinds :: String -> String
+performBinds = performBindsOpt (\x y -> y) 
+
+
+-- The function defines an effect of the former on the latter part,
+-- such as in vowel harmony. It is triggered by the binder token "&*"
+
+performBindsOpt :: (String -> String -> String) -> String -> String
+performBindsOpt harm = unwords . format . words where
+  format ws = case ws of
+    w : "&+" : u : ws -> format ((w ++        u) : ws)
+    w : "&*" : u : ws -> format ((w ++ harm w u) : ws)
+    w : ws            -> w : format ws
+    []                -> []
+
+-- unlexer for Finnish particles
+-- Notice: left associativity crucial for "tie &* ko &* han" --> "tiek�h�n"
+
+performBindsFinnish :: String -> String
+performBindsFinnish = performBindsOpt vowelHarmony where
+  vowelHarmony w p = if any (flip elem "aouAOU") w then p else map toFront p
+  toFront c = case c of
+    'A' -> '\196'
+    'O' -> '\214'
+    'a' -> '\228'
+    'o' -> '\246'
+    _ -> c
+
+unStringLit :: String -> String
+unStringLit s = case s of
+  c : cs | strlim c && strlim (last cs) -> init cs
+  _ -> s
+ where
+   strlim = (=='\'')
+
+concatRemSpace :: String -> String
+concatRemSpace = concat . words
+{-
+concatRemSpace s = case s of
+  '<':cs -> exceptXML concatRemSpace cs
+  c : cs | isSpace c -> concatRemSpace cs
+  c :cs -> c : concatRemSpace cs
+  _ -> s
+-}
diff --git a/src-3.0/GF/Text/Thai.hs b/src-3.0/GF/Text/Thai.hs
new file mode 100644
index 000000000..1b186cb3a
--- /dev/null
+++ b/src-3.0/GF/Text/Thai.hs
@@ -0,0 +1,368 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Thai
+-- Maintainer  : (Maintainer)
+-- Stability   : (experimental)
+-- Portability : (portable)
+--
+--
+-- Thai transliteration and other alphabet information.
+-----------------------------------------------------------------------------
+
+-- AR 27/12/2006. Execute test2 to see the transliteration table.
+
+module GF.Text.Thai (
+  mkThai,mkThaiWord,mkThaiPron,mkThaiFake,thaiFile,thaiPronFile,thaiFakeFile
+  ) where
+
+import qualified Data.Map as Map
+import Data.Char
+
+-- for testing
+import GF.Text.UTF8 
+import Data.List
+
+import Debug.Trace
+
+
+mkThai :: String -> String
+mkThai = concat . map mkThaiWord . words
+mkThaiPron = unwords . map mkPronSyllable . words
+mkThaiFake = unwords . map (fakeEnglish . mkPronSyllable) . words
+
+
+type ThaiChar = Char
+
+mkThaiWord :: String -> [ThaiChar]
+mkThaiWord = map (toEnum . mkThaiChar) . unchar . snd . pronAndOrth
+
+mkThaiChar :: String -> Int
+mkThaiChar c = maybe 0 id $ Map.lookup c thaiMap
+
+thaiMap :: Map.Map String Int
+thaiMap = Map.fromList $ zip allThaiTrans allThaiCodes
+
+-- convert all string literals in a text
+
+thaiStrings :: String -> String
+thaiStrings = convStrings mkThai
+
+thaiPronStrings :: String -> String
+thaiPronStrings = convStrings mkThaiPron
+
+convStrings conv s = case s of
+  '"':cs -> let (t,_:r) = span (/='"') cs in
+            '"': conv t ++ "\"" ++ convStrings conv r
+  c:cs -> c : convStrings conv cs
+  _ -> s
+
+
+-- each character is either [letter] or [letter+nonletter]
+
+unchar :: String -> [String]
+unchar s = case s of
+  c:d:cs 
+   | isAlpha d -> [c]   : unchar (d:cs)
+   | d == '?'  ->         unchar cs -- use "o?" to represent implicit 'o'
+   | otherwise -> [c,d] : unchar cs
+  [_]          -> [s]
+  _            -> []
+
+-- you can prefix transliteration by irregular phonology in []
+
+pronAndOrth :: String -> (Maybe String, String)
+pronAndOrth s = case s of
+  '[':cs -> case span (/=']') cs of
+    (p,_:o) -> (Just p,o)
+    _ -> (Nothing,s)
+  _ -> (Nothing,s)
+
+allThaiTrans :: [String]
+allThaiTrans = words $
+  "-  k  k1 -  k2 -  k3 g  c  c1 c2 s' c3 y' d' t' " ++
+  "t1 t2 t3 n' d  t  t4 t5 t6 n  b  p  p1 f  p2 f' " ++
+  "p3 m  y  r  -  l  -  w  s- s. s  h  l' O  h' -  " ++
+  "a. a  a: a+ i  i: v  v: u  u: -  -  -  -  -  -  " ++
+  "e  e' o: a% a& L  R  S  T1 T2 T3 T4 K  -  -  -  " ++
+  "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 -  -  -  -  -  -  "
+
+allThaiCodes :: [Int]
+allThaiCodes = [0x0e00 .. 0x0e7f]
+
+
+---------------------
+-- heuristic pronunciation of codes
+---------------------
+
+-- fake English for TTS, a la Teach Yourself Thai
+
+fakeEnglish :: String -> String
+fakeEnglish s = case s of
+  'a':'a':cs -> "ah"  ++ fakeEnglish cs
+  'a':'y':cs -> "ai"  ++ fakeEnglish cs
+  'a'    :cs -> "ah"  ++ fakeEnglish cs
+  'c':'h':cs -> "ch"  ++ fakeEnglish cs
+  'c'    :cs -> "j"   ++ fakeEnglish cs
+  'e':'e':cs -> "aih" ++ fakeEnglish cs
+  'g'    :cs -> "ng"  ++ fakeEnglish cs
+  'i':'i':cs -> "ee"  ++ fakeEnglish cs
+  'k':'h':cs -> "k"   ++ fakeEnglish cs
+  'k'    :cs -> "g"   ++ fakeEnglish cs
+  'O':'O':cs -> "or"  ++ fakeEnglish cs
+  'O'    :cs -> "or"  ++ fakeEnglish cs
+  'o':'o':cs -> "or"  ++ fakeEnglish cs
+  'p':'h':cs -> "p"   ++ fakeEnglish cs
+  'p'    :cs -> "b"   ++ fakeEnglish cs
+  't':'h':cs -> "t"   ++ fakeEnglish cs
+  't'    :cs -> "d"   ++ fakeEnglish cs
+  'u':'u':cs -> "oo"  ++ fakeEnglish cs
+  'u'    :cs -> "oo"  ++ fakeEnglish cs
+  'v':'v':cs -> "eu"  ++ fakeEnglish cs
+  'v'    :cs -> "eu"  ++ fakeEnglish cs
+  '\228':'\228':cs -> "air" ++ fakeEnglish cs
+  '\228'    :cs -> "a"   ++ fakeEnglish cs
+  '\246':'\246':cs -> "er"  ++ fakeEnglish cs
+  '\246'    :cs -> "er"  ++ fakeEnglish cs
+  c:cs | isTone c -> fakeEnglish cs
+  c:cs -> c : fakeEnglish cs
+  _ -> s
+ where
+   isTone = flip elem "'`^~"
+
+
+-- this works for one syllable
+
+mkPronSyllable s = case fst $ pronAndOrth s of
+  Just p -> p
+  _ -> pronSyllable $ getSyllable $ map mkThaiChar $ unchar s
+
+data Syllable = Syll {
+  initv   :: [Int],
+  initc   :: [Int],
+  midv    :: [Int],
+  finalc  :: [Int],
+  finalv  :: [Int],
+  tone    :: [Int],
+  shorten :: Bool,
+  kill    :: Bool
+  }
+   deriving Show
+
+data Tone = TMid | TLow | THigh | TRise | TFall
+  deriving Show
+
+data CClass = CLow | CMid | CHigh
+  deriving Show
+
+pronSyllable :: Syllable -> String
+pronSyllable s = 
+  initCons ++ tonem ++ vowel ++ finalCons
+ where
+
+  vowel = case (initv s, midv s, finalv s, finalc s, shorten s, tone s) of
+    ([0x0e40],[0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:y
+    ([0x0e40],[0x0e2d,0x0e35],_,_,_,_) -> "va" -- e-i:O
+    ([0x0e40],[0x0e30,0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:ya.
+    ([0x0e40],[0x0e30,0x0e2d],_,_,_,_) -> "\246"  -- e-Oa.
+    ([0x0e40],[0x0e30,0x0e32],_,_,_,_) -> "O"  -- e-a:a. -- open o 
+    ([0x0e40],[0x0e2d],_,_,_,_) -> "\246\246"        -- e-O
+    ([0x0e40],[0x0e34],_,_,_,_) -> "\246\246"        -- e-i
+    ([0x0e40],[0x0e30],_,_,_,_) -> "e"         -- e-a.
+    ([0x0e40],[0x0e32],_,_,_,_) -> "aw"        -- e-a:
+    ([0x0e40],[],[],[0x0e22],_,_) -> "\246\246y"     -- e-y
+    ([0x0e40],[],[],_,True,_) -> "e"
+
+    ([0x0e41],[0x0e30],_,_,_,_) -> "\228"         -- ä-a.
+    ([0x0e41],[],[],_,True,_)   -> "\228"
+
+    ([0x0e42],[0x0e30],_,_,_,_) -> "o"         -- o:-a.
+
+    ([],[0x0e2d],_,[0x0e22],_,_) -> "OOy"      -- Oy
+    ([],[0x0e2d],_,_,_,_)        -> "OO"       -- O
+
+    ([],[],[],_,_,_) -> "o"
+
+    (i,m,f,_,_,_) -> concatMap pronThaiChar (reverse $ f ++ m ++ i) ----
+
+  initCons = concatMap pronThaiChar $ case (reverse $ initc s) of
+    0x0e2b:cs@(_:_) -> cs -- high h
+    0x0e2d:cs@(_:_) -> cs -- O
+    cs -> cs
+
+  finalCons =
+    let (c,cs) = splitAt 1 $ finalc s
+    in
+    case c of
+      [] -> []
+      [0x0e22] -> [] --- y
+      [k] -> concatMap pronThaiChar (reverse cs) ++ finalThai k
+
+  iclass = case take 1 (reverse $ initc s) of
+    [c] -> classThai c
+    []  -> CMid -- O
+
+  isLong = not (shorten s) && case vowel of
+    _:_:_ -> True ----
+    _ -> False
+
+  isLive = case finalCons of
+    c | elem c ["n","m","g"] -> True
+    "" -> isLong
+    _ -> False
+
+  tonem = case (iclass,isLive,isLong,tone s) of
+    (_,_,_,   [0x0e4a])    -> tHigh
+    (_,_,_,   [0x0e4b])    -> tRise
+    (CLow,_,_,[0x0e49])    -> tRise
+    (_,_,_,   [0x0e49])    -> tFall
+    (CLow,_,_,[0x0e48])    -> tFall
+    (_,   _,_,[0x0e48])    -> tLow
+    (CHigh,True,_,_)       -> tRise
+    (_,    True,_,_)       -> tMid
+    (CLow,False,False,_)   -> tHigh
+    (CLow,False,_,_)       -> tFall
+    _                      -> tLow
+
+(tMid,tHigh,tLow,tRise,tFall) = ("-","'","`","~","^")
+
+isVowel c = 0x0e30 <= c && c <= 0x0e44 ----
+isCons c  = 0x0e01 <= c && c <= 0x0e2f ----
+isTone c  = 0x0e48 <= c && c <= 0x0e4b
+
+getSyllable :: [Int] -> Syllable
+getSyllable = foldl get (Syll [] [] [] [] [] [] False False) where
+  get syll c = case c of
+    0x0e47 -> syll {shorten = True}
+    0x0e4c -> syll {kill = True, finalc = tail (finalc syll)} --- always last
+    0x0e2d 
+      | null (initc syll) -> syll {initc = [c]}  -- "O"
+      | otherwise -> syll {midv = c : midv syll}
+    _ 
+      | isVowel c -> if null (initc syll) 
+                       then syll {initv = c : initv syll}
+                       else syll {midv  = c : midv syll}
+      | isCons c  -> if  null (initc syll) || 
+                        (null (midv syll) && isCluster (initc syll) c) 
+                       then syll {initc  = c : initc syll}
+                       else syll {finalc = c : finalc syll}
+      | isTone c  -> syll {tone = [c]}
+    _ -> syll ---- check this
+
+  isCluster s c = length s == 1 && (c == 0x0e23 || s == [0x0e2b])
+
+-- to test
+
+test1 = testThai "k2wa:mrak"
+test2 = putStrLn $ thaiTable
+test3 = do
+  writeFile  "thai.txt" "Thai Character Coding in GF\nAR 2007\n"
+  appendFile "thai.txt" thaiTable
+test4 = do
+  writeFile  "alphthai.txt" "Thai Characters by Pronunciation\nAR 2007\n"
+  appendFile "alphthai.txt" thaiTableAlph
+
+
+testThai :: String -> IO ()
+testThai s = do
+  putStrLn $ encodeUTF8 $ mkThai s
+  putStrLn $ unwords $ map mkPronSyllable $ words s
+
+testSyllable s =
+  let y = getSyllable $ map mkThaiChar $ unchar s 
+  in
+  putStrLn $ pronSyllable $ trace (show y) y
+
+thaiFile :: FilePath -> Maybe FilePath -> IO ()
+thaiFile f mo = do
+  s <- readFile f
+  let put = maybe putStr writeFile mo
+  put $ encodeUTF8 $ thaiStrings s
+
+thaiPronFile :: FilePath -> Maybe FilePath -> IO ()
+thaiPronFile f mo = do
+  s <- readFile f
+  let put = maybe putStr writeFile mo
+  put $ encodeUTF8 $ thaiPronStrings s
+
+thaiFakeFile :: FilePath -> Maybe FilePath -> IO ()
+thaiFakeFile f mo = do
+  s <- readFile f
+  let put = maybe putStr writeFile mo
+  put $ encodeUTF8 $ (convStrings mkThaiFake) s
+
+finalThai c = maybe "" return (Map.lookup c thaiFinalMap)
+thaiFinalMap = Map.fromList $ zip allThaiCodes finals
+
+classThai c = maybe CLow readClass (Map.lookup c thaiClassMap)
+thaiClassMap = Map.fromList $ zip allThaiCodes heights
+
+readClass s = case s of
+  'L' -> CLow
+  'M' -> CMid
+  'H' -> CHigh
+
+
+thaiTable :: String
+thaiTable = unlines $ ("\n|| hex | thai | trans | pron | fin | class |" ) : [
+  "| " ++ 
+  hex c ++ " | " ++ 
+  encodeUTF8 (showThai s) ++ " | " ++ 
+  s ++ " | " ++ 
+  pronThai s ++ " | " ++
+  [f] ++ " | " ++
+  [q] ++ " | "
+    |
+      (c,q,f,s) <- zip4 allThaiCodes heights finals allThaiTrans
+  ]
+
+thaiTableAlph :: String
+thaiTableAlph = unlines $ ("\n|| pron | thai | trans |" ) : [
+  "| " ++ a ++
+  " | " ++ unwords (map (encodeUTF8 . showThai) ss) ++ 
+  " | " ++ unwords ss ++
+  " |"
+   |
+    (a,ss) <- allProns
+  ]
+ where
+   prons = sort $ nub 
+     [p | s <- allThaiTrans, let p = pronThai s, not (null p),isAlpha (head p)]
+   allProns = 
+     [(a,[s | s <- allThaiTrans, pronThai s == a]) | a <- prons]
+
+showThai s = case s of
+  "-" -> "-"
+---  v:_ | elem v "ivu" -> map (toEnum . mkThaiChar) ["O",s] 
+  _   -> [toEnum $ mkThaiChar s]
+
+
+pronThaiChar = pronThai . recodeThai
+  
+recodeThai c = allThaiTrans !! (c - 0x0e00)
+
+pronThai s = case s of
+  [c,p]
+    | c == 'N' && isDigit p -> [p]
+    | c == 'T' && isDigit p -> ['\'',p]
+    | isDigit p   -> c:"h"
+    | p==':'      -> c:[c]
+    | elem p "%&" -> c:"y"
+    | p=='+'      -> c:"m"
+    | s == "e'"   -> "\228\228"
+    | otherwise   -> [c]
+  "O"             -> "O"
+  "e"             -> "ee"
+  [c] | isUpper c -> "" 
+  _ -> s
+
+hex = map hx . reverse . digs where
+  digs 0 = [0]
+  digs n = n `mod` 16 : digs (n `div` 16)
+  hx d = "0123456789ABCDEF" !! d
+
+heights :: String
+finals  :: String
+heights = 
+  " MHHLLLLMHLLLLMMHLLLMMHLLLMMHHLLLLLL-L-LHHHHLML" ++ replicate 99 ' '
+finals  = 
+  " kkkkkkgt-tt-ntttttntttttnpp--pppmyn-n-wttt-n--" ++ replicate 99 ' '
diff --git a/src-3.0/GF/Text/UTF8.hs b/src-3.0/GF/Text/UTF8.hs
new file mode 100644
index 000000000..5e9687684
--- /dev/null
+++ b/src-3.0/GF/Text/UTF8.hs
@@ -0,0 +1,48 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : UTF8
+-- Maintainer  : AR
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:42 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.5 $
+--
+-- From the Char module supplied with HBC.
+-- code by Thomas Hallgren (Jul 10 1999)
+-----------------------------------------------------------------------------
+
+module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where
+
+-- | Take a Unicode string and encode it as a string
+-- with the UTF8 method.
+decodeUTF8 :: String -> String
+decodeUTF8 "" = ""
+decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
+decodeUTF8 (c:c':cs) | '\xc0' <= c  && c  <= '\xdf' && 
+		      '\x80' <= c' && c' <= '\xbf' =
+	toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
+decodeUTF8 (c:c':c'':cs) | '\xe0' <= c   && c   <= '\xef' && 
+		          '\x80' <= c'  && c'  <= '\xbf' &&
+		          '\x80' <= c'' && c'' <= '\xbf' =
+	toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
+decodeUTF8 s = s ---- AR workaround 22/6/2006
+----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
+
+encodeUTF8 :: String -> String
+encodeUTF8 "" = ""
+encodeUTF8 (c:cs) =
+	if c > '\x0000' && c < '\x0080' then
+	    c : encodeUTF8 cs
+	else if c < toEnum 0x0800 then
+	    let i = fromEnum c
+	    in  toEnum (0xc0 + i `div` 0x40) : 
+	        toEnum (0x80 + i `mod` 0x40) : 
+		encodeUTF8 cs
+	else
+	    let i = fromEnum c
+	    in  toEnum (0xe0 + i `div` 0x1000) : 
+	        toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) : 
+		toEnum (0x80 + i `mod` 0x40) : 
+		encodeUTF8 cs
diff --git a/src-3.0/GF/Text/Unicode.hs b/src-3.0/GF/Text/Unicode.hs
new file mode 100644
index 000000000..9d0b9d1a8
--- /dev/null
+++ b/src-3.0/GF/Text/Unicode.hs
@@ -0,0 +1,69 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Unicode
+-- Maintainer  : AR
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:42 $ 
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.12 $
+--
+-- ad hoc Unicode conversions from different alphabets.
+-- AR 12\/4\/2000, 18\/9\/2001, 30\/5\/2002, 26\/1\/2004
+-----------------------------------------------------------------------------
+
+module GF.Text.Unicode (mkUnicode, treat) where
+
+import GF.Text.Greek (mkGreek)
+import GF.Text.Arabic (mkArabic)
+import GF.Text.Hebrew (mkHebrew)
+import GF.Text.Russian (mkRussian, mkRusKOI8)
+import GF.Text.Ethiopic (mkEthiopic)
+import GF.Text.Tamil (mkTamil)
+import GF.Text.OCSCyrillic (mkOCSCyrillic)
+import GF.Text.LatinASupplement (mkLatinASupplement)
+import GF.Text.Devanagari (mkDevanagari)
+import GF.Text.Hiragana (mkJapanese)
+import GF.Text.ExtendedArabic (mkArabic0600)
+import GF.Text.ExtendedArabic (mkExtendedArabic)
+import GF.Text.ExtraDiacritics (mkExtraDiacritics)
+
+import Data.Char
+
+mkUnicode :: String -> String
+mkUnicode s = case s of
+  '/':'/':cs -> treat [] mkGreek   unic ++ mkUnicode rest
+  '/':'+':cs -> mkHebrew  unic ++ mkUnicode rest
+  '/':'-':cs -> mkArabic  unic ++ mkUnicode rest
+  '/':'_':cs -> treat [] mkRussian unic ++ mkUnicode rest
+  '/':'*':cs -> mkRusKOI8 unic ++ mkUnicode rest
+  '/':'E':cs -> mkEthiopic unic ++ mkUnicode rest
+  '/':'T':cs -> mkTamil unic ++ mkUnicode rest
+  '/':'C':cs -> mkOCSCyrillic unic ++ mkUnicode rest
+  '/':'&':cs -> mkDevanagari unic ++ mkUnicode rest
+  '/':'L':cs -> mkLatinASupplement unic ++ mkUnicode rest
+  '/':'J':cs -> mkJapanese unic ++ mkUnicode rest
+  '/':'6':cs -> mkArabic0600 unic ++ mkUnicode rest
+  '/':'A':cs -> mkExtendedArabic unic ++ mkUnicode rest
+  '/':'X':cs -> mkExtraDiacritics unic ++ mkUnicode rest
+  c:cs -> c:mkUnicode cs
+  _ -> s
+ where
+   (unic,rest) = remClosing [] $ dropWhile isSpace $ drop 2 s
+   remClosing u s = case s of
+     c:'/':s | elem c "/+-_*ETC&LJ6AX" -> (reverse u, s) --- end need not match
+     c:cs -> remClosing (c:u) cs
+     _ -> (reverse u,[]) -- forgiving missing end
+
+-- | don't convert XML tags --- assumes \<\> always means XML tags
+treat :: String -> (String -> String) -> String -> String
+treat old mk s = case s of
+     '<':cs -> mk (reverse old) ++ '<':noTreat cs
+     c:cs -> treat (c:old) mk cs
+     _ -> mk (reverse old)
+    where
+      noTreat s = case s of
+        '>':cs -> '>' : treat [] mk cs
+        c:cs -> c : noTreat cs
+        _ -> s
author	aarne <aarne@cs.chalmers.se>	2008-05-21 09:26:44 +0000
committer	aarne <aarne@cs.chalmers.se>	2008-05-21 09:26:44 +0000
commit	055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch)
tree	0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/Text
parent	915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff)