summaryrefslogtreecommitdiff
path: root/src-3.0/GF/Text/Thai.hs
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
committeraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
commit055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch)
tree0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/Text/Thai.hs
parent915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff)
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
Diffstat (limited to 'src-3.0/GF/Text/Thai.hs')
-rw-r--r--src-3.0/GF/Text/Thai.hs368
1 files changed, 368 insertions, 0 deletions
diff --git a/src-3.0/GF/Text/Thai.hs b/src-3.0/GF/Text/Thai.hs
new file mode 100644
index 000000000..1b186cb3a
--- /dev/null
+++ b/src-3.0/GF/Text/Thai.hs
@@ -0,0 +1,368 @@
+----------------------------------------------------------------------
+-- |
+-- Module : Thai
+-- Maintainer : (Maintainer)
+-- Stability : (experimental)
+-- Portability : (portable)
+--
+--
+-- Thai transliteration and other alphabet information.
+-----------------------------------------------------------------------------
+
+-- AR 27/12/2006. Execute test2 to see the transliteration table.
+
+module GF.Text.Thai (
+ mkThai,mkThaiWord,mkThaiPron,mkThaiFake,thaiFile,thaiPronFile,thaiFakeFile
+ ) where
+
+import qualified Data.Map as Map
+import Data.Char
+
+-- for testing
+import GF.Text.UTF8
+import Data.List
+
+import Debug.Trace
+
+
+mkThai :: String -> String
+mkThai = concat . map mkThaiWord . words
+mkThaiPron = unwords . map mkPronSyllable . words
+mkThaiFake = unwords . map (fakeEnglish . mkPronSyllable) . words
+
+
+type ThaiChar = Char
+
+mkThaiWord :: String -> [ThaiChar]
+mkThaiWord = map (toEnum . mkThaiChar) . unchar . snd . pronAndOrth
+
+mkThaiChar :: String -> Int
+mkThaiChar c = maybe 0 id $ Map.lookup c thaiMap
+
+thaiMap :: Map.Map String Int
+thaiMap = Map.fromList $ zip allThaiTrans allThaiCodes
+
+-- convert all string literals in a text
+
+thaiStrings :: String -> String
+thaiStrings = convStrings mkThai
+
+thaiPronStrings :: String -> String
+thaiPronStrings = convStrings mkThaiPron
+
+convStrings conv s = case s of
+ '"':cs -> let (t,_:r) = span (/='"') cs in
+ '"': conv t ++ "\"" ++ convStrings conv r
+ c:cs -> c : convStrings conv cs
+ _ -> s
+
+
+-- each character is either [letter] or [letter+nonletter]
+
+unchar :: String -> [String]
+unchar s = case s of
+ c:d:cs
+ | isAlpha d -> [c] : unchar (d:cs)
+ | d == '?' -> unchar cs -- use "o?" to represent implicit 'o'
+ | otherwise -> [c,d] : unchar cs
+ [_] -> [s]
+ _ -> []
+
+-- you can prefix transliteration by irregular phonology in []
+
+pronAndOrth :: String -> (Maybe String, String)
+pronAndOrth s = case s of
+ '[':cs -> case span (/=']') cs of
+ (p,_:o) -> (Just p,o)
+ _ -> (Nothing,s)
+ _ -> (Nothing,s)
+
+allThaiTrans :: [String]
+allThaiTrans = words $
+ "- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t' " ++
+ "t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f' " ++
+ "p3 m y r - l - w s- s. s h l' O h' - " ++
+ "a. a a: a+ i i: v v: u u: - - - - - - " ++
+ "e e' o: a% a& L R S T1 T2 T3 T4 K - - - " ++
+ "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - - "
+
+allThaiCodes :: [Int]
+allThaiCodes = [0x0e00 .. 0x0e7f]
+
+
+---------------------
+-- heuristic pronunciation of codes
+---------------------
+
+-- fake English for TTS, a la Teach Yourself Thai
+
+fakeEnglish :: String -> String
+fakeEnglish s = case s of
+ 'a':'a':cs -> "ah" ++ fakeEnglish cs
+ 'a':'y':cs -> "ai" ++ fakeEnglish cs
+ 'a' :cs -> "ah" ++ fakeEnglish cs
+ 'c':'h':cs -> "ch" ++ fakeEnglish cs
+ 'c' :cs -> "j" ++ fakeEnglish cs
+ 'e':'e':cs -> "aih" ++ fakeEnglish cs
+ 'g' :cs -> "ng" ++ fakeEnglish cs
+ 'i':'i':cs -> "ee" ++ fakeEnglish cs
+ 'k':'h':cs -> "k" ++ fakeEnglish cs
+ 'k' :cs -> "g" ++ fakeEnglish cs
+ 'O':'O':cs -> "or" ++ fakeEnglish cs
+ 'O' :cs -> "or" ++ fakeEnglish cs
+ 'o':'o':cs -> "or" ++ fakeEnglish cs
+ 'p':'h':cs -> "p" ++ fakeEnglish cs
+ 'p' :cs -> "b" ++ fakeEnglish cs
+ 't':'h':cs -> "t" ++ fakeEnglish cs
+ 't' :cs -> "d" ++ fakeEnglish cs
+ 'u':'u':cs -> "oo" ++ fakeEnglish cs
+ 'u' :cs -> "oo" ++ fakeEnglish cs
+ 'v':'v':cs -> "eu" ++ fakeEnglish cs
+ 'v' :cs -> "eu" ++ fakeEnglish cs
+ '\228':'\228':cs -> "air" ++ fakeEnglish cs
+ '\228' :cs -> "a" ++ fakeEnglish cs
+ '\246':'\246':cs -> "er" ++ fakeEnglish cs
+ '\246' :cs -> "er" ++ fakeEnglish cs
+ c:cs | isTone c -> fakeEnglish cs
+ c:cs -> c : fakeEnglish cs
+ _ -> s
+ where
+ isTone = flip elem "'`^~"
+
+
+-- this works for one syllable
+
+mkPronSyllable s = case fst $ pronAndOrth s of
+ Just p -> p
+ _ -> pronSyllable $ getSyllable $ map mkThaiChar $ unchar s
+
+data Syllable = Syll {
+ initv :: [Int],
+ initc :: [Int],
+ midv :: [Int],
+ finalc :: [Int],
+ finalv :: [Int],
+ tone :: [Int],
+ shorten :: Bool,
+ kill :: Bool
+ }
+ deriving Show
+
+data Tone = TMid | TLow | THigh | TRise | TFall
+ deriving Show
+
+data CClass = CLow | CMid | CHigh
+ deriving Show
+
+pronSyllable :: Syllable -> String
+pronSyllable s =
+ initCons ++ tonem ++ vowel ++ finalCons
+ where
+
+ vowel = case (initv s, midv s, finalv s, finalc s, shorten s, tone s) of
+ ([0x0e40],[0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:y
+ ([0x0e40],[0x0e2d,0x0e35],_,_,_,_) -> "va" -- e-i:O
+ ([0x0e40],[0x0e30,0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:ya.
+ ([0x0e40],[0x0e30,0x0e2d],_,_,_,_) -> "\246" -- e-Oa.
+ ([0x0e40],[0x0e30,0x0e32],_,_,_,_) -> "O" -- e-a:a. -- open o
+ ([0x0e40],[0x0e2d],_,_,_,_) -> "\246\246" -- e-O
+ ([0x0e40],[0x0e34],_,_,_,_) -> "\246\246" -- e-i
+ ([0x0e40],[0x0e30],_,_,_,_) -> "e" -- e-a.
+ ([0x0e40],[0x0e32],_,_,_,_) -> "aw" -- e-a:
+ ([0x0e40],[],[],[0x0e22],_,_) -> "\246\246y" -- e-y
+ ([0x0e40],[],[],_,True,_) -> "e"
+
+ ([0x0e41],[0x0e30],_,_,_,_) -> "\228" -- รค-a.
+ ([0x0e41],[],[],_,True,_) -> "\228"
+
+ ([0x0e42],[0x0e30],_,_,_,_) -> "o" -- o:-a.
+
+ ([],[0x0e2d],_,[0x0e22],_,_) -> "OOy" -- Oy
+ ([],[0x0e2d],_,_,_,_) -> "OO" -- O
+
+ ([],[],[],_,_,_) -> "o"
+
+ (i,m,f,_,_,_) -> concatMap pronThaiChar (reverse $ f ++ m ++ i) ----
+
+ initCons = concatMap pronThaiChar $ case (reverse $ initc s) of
+ 0x0e2b:cs@(_:_) -> cs -- high h
+ 0x0e2d:cs@(_:_) -> cs -- O
+ cs -> cs
+
+ finalCons =
+ let (c,cs) = splitAt 1 $ finalc s
+ in
+ case c of
+ [] -> []
+ [0x0e22] -> [] --- y
+ [k] -> concatMap pronThaiChar (reverse cs) ++ finalThai k
+
+ iclass = case take 1 (reverse $ initc s) of
+ [c] -> classThai c
+ [] -> CMid -- O
+
+ isLong = not (shorten s) && case vowel of
+ _:_:_ -> True ----
+ _ -> False
+
+ isLive = case finalCons of
+ c | elem c ["n","m","g"] -> True
+ "" -> isLong
+ _ -> False
+
+ tonem = case (iclass,isLive,isLong,tone s) of
+ (_,_,_, [0x0e4a]) -> tHigh
+ (_,_,_, [0x0e4b]) -> tRise
+ (CLow,_,_,[0x0e49]) -> tRise
+ (_,_,_, [0x0e49]) -> tFall
+ (CLow,_,_,[0x0e48]) -> tFall
+ (_, _,_,[0x0e48]) -> tLow
+ (CHigh,True,_,_) -> tRise
+ (_, True,_,_) -> tMid
+ (CLow,False,False,_) -> tHigh
+ (CLow,False,_,_) -> tFall
+ _ -> tLow
+
+(tMid,tHigh,tLow,tRise,tFall) = ("-","'","`","~","^")
+
+isVowel c = 0x0e30 <= c && c <= 0x0e44 ----
+isCons c = 0x0e01 <= c && c <= 0x0e2f ----
+isTone c = 0x0e48 <= c && c <= 0x0e4b
+
+getSyllable :: [Int] -> Syllable
+getSyllable = foldl get (Syll [] [] [] [] [] [] False False) where
+ get syll c = case c of
+ 0x0e47 -> syll {shorten = True}
+ 0x0e4c -> syll {kill = True, finalc = tail (finalc syll)} --- always last
+ 0x0e2d
+ | null (initc syll) -> syll {initc = [c]} -- "O"
+ | otherwise -> syll {midv = c : midv syll}
+ _
+ | isVowel c -> if null (initc syll)
+ then syll {initv = c : initv syll}
+ else syll {midv = c : midv syll}
+ | isCons c -> if null (initc syll) ||
+ (null (midv syll) && isCluster (initc syll) c)
+ then syll {initc = c : initc syll}
+ else syll {finalc = c : finalc syll}
+ | isTone c -> syll {tone = [c]}
+ _ -> syll ---- check this
+
+ isCluster s c = length s == 1 && (c == 0x0e23 || s == [0x0e2b])
+
+-- to test
+
+test1 = testThai "k2wa:mrak"
+test2 = putStrLn $ thaiTable
+test3 = do
+ writeFile "thai.txt" "Thai Character Coding in GF\nAR 2007\n"
+ appendFile "thai.txt" thaiTable
+test4 = do
+ writeFile "alphthai.txt" "Thai Characters by Pronunciation\nAR 2007\n"
+ appendFile "alphthai.txt" thaiTableAlph
+
+
+testThai :: String -> IO ()
+testThai s = do
+ putStrLn $ encodeUTF8 $ mkThai s
+ putStrLn $ unwords $ map mkPronSyllable $ words s
+
+testSyllable s =
+ let y = getSyllable $ map mkThaiChar $ unchar s
+ in
+ putStrLn $ pronSyllable $ trace (show y) y
+
+thaiFile :: FilePath -> Maybe FilePath -> IO ()
+thaiFile f mo = do
+ s <- readFile f
+ let put = maybe putStr writeFile mo
+ put $ encodeUTF8 $ thaiStrings s
+
+thaiPronFile :: FilePath -> Maybe FilePath -> IO ()
+thaiPronFile f mo = do
+ s <- readFile f
+ let put = maybe putStr writeFile mo
+ put $ encodeUTF8 $ thaiPronStrings s
+
+thaiFakeFile :: FilePath -> Maybe FilePath -> IO ()
+thaiFakeFile f mo = do
+ s <- readFile f
+ let put = maybe putStr writeFile mo
+ put $ encodeUTF8 $ (convStrings mkThaiFake) s
+
+finalThai c = maybe "" return (Map.lookup c thaiFinalMap)
+thaiFinalMap = Map.fromList $ zip allThaiCodes finals
+
+classThai c = maybe CLow readClass (Map.lookup c thaiClassMap)
+thaiClassMap = Map.fromList $ zip allThaiCodes heights
+
+readClass s = case s of
+ 'L' -> CLow
+ 'M' -> CMid
+ 'H' -> CHigh
+
+
+thaiTable :: String
+thaiTable = unlines $ ("\n|| hex | thai | trans | pron | fin | class |" ) : [
+ "| " ++
+ hex c ++ " | " ++
+ encodeUTF8 (showThai s) ++ " | " ++
+ s ++ " | " ++
+ pronThai s ++ " | " ++
+ [f] ++ " | " ++
+ [q] ++ " | "
+ |
+ (c,q,f,s) <- zip4 allThaiCodes heights finals allThaiTrans
+ ]
+
+thaiTableAlph :: String
+thaiTableAlph = unlines $ ("\n|| pron | thai | trans |" ) : [
+ "| " ++ a ++
+ " | " ++ unwords (map (encodeUTF8 . showThai) ss) ++
+ " | " ++ unwords ss ++
+ " |"
+ |
+ (a,ss) <- allProns
+ ]
+ where
+ prons = sort $ nub
+ [p | s <- allThaiTrans, let p = pronThai s, not (null p),isAlpha (head p)]
+ allProns =
+ [(a,[s | s <- allThaiTrans, pronThai s == a]) | a <- prons]
+
+showThai s = case s of
+ "-" -> "-"
+--- v:_ | elem v "ivu" -> map (toEnum . mkThaiChar) ["O",s]
+ _ -> [toEnum $ mkThaiChar s]
+
+
+pronThaiChar = pronThai . recodeThai
+
+recodeThai c = allThaiTrans !! (c - 0x0e00)
+
+pronThai s = case s of
+ [c,p]
+ | c == 'N' && isDigit p -> [p]
+ | c == 'T' && isDigit p -> ['\'',p]
+ | isDigit p -> c:"h"
+ | p==':' -> c:[c]
+ | elem p "%&" -> c:"y"
+ | p=='+' -> c:"m"
+ | s == "e'" -> "\228\228"
+ | otherwise -> [c]
+ "O" -> "O"
+ "e" -> "ee"
+ [c] | isUpper c -> ""
+ _ -> s
+
+hex = map hx . reverse . digs where
+ digs 0 = [0]
+ digs n = n `mod` 16 : digs (n `div` 16)
+ hx d = "0123456789ABCDEF" !! d
+
+heights :: String
+finals :: String
+heights =
+ " MHHLLLLMHLLLLMMHLLLMMHLLLMMHHLLLLLL-L-LHHHHLML" ++ replicate 99 ' '
+finals =
+ " kkkkkkgt-tt-ntttttntttttnpp--pppmyn-n-wttt-n--" ++ replicate 99 ' '