summaryrefslogtreecommitdiff
path: root/src/GF/Text/Transliterations.hs
blob: e77d30fc4442e46667e1b71a1ed6b4a776fbf348 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
module GF.Text.Transliterations (transliterate,transliteration,characterTable) where

import GF.Text.UTF8

import Data.Char
import Numeric
import qualified Data.Map as Map

-- transliterations between ASCII and a Unicode character set

-- current transliterations: devanagari, thai

-- to add a new one: define the Unicode range and the corresponding ASCII strings,
-- which may be one or two characters long

-- conventions to be followed: 
--   each character is either [letter] or [letter+nonletter]
--   when using a sparse range of unicodes, mark missing codes as "-" in transliterations
--   characters can be invisible: ignored in translation to unicode

transliterate :: String -> Maybe (String -> String)
transliterate s = case s of
  'f':'r':'o':'m':'_':t -> fmap appTransFromUnicode $ transliteration t
  't':'o':'_':t -> fmap appTransToUnicode $ transliteration t
  _ -> Nothing

transliteration :: String -> Maybe Transliteration
transliteration s = case s of
  "arabic" -> Just transArabic
  "devanagari" -> Just transDevanagari
  "telugu" -> Just transTelugu
  "thai" -> Just transThai
----  "urdu" -> Just transUrdu
  _ -> Nothing

characterTable :: Transliteration -> String
characterTable = unlines . map prOne . Map.assocs . trans_from_unicode where
  prOne (i,s) = unwords ["|", showHex i "", "|", [toEnum i], "|", s, "|"]

data Transliteration = Trans {
  trans_to_unicode   :: Map.Map String Int,
  trans_from_unicode :: Map.Map Int String,
  invisible_chars    :: [String]
  }

appTransToUnicode :: Transliteration -> String -> String
appTransToUnicode trans = 
  concat .
  map (\c -> maybe c (return . toEnum) $
             Map.lookup c (trans_to_unicode trans)
      ) . 
  filter (flip notElem (invisible_chars trans)) . 
  unchar

appTransFromUnicode :: Transliteration -> String -> String
appTransFromUnicode trans = 
  concat .
  map (maybe "?" id . 
       flip Map.lookup (trans_from_unicode trans)
      ) . 
  map fromEnum


mkTransliteration :: [String] -> [Int] -> Transliteration
mkTransliteration ts us = Trans (Map.fromList (tzip ts us)) (Map.fromList (uzip us ts)) []
  where
    tzip ts us = [(t,u) | (t,u) <- zip ts us, t /= "-"]
    uzip us ts = [(u,t) | (u,t) <- zip us ts, t /= "-"]


unchar :: String -> [String]
unchar s = case s of
  c:d:cs 
   | isAlpha d -> [c]    : unchar (d:cs)
   | isSpace d -> [c]:[d]: unchar cs
   | otherwise -> [c,d]  : unchar cs
  [_]          -> [s]
  _            -> []

transThai :: Transliteration
transThai = mkTransliteration allTrans allCodes where
  allTrans = words $
    "-  k  k1 -  k2 -  k3 g  c  c1 c2 s' c3 y' d' t' " ++
    "t1 t2 t3 n' d  t  t4 t5 t6 n  b  p  p1 f  p2 f' " ++
    "p3 m  y  r  -  l  -  w  s- s. s  h  l' O  h' -  " ++
    "a. a  a: a+ i  i: v  v: u  u: -  -  -  -  -  -  " ++
    "e  e' o: a% a& L  R  S  T1 T2 T3 T4 K  -  -  -  " ++
    "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 -  -  -  -  -  -  "
  allCodes = [0x0e00 .. 0x0e7f]

transDevanagari :: Transliteration
transDevanagari = 
  (mkTransliteration allTransUrduHindi allCodes){invisible_chars = ["a"]} where
    allCodes = [0x0901 .. 0x094c]

allTransUrduHindi = words $
    "M  N  -  - " ++
    "a- A- i- I- u- U- R- -  -  -  e- E- -  -  o- O- " ++
    "k  K  g  G  N: c  C  j  J  n: t. T. d. D. n. t  " ++
    "T  d  D  n  -  p  P  b  B  m  y  r  -  l  -  -  v  " ++
    "S  s. s  h  -  -  r. -  A  i  I  u  U  R  -  -  " ++
    "-  e  E  o  O "

transUrdu :: Transliteration
transUrdu = 
  (mkTransliteration allTransUrduHindi allCodes){invisible_chars = ["a"]} where
    allCodes = [0x0901 .. 0x094c] ---- TODO: this is devanagari

transArabic :: Transliteration
transArabic = mkTransliteration allTrans allCodes where
  allTrans = words $
    "   V  A: A? w? A- y? A  b  t. t  v  g  H  K  d " ++  -- 0621 - 062f
    "W  r  z  s  C S  D  T  Z  c  G "                ++  -- 0630 - 063a
    "   f  q  k  l  m  n  h  w  y. y a. u. i. a  u " ++  -- 0641 - 064f
    "i  v2 o  a: V+ V- i: a+"                             -- 0650 - 0657 
  allCodes = [0x0621..0x062f] ++ [0x0630..0x063a] ++ 
             [0x0641..0x064f] ++ [0x0650..0x0657]

transTelugu :: Transliteration
transTelugu = mkTransliteration allTrans allCodes where
  allTrans = words $
    "-  c1 c2 c3 -  A  A: I  I: U  U: R_ L_ -  E  E: " ++
    "A' -  O  O: A_ k  k. g  g. n. c  c. j  j. n' T  " ++
    "T. d  d. N  t  t. d  d. n  -  p  p. b  b. m  y  " ++
    "r  R  l  L  -  v  s' S  s  h  -  -  -  c5 a: i  " ++
    "i: u  u: r_ r. -  e  e: a' -  o  o: a_ c6 -  -  " ++
    "-  -  -  -  -  c7 c8 z  Z  -  -  -  -  -  -  -  " ++
    "R+ L+ l+ l* -  -  n0 n1 n2 n3 n4 n5 n6 n7 n8 n9 "
  allCodes = [0x0c00 .. 0x0c7f]