summaryrefslogtreecommitdiff
path: root/src/GF/Text/Unicode.hs
blob: 9d0b9d1a8da05716f8cbbccfceffd5657ae11ab5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
----------------------------------------------------------------------
-- |
-- Module      : Unicode
-- Maintainer  : AR
-- Stability   : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:42 $ 
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.12 $
--
-- ad hoc Unicode conversions from different alphabets.
-- AR 12\/4\/2000, 18\/9\/2001, 30\/5\/2002, 26\/1\/2004
-----------------------------------------------------------------------------

module GF.Text.Unicode (mkUnicode, treat) where

import GF.Text.Greek (mkGreek)
import GF.Text.Arabic (mkArabic)
import GF.Text.Hebrew (mkHebrew)
import GF.Text.Russian (mkRussian, mkRusKOI8)
import GF.Text.Ethiopic (mkEthiopic)
import GF.Text.Tamil (mkTamil)
import GF.Text.OCSCyrillic (mkOCSCyrillic)
import GF.Text.LatinASupplement (mkLatinASupplement)
import GF.Text.Devanagari (mkDevanagari)
import GF.Text.Hiragana (mkJapanese)
import GF.Text.ExtendedArabic (mkArabic0600)
import GF.Text.ExtendedArabic (mkExtendedArabic)
import GF.Text.ExtraDiacritics (mkExtraDiacritics)

import Data.Char

mkUnicode :: String -> String
mkUnicode s = case s of
  '/':'/':cs -> treat [] mkGreek   unic ++ mkUnicode rest
  '/':'+':cs -> mkHebrew  unic ++ mkUnicode rest
  '/':'-':cs -> mkArabic  unic ++ mkUnicode rest
  '/':'_':cs -> treat [] mkRussian unic ++ mkUnicode rest
  '/':'*':cs -> mkRusKOI8 unic ++ mkUnicode rest
  '/':'E':cs -> mkEthiopic unic ++ mkUnicode rest
  '/':'T':cs -> mkTamil unic ++ mkUnicode rest
  '/':'C':cs -> mkOCSCyrillic unic ++ mkUnicode rest
  '/':'&':cs -> mkDevanagari unic ++ mkUnicode rest
  '/':'L':cs -> mkLatinASupplement unic ++ mkUnicode rest
  '/':'J':cs -> mkJapanese unic ++ mkUnicode rest
  '/':'6':cs -> mkArabic0600 unic ++ mkUnicode rest
  '/':'A':cs -> mkExtendedArabic unic ++ mkUnicode rest
  '/':'X':cs -> mkExtraDiacritics unic ++ mkUnicode rest
  c:cs -> c:mkUnicode cs
  _ -> s
 where
   (unic,rest) = remClosing [] $ dropWhile isSpace $ drop 2 s
   remClosing u s = case s of
     c:'/':s | elem c "/+-_*ETC&LJ6AX" -> (reverse u, s) --- end need not match
     c:cs -> remClosing (c:u) cs
     _ -> (reverse u,[]) -- forgiving missing end

-- | don't convert XML tags --- assumes \<\> always means XML tags
treat :: String -> (String -> String) -> String -> String
treat old mk s = case s of
     '<':cs -> mk (reverse old) ++ '<':noTreat cs
     c:cs -> treat (c:old) mk cs
     _ -> mk (reverse old)
    where
      noTreat s = case s of
        '>':cs -> '>' : treat [] mk cs
        c:cs -> c : noTreat cs
        _ -> s