summaryrefslogtreecommitdiff
path: root/src-3.0/GF/Text/Unicode.hs
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
committeraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
commit055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch)
tree0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/Text/Unicode.hs
parent915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff)
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
Diffstat (limited to 'src-3.0/GF/Text/Unicode.hs')
-rw-r--r--src-3.0/GF/Text/Unicode.hs69
1 files changed, 69 insertions, 0 deletions
diff --git a/src-3.0/GF/Text/Unicode.hs b/src-3.0/GF/Text/Unicode.hs
new file mode 100644
index 000000000..9d0b9d1a8
--- /dev/null
+++ b/src-3.0/GF/Text/Unicode.hs
@@ -0,0 +1,69 @@
+----------------------------------------------------------------------
+-- |
+-- Module : Unicode
+-- Maintainer : AR
+-- Stability : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/04/21 16:23:42 $
+-- > CVS $Author: bringert $
+-- > CVS $Revision: 1.12 $
+--
+-- ad hoc Unicode conversions from different alphabets.
+-- AR 12\/4\/2000, 18\/9\/2001, 30\/5\/2002, 26\/1\/2004
+-----------------------------------------------------------------------------
+
+module GF.Text.Unicode (mkUnicode, treat) where
+
+import GF.Text.Greek (mkGreek)
+import GF.Text.Arabic (mkArabic)
+import GF.Text.Hebrew (mkHebrew)
+import GF.Text.Russian (mkRussian, mkRusKOI8)
+import GF.Text.Ethiopic (mkEthiopic)
+import GF.Text.Tamil (mkTamil)
+import GF.Text.OCSCyrillic (mkOCSCyrillic)
+import GF.Text.LatinASupplement (mkLatinASupplement)
+import GF.Text.Devanagari (mkDevanagari)
+import GF.Text.Hiragana (mkJapanese)
+import GF.Text.ExtendedArabic (mkArabic0600)
+import GF.Text.ExtendedArabic (mkExtendedArabic)
+import GF.Text.ExtraDiacritics (mkExtraDiacritics)
+
+import Data.Char
+
+mkUnicode :: String -> String
+mkUnicode s = case s of
+ '/':'/':cs -> treat [] mkGreek unic ++ mkUnicode rest
+ '/':'+':cs -> mkHebrew unic ++ mkUnicode rest
+ '/':'-':cs -> mkArabic unic ++ mkUnicode rest
+ '/':'_':cs -> treat [] mkRussian unic ++ mkUnicode rest
+ '/':'*':cs -> mkRusKOI8 unic ++ mkUnicode rest
+ '/':'E':cs -> mkEthiopic unic ++ mkUnicode rest
+ '/':'T':cs -> mkTamil unic ++ mkUnicode rest
+ '/':'C':cs -> mkOCSCyrillic unic ++ mkUnicode rest
+ '/':'&':cs -> mkDevanagari unic ++ mkUnicode rest
+ '/':'L':cs -> mkLatinASupplement unic ++ mkUnicode rest
+ '/':'J':cs -> mkJapanese unic ++ mkUnicode rest
+ '/':'6':cs -> mkArabic0600 unic ++ mkUnicode rest
+ '/':'A':cs -> mkExtendedArabic unic ++ mkUnicode rest
+ '/':'X':cs -> mkExtraDiacritics unic ++ mkUnicode rest
+ c:cs -> c:mkUnicode cs
+ _ -> s
+ where
+ (unic,rest) = remClosing [] $ dropWhile isSpace $ drop 2 s
+ remClosing u s = case s of
+ c:'/':s | elem c "/+-_*ETC&LJ6AX" -> (reverse u, s) --- end need not match
+ c:cs -> remClosing (c:u) cs
+ _ -> (reverse u,[]) -- forgiving missing end
+
+-- | don't convert XML tags --- assumes \<\> always means XML tags
+treat :: String -> (String -> String) -> String -> String
+treat old mk s = case s of
+ '<':cs -> mk (reverse old) ++ '<':noTreat cs
+ c:cs -> treat (c:old) mk cs
+ _ -> mk (reverse old)
+ where
+ noTreat s = case s of
+ '>':cs -> '>' : treat [] mk cs
+ c:cs -> c : noTreat cs
+ _ -> s