summaryrefslogtreecommitdiff
path: root/src-3.0/GF
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2008-06-15 15:24:11 +0000
committeraarne <aarne@cs.chalmers.se>2008-06-15 15:24:11 +0000
commitc2bbdc8a162adb6e50c66a681b7df1c8fbed3f1e (patch)
tree5da1d5e3b7e7ee00619fd1bd9585f99ad06a84e1 /src-3.0/GF
parent8c3111e36a039b2070e796821216d2ff59e09ee6 (diff)
commands for displaying transliteration tables
Diffstat (limited to 'src-3.0/GF')
-rw-r--r--src-3.0/GF/Command/Commands.hs23
-rw-r--r--src-3.0/GF/Text/Lexing.hs3
-rw-r--r--src-3.0/GF/Text/Transliterations.hs87
3 files changed, 110 insertions, 3 deletions
diff --git a/src-3.0/GF/Command/Commands.hs b/src-3.0/GF/Command/Commands.hs
index cb002f5d7..f71e1611c 100644
--- a/src-3.0/GF/Command/Commands.hs
+++ b/src-3.0/GF/Command/Commands.hs
@@ -22,6 +22,7 @@ import GF.Data.ErrM ----
import PGF.ExprSyntax (readExp)
import GF.Command.Abstract
import GF.Text.Lexing
+import GF.Text.Transliterations
import GF.Data.Operations
@@ -301,21 +302,27 @@ allCommands pgf = Map.fromList [
"string processing functions in the order given in the command line",
"option list. Thus 'ps -f -g s' returns g (f s). Typical string processors",
"are lexers and unlexers, but also character encoding conversions are possible.",
- "The unlexers preserve the division of their input to lines."
+ "The unlexers preserve the division of their input to lines.",
+ "To see transliteration tables, use command ut."
],
examples = [
"l (EAdd 3 4) | ps -code -- linearize code-like output",
"ps -lexer=code | p -cat=Exp -- parse code-like input",
"gr -cat=QCl | l | ps -bind -to_utf8 -- linearization output from LangFin",
- "ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UYF8 terminal"
+ "ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UTF8 terminal",
+ "ps -to_devanagari -to_utf8 \"A-p\" -- show Devanagari in UTF8 terminal"
],
exec = \opts -> return . fromString . stringOps opts . toString,
options = [
("bind","bind tokens separated by Prelude.BIND, i.e. &+"),
+ ("from_devanagari","from unicode to GF Devanagari transliteration"),
+ ("from_thai","from unicode to GF Thai transliteration"),
("from_utf8","decode from utf8"),
("lextext","text-like lexer"),
("lexcode","code-like lexer"),
("lexmixed","mixture of text and code (code between $...$)"),
+ ("to_devanagari","from GF Devanagari transliteration to unicode"),
+ ("to_thai","from GF Thai transliteration to unicode"),
("to_utf8","encode to utf8"),
("unlextext","text-like unlexer"),
("unlexcode","code-like unlexer"),
@@ -370,6 +377,18 @@ allCommands pgf = Map.fromList [
("number","the maximum number of questions")
]
}),
+ ("ut", emptyCommandInfo {
+ longname = "unicode_table",
+ synopsis = "show a transliteration table for a unicode character set",
+ exec = \opts arg -> do
+ let t = concatMap prOpt (take 1 opts)
+ let out = maybe "no such transliteration" characterTable $ transliteration t
+ return $ fromString out,
+ options = [
+ ("devanagari","Devanagari"),
+ ("thai", "Thai")
+ ]
+ }),
("wf", emptyCommandInfo {
longname = "write_file",
synopsis = "send string or tree to a file",
diff --git a/src-3.0/GF/Text/Lexing.hs b/src-3.0/GF/Text/Lexing.hs
index 5ad2a69b7..16391d183 100644
--- a/src-3.0/GF/Text/Lexing.hs
+++ b/src-3.0/GF/Text/Lexing.hs
@@ -1,5 +1,6 @@
module GF.Text.Lexing (stringOp) where
+import GF.Text.Transliterations
import GF.Text.UTF8
import Data.Char
@@ -19,7 +20,7 @@ stringOp name = case name of
"unwords" -> Just $ appUnlexer unwords
"to_utf8" -> Just encodeUTF8
"from_utf8" -> Just decodeUTF8
- _ -> Nothing
+ _ -> transliterate name
appLexer :: (String -> [String]) -> String -> String
appLexer f = unwords . filter (not . null) . f
diff --git a/src-3.0/GF/Text/Transliterations.hs b/src-3.0/GF/Text/Transliterations.hs
new file mode 100644
index 000000000..28f653dcf
--- /dev/null
+++ b/src-3.0/GF/Text/Transliterations.hs
@@ -0,0 +1,87 @@
+module GF.Text.Transliterations (transliterate,transliteration,characterTable) where
+
+import GF.Text.UTF8
+
+import Data.Char
+import qualified Data.Map as Map
+
+transliterate :: String -> Maybe (String -> String)
+transliterate s = case s of
+ 'f':'r':'o':'m':'_':t -> fmap appTransFromUnicode $ transliteration t
+ 't':'o':'_':t -> fmap appTransToUnicode $ transliteration t
+ _ -> Nothing
+
+transliteration :: String -> Maybe Transliteration
+transliteration s = case s of
+ "devanagari" -> Just transDevanagari
+ "thai" -> Just transThai
+ _ -> Nothing
+
+characterTable :: Transliteration -> String
+characterTable = unlines . map prOne . Map.assocs . trans_from_unicode where
+ prOne (i,s) = unwords ["|", show i, "|", encodeUTF8 [toEnum i], "|", s, "|"]
+
+data Transliteration = Trans {
+ trans_to_unicode :: Map.Map String Int,
+ trans_from_unicode :: Map.Map Int String
+ }
+
+appTransToUnicode :: Transliteration -> String -> String
+appTransToUnicode trans =
+ concat .
+ map (\c -> maybe c (return . toEnum) $
+ Map.lookup c (trans_to_unicode trans)
+ ) .
+ unchar
+
+appTransFromUnicode :: Transliteration -> String -> String
+appTransFromUnicode trans =
+ concat .
+ map (maybe "?" id .
+ flip Map.lookup (trans_from_unicode trans)
+ ) .
+ map fromEnum
+
+
+-- conventions:
+-- each character is either [letter] or [letter+nonletter]
+-- when using a sparse range of unicodes, mark missing codes as "-" in transliterations
+
+mkTransliteration :: [String] -> [Int] -> Transliteration
+mkTransliteration ts us = Trans (Map.fromList (tzip ts us)) (Map.fromList (uzip us ts))
+ where
+ tzip ts us = [(t,u) | (t,u) <- zip ts us, t /= "-"]
+ uzip us ts = [(u,t) | (u,t) <- zip us ts, t /= "-"]
+
+
+unchar :: String -> [String]
+unchar s = case s of
+ c:d:cs
+ | isAlpha d -> [c] : unchar (d:cs)
+ | isSpace d -> [c] : unchar cs
+ | otherwise -> [c,d] : unchar cs
+ [_] -> [s]
+ _ -> []
+
+transThai :: Transliteration
+transThai = mkTransliteration allTrans allCodes where
+ allTrans = words $
+ "- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t' " ++
+ "t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f' " ++
+ "p3 m y r - l - w s- s. s h l' O h' - " ++
+ "a. a a: a+ i i: v v: u u: - - - - - - " ++
+ "e e' o: a% a& L R S T1 T2 T3 T4 K - - - " ++
+ "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - - "
+ allCodes = [0x0e00 .. 0x0e7f]
+
+transDevanagari :: Transliteration
+transDevanagari = mkTransliteration allTrans allCodes where
+ allTrans = words $
+ "~ * - - " ++
+ "a- A- i- I- u- U- R- - - - e- E- - - o- O- " ++
+ "k K g G N: c C j J n: t. T. d. D. n. t " ++
+ "T d D n - p P b B m y r - l - - v " ++
+ "S s. s h - - r. - A i I u U R - - " ++
+ "- e E o O "
+ allCodes = [0x0901 .. 0x094c]
+