summaryrefslogtreecommitdiff
path: root/src/GF/Text
diff options
context:
space:
mode:
authoraarne <unknown>2003-09-22 13:16:55 +0000
committeraarne <unknown>2003-09-22 13:16:55 +0000
commitb1402e8bd6a68a891b00a214d6cf184d66defe19 (patch)
tree90372ac4e53dce91cf949dbf8e93be06f1d9e8bd /src/GF/Text
Founding the newly structured GF2.0 cvs archive.
Diffstat (limited to 'src/GF/Text')
-rw-r--r--src/GF/Text/Arabic.hs48
-rw-r--r--src/GF/Text/Greek.hs158
-rw-r--r--src/GF/Text/Hebrew.hs21
-rw-r--r--src/GF/Text/Russian.hs31
-rw-r--r--src/GF/Text/Text.hs56
-rw-r--r--src/GF/Text/UTF8.hs35
-rw-r--r--src/GF/Text/Unicode.hs24
7 files changed, 373 insertions, 0 deletions
diff --git a/src/GF/Text/Arabic.hs b/src/GF/Text/Arabic.hs
new file mode 100644
index 000000000..6df79c4a9
--- /dev/null
+++ b/src/GF/Text/Arabic.hs
@@ -0,0 +1,48 @@
+module Arabic where
+
+mkArabic :: String -> String
+mkArabic = reverse . unwords . (map mkArabicWord) . words
+--- reverse : assumes everything's on same line
+
+type ArabicChar = Char
+
+mkArabicWord :: String -> [ArabicChar]
+mkArabicWord = map mkArabicChar . getLetterPos
+
+getLetterPos :: String -> [(Char,Int)]
+getLetterPos [] = []
+getLetterPos ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80
+getLetterPos ('O':cs) = ('*',8) : getIn cs -- 0xfe8b
+getLetterPos ('l':'a':cs) = ('*',5) : getLetterPos cs -- 0xfefb
+getLetterPos [c] = [(c,1)] -- 1=isolated
+getLetterPos (c:cs) | isReduced c = (c,1) : getLetterPos cs
+getLetterPos (c:cs) = (c,3) : getIn cs -- 3=initial
+
+
+getIn [] = []
+getIn ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80
+getIn ('O':cs) = ('*',9) : getIn cs -- 0xfe8c
+getIn ('l':'a':cs) = ('*',6) : getLetterPos cs -- 0xfefc
+getIn [c] = [(c,2)] -- 2=final
+getIn (c:cs) | isReduced c = (c,2) : getLetterPos cs
+getIn (c:cs) = (c,4) : getIn cs -- 4=medial
+
+isReduced :: Char -> Bool
+isReduced c = c `elem` "UuWiYOaAdVrzwj"
+
+mkArabicChar ('*',p) | p > 4 && p < 10 =
+ (map toEnum [0xfefb,0xfefc,0xfe80,0xfe8b,0xfe8c]) !! (p-5)
+mkArabicChar cp@(c,p) = case lookup c cc of Just c' -> (c' !! (p-1)) ; _ -> c
+ where
+ cc = mkArabicTab allArabicCodes allArabic
+
+mkArabicTab (c:cs) as = (c,as1) : mkArabicTab cs as2 where
+ (as1,as2) = if isReduced c then splitAt 2 as else splitAt 4 as
+mkArabicTab [] _ = []
+
+allArabicCodes = "UuWiYOabAtvgHCdVrzscSDTZoxfqklmnhwjy"
+
+allArabic :: String
+allArabic = (map toEnum [0xfe81 .. 0xfef4]) -- I=0xfe80
+
+
diff --git a/src/GF/Text/Greek.hs b/src/GF/Text/Greek.hs
new file mode 100644
index 000000000..8cbba8c54
--- /dev/null
+++ b/src/GF/Text/Greek.hs
@@ -0,0 +1,158 @@
+module Greek where
+
+mkGreek :: String -> String
+mkGreek = unwords . (map mkGreekWord) . mkGravis . words
+
+--- TODO : optimize character formation by factorizing the case expressions
+
+type GreekChar = Char
+
+mkGreekWord :: String -> [GreekChar]
+mkGreekWord = map (toEnum . mkGreekChar) . mkGreekSpec
+
+mkGravis :: [String] -> [String]
+mkGravis [] = []
+mkGravis [w] = [w]
+mkGravis (w1:w2:ws)
+ | stressed w2 = mkG w1 : mkGravis (w2:ws)
+ | otherwise = w1 : w2 : mkGravis ws
+ where
+ stressed w = any (`elem` "'~`") w
+ mkG :: String -> String
+ mkG w = let (w1,w2) = span (/='\'') w in
+ case w2 of
+ '\'':v:cs | not (any isVowel cs) -> w1 ++ "`" ++ [v] ++ cs
+ '\'':'!':v:cs | not (any isVowel cs) -> w1 ++ "`!" ++ [v] ++ cs
+ _ -> w
+ isVowel c = elem c "aehiouw"
+
+mkGreekSpec :: String -> [(Char,Int)]
+mkGreekSpec str = case str of
+ [] -> []
+ '(' :'\'': '!' : c : cs -> (c,25) : mkGreekSpec cs
+ '(' :'~' : '!' : c : cs -> (c,27) : mkGreekSpec cs
+ '(' :'`' : '!' : c : cs -> (c,23) : mkGreekSpec cs
+ '(' : '!' : c : cs -> (c,21) : mkGreekSpec cs
+ ')' :'\'': '!' : c : cs -> (c,24) : mkGreekSpec cs
+ ')' :'~' : '!' : c : cs -> (c,26) : mkGreekSpec cs
+ ')' :'`' : '!' : c : cs -> (c,22) : mkGreekSpec cs
+ ')' : '!' : c : cs -> (c,20) : mkGreekSpec cs
+ '\'': '!' : c : cs -> (c,30) : mkGreekSpec cs
+ '~' : '!' : c : cs -> (c,31) : mkGreekSpec cs
+ '`' : '!' : c : cs -> (c,32) : mkGreekSpec cs
+ '!' : c : cs -> (c,33) : mkGreekSpec cs
+ '(' :'\'': c : cs -> (c,5) : mkGreekSpec cs
+ '(' :'~' : c : cs -> (c,7) : mkGreekSpec cs
+ '(' :'`' : c : cs -> (c,3) : mkGreekSpec cs
+ '(' : c : cs -> (c,1) : mkGreekSpec cs
+ ')' :'\'': c : cs -> (c,4) : mkGreekSpec cs
+ ')' :'~' : c : cs -> (c,6) : mkGreekSpec cs
+ ')' :'`' : c : cs -> (c,2) : mkGreekSpec cs
+ ')' : c : cs -> (c,0) : mkGreekSpec cs
+ '\'': c : cs -> (c,10) : mkGreekSpec cs
+ '~' : c : cs -> (c,11) : mkGreekSpec cs
+ '`' : c : cs -> (c,12) : mkGreekSpec cs
+ c : cs -> (c,-1) : mkGreekSpec cs
+
+mkGreekChar (c,-1) = case lookup c cc of Just c' -> c' ; _ -> fromEnum c
+ where
+ cc = zip "abgdezhqiklmnxoprjstyfcuw" allGreekMin
+mkGreekChar (c,n) = case (c,n) of
+ ('a',10) -> 0x03ac
+ ('a',11) -> 0x1fb6
+ ('a',12) -> 0x1f70
+ ('a',30) -> 0x1fb4
+ ('a',31) -> 0x1fb7
+ ('a',32) -> 0x1fb2
+ ('a',33) -> 0x1fb3
+ ('a',n) | n >19 -> 0x1f80 + n - 20
+ ('a',n) -> 0x1f00 + n
+ ('e',10) -> 0x03ad -- '
+-- ('e',11) -> 0x1fb6 -- ~ can't happen
+ ('e',12) -> 0x1f72 -- `
+ ('e',n) -> 0x1f10 + n
+ ('h',10) -> 0x03ae -- '
+ ('h',11) -> 0x1fc6 -- ~
+ ('h',12) -> 0x1f74 -- `
+
+ ('h',30) -> 0x1fc4
+ ('h',31) -> 0x1fc7
+ ('h',32) -> 0x1fc2
+ ('h',33) -> 0x1fc3
+ ('h',n) | n >19 -> 0x1f90 + n - 20
+
+ ('h',n) -> 0x1f20 + n
+ ('i',10) -> 0x03af -- '
+ ('i',11) -> 0x1fd6 -- ~
+ ('i',12) -> 0x1f76 -- `
+ ('i',n) -> 0x1f30 + n
+ ('o',10) -> 0x03cc -- '
+-- ('o',11) -> 0x1fb6 -- ~ can't happen
+ ('o',12) -> 0x1f78 -- `
+ ('o',n) -> 0x1f40 + n
+ ('y',10) -> 0x03cd -- '
+ ('y',11) -> 0x1fe6 -- ~
+ ('y',12) -> 0x1f7a -- `
+ ('y',n) -> 0x1f50 + n
+ ('w',10) -> 0x03ce -- '
+ ('w',11) -> 0x1ff6 -- ~
+ ('w',12) -> 0x1f7c -- `
+
+ ('w',30) -> 0x1ff4
+ ('w',31) -> 0x1ff7
+ ('w',32) -> 0x1ff2
+ ('w',33) -> 0x1ff3
+ ('w',n) | n >19 -> 0x1fa0 + n - 20
+
+ ('w',n) -> 0x1f60 + n
+ ('r',1) -> 0x1fe5
+ _ -> mkGreekChar (c,-1) --- should not happen
+
+allGreekMin :: [Int]
+allGreekMin = [0x03b1 .. 0x03c9]
+
+
+{-
+encoding of Greek writing. Those hard to guess are marked with ---
+
+ maj min
+A a Alpha 0391 03b1
+B b Beta 0392 03b2
+G g Gamma 0393 03b3
+D d Delta 0394 03b4
+E e Epsilon 0395 03b5
+Z z Zeta 0396 03b6
+H h Eta --- 0397 03b7
+Q q Theta --- 0398 03b8
+I i Iota 0399 03b9
+K k Kappa 039a 03ba
+L l Lambda 039b 03bb
+M m My 039c 03bc
+N n Ny 039d 03bd
+X x Xi 039e 03be
+O o Omikron 039f 03bf
+P p Pi 03a0 03c0
+R r Rho 03a1 03c1
+ j Sigma --- 03c2
+S s Sigma 03a3 03c3
+T t Tau 03a4 03c4
+Y y Ypsilon 03a5 03c5
+F f Phi 03a6 03c6
+C c Khi --- 03a7 03c7
+U u Psi 03a8 03c8
+W w Omega --- 03a9 03c9
+
+( spiritus asper
+) spiritus lenis
+! iota subscriptum
+
+' acutus
+~ circumflexus
+` gravis
+
+-}
+
+
+
+
+
diff --git a/src/GF/Text/Hebrew.hs b/src/GF/Text/Hebrew.hs
new file mode 100644
index 000000000..ebcc078e3
--- /dev/null
+++ b/src/GF/Text/Hebrew.hs
@@ -0,0 +1,21 @@
+module Hebrew where
+
+mkHebrew :: String -> String
+mkHebrew = reverse . unwords . (map mkHebrewWord) . words
+--- reverse : assumes everything's on same line
+
+type HebrewChar = Char
+
+mkHebrewWord :: String -> [HebrewChar]
+mkHebrewWord = map mkHebrewChar
+
+mkHebrewChar c = case lookup c cc of Just c' -> c' ; _ -> c
+ where
+ cc = zip allHebrewCodes allHebrew
+
+allHebrewCodes = "-abgdhwzHTyKklMmNnSoPpCcqrst"
+
+allHebrew :: String
+allHebrew = (map toEnum (0x05be : [0x05d0 .. 0x05ea]))
+
+
diff --git a/src/GF/Text/Russian.hs b/src/GF/Text/Russian.hs
new file mode 100644
index 000000000..07605a83a
--- /dev/null
+++ b/src/GF/Text/Russian.hs
@@ -0,0 +1,31 @@
+module Russian where
+
+-- an ad hoc ASCII encoding. Delimiters: /_ _/
+mkRussian :: String -> String
+mkRussian = unwords . (map mkRussianWord) . words
+
+-- the KOI8 encoding, incomplete. Delimiters: /* */
+mkRusKOI8 :: String -> String
+mkRusKOI8 = unwords . (map mkRussianKOI8) . words
+
+type RussianChar = Char
+
+mkRussianWord :: String -> [RussianChar]
+mkRussianWord = map (mkRussianChar allRussianCodes)
+
+mkRussianKOI8 :: String -> [RussianChar]
+mkRussianKOI8 = map (mkRussianChar allRussianKOI8)
+
+mkRussianChar chars c = case lookup c cc of Just c' -> c' ; _ -> c
+ where
+ cc = zip chars allRussian
+
+allRussianCodes =
+ "ÅåABVGDEXZIJKLMNOPRSTUFHCQW£}!*ÖYÄabvgdexzijklmnoprstufhcqw#01'öyä"
+allRussianKOI8 =
+ "^@áâ÷çäåöúéêëìíîïðòóôõæèãþûýøùÿüàñÁÂ×ÇÄÅÖÚÉÊËÌÍÎÏÐÒÓÔÕÆÈÃÞÛÝØÙßÜÀÑ"
+
+allRussian :: String
+allRussian = (map toEnum (0x0401:0x0451:[0x0410 .. 0x044f])) -- Ëë in odd places
+
+
diff --git a/src/GF/Text/Text.hs b/src/GF/Text/Text.hs
new file mode 100644
index 000000000..08e897a9b
--- /dev/null
+++ b/src/GF/Text/Text.hs
@@ -0,0 +1,56 @@
+module Text where
+
+import Operations
+import Char
+
+-- elementary text postprocessing. AR 21/11/2001
+-- This is very primitive indeed. The functions should work on
+-- token lists and not on strings. AR 5/12/2002
+
+
+formatAsTextLit :: String -> String
+formatAsTextLit = formatAsText . unwords . map unStringLit . words
+--- hope that there will be deforestation...
+
+formatAsCodeLit :: String -> String
+formatAsCodeLit = formatAsCode . unwords . map unStringLit . words
+
+formatAsText :: String -> String
+formatAsText = unwords . format . cap . words where
+ format ws = case ws of
+ w : c : ww | major c -> (w ++ c) : format (cap ww)
+ w : c : ww | minor c -> (w ++ c) : format ww
+ c : ww | para c -> "\n\n" : format ww
+ w : ww -> w : format ww
+ [] -> []
+ cap (p:(c:cs):ww) | para p = p : (toUpper c : cs) : ww
+ cap ((c:cs):ww) = (toUpper c : cs) : ww
+ cap [] = []
+ major = flip elem (map singleton ".!?")
+ minor = flip elem (map singleton ",:;")
+ para = (=="<p>")
+
+formatAsCode :: String -> String
+formatAsCode = unwords . format . words where
+ format ws = case ws of
+ p : w : ww | parB p -> format ((p ++ w') : ww') where (w':ww') = format (w:ww)
+ w : p : ww | par p -> format ((w ++ p') : ww') where (p':ww') = format (p:ww)
+ w : ww -> w : format ww
+ [] -> []
+ parB = flip elem (map singleton "([{")
+ parE = flip elem (map singleton "}])")
+ par t = parB t || parE t
+
+performBinds :: String -> String
+performBinds = unwords . format . words where
+ format ws = case ws of
+ w : "&+" : u : ws -> format ((w ++ u) : ws)
+ w : ws -> w : format ws
+ [] -> []
+
+unStringLit :: String -> String
+unStringLit s = case s of
+ c : cs | strlim c && strlim (last cs) -> init cs
+ _ -> s
+ where
+ strlim = (=='\'')
diff --git a/src/GF/Text/UTF8.hs b/src/GF/Text/UTF8.hs
new file mode 100644
index 000000000..57b711b4b
--- /dev/null
+++ b/src/GF/Text/UTF8.hs
@@ -0,0 +1,35 @@
+module UTF8 where
+
+-- From the Char module supplied with HBC.
+-- code by Thomas Hallgren (Jul 10 1999)
+
+-- Take a Unicode string and encode it as a string
+-- with the UTF8 method.
+decodeUTF8 :: String -> String
+decodeUTF8 "" = ""
+decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
+decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' &&
+ '\x80' <= c' && c' <= '\xbf' =
+ toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
+decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' &&
+ '\x80' <= c' && c' <= '\xbf' &&
+ '\x80' <= c'' && c'' <= '\xbf' =
+ toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
+decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
+
+encodeUTF8 :: String -> String
+encodeUTF8 "" = ""
+encodeUTF8 (c:cs) =
+ if c > '\x0000' && c < '\x0080' then
+ c : encodeUTF8 cs
+ else if c < toEnum 0x0800 then
+ let i = fromEnum c
+ in toEnum (0xc0 + i `div` 0x40) :
+ toEnum (0x80 + i `mod` 0x40) :
+ encodeUTF8 cs
+ else
+ let i = fromEnum c
+ in toEnum (0xe0 + i `div` 0x1000) :
+ toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) :
+ toEnum (0x80 + i `mod` 0x40) :
+ encodeUTF8 cs
diff --git a/src/GF/Text/Unicode.hs b/src/GF/Text/Unicode.hs
new file mode 100644
index 000000000..78aba0461
--- /dev/null
+++ b/src/GF/Text/Unicode.hs
@@ -0,0 +1,24 @@
+module Unicode where
+
+import Greek (mkGreek)
+import Arabic (mkArabic)
+import Hebrew (mkHebrew)
+import Russian (mkRussian, mkRusKOI8)
+
+-- ad hoc Unicode conversions from different alphabets
+
+-- AR 12/4/2000, 18/9/2001, 30/5/2002
+
+mkUnicode s = case s of
+ '/':'/':cs -> mkGreek (remClosing cs)
+ '/':'+':cs -> mkHebrew (remClosing cs)
+ '/':'-':cs -> mkArabic (remClosing cs)
+ '/':'_':cs -> mkRussian (remClosing cs)
+ '/':'*':cs -> mkRusKOI8 (remClosing cs)
+ _ -> s
+
+remClosing cs
+ | lcs > 1 && last cs == '/' = take (lcs-2) cs
+ | otherwise = cs
+ where lcs = length cs
+