Founding the newly structured GF2.0 cvs archive.

author: aarne <unknown> 2003-09-22 13:16:55 +0000
committer: aarne <unknown> 2003-09-22 13:16:55 +0000
commit: b1402e8bd6a68a891b00a214d6cf184d66defe19 (patch)
tree: 90372ac4e53dce91cf949dbf8e93be06f1d9e8bd /src/GF/Text
7 files changed, 373 insertions, 0 deletions
diff --git a/src/GF/Text/Arabic.hs b/src/GF/Text/Arabic.hs
new file mode 100644
index 000000000..6df79c4a9
--- /dev/null
+++ b/src/GF/Text/Arabic.hs
@@ -0,0 +1,48 @@
+module Arabic where
+
+mkArabic :: String -> String
+mkArabic = reverse . unwords . (map mkArabicWord) . words
+--- reverse : assumes everything's on same line
+
+type ArabicChar = Char
+
+mkArabicWord :: String -> [ArabicChar]
+mkArabicWord = map mkArabicChar . getLetterPos
+
+getLetterPos :: String -> [(Char,Int)]
+getLetterPos []     = []
+getLetterPos ('I':cs)  = ('*',7) : getLetterPos cs           -- 0xfe80
+getLetterPos ('O':cs)  = ('*',8) : getIn cs                  -- 0xfe8b
+getLetterPos ('l':'a':cs) = ('*',5) : getLetterPos cs        -- 0xfefb
+getLetterPos [c]    = [(c,1)]                                -- 1=isolated
+getLetterPos (c:cs) | isReduced c = (c,1) : getLetterPos cs
+getLetterPos (c:cs) = (c,3) : getIn cs                       -- 3=initial
+
+  
+getIn []     = []
+getIn ('I':cs)  = ('*',7) : getLetterPos cs                -- 0xfe80
+getIn ('O':cs)  = ('*',9) : getIn cs                       -- 0xfe8c
+getIn ('l':'a':cs) = ('*',6) : getLetterPos cs             -- 0xfefc
+getIn [c]    = [(c,2)]                                     -- 2=final
+getIn (c:cs) | isReduced c = (c,2) : getLetterPos cs
+getIn (c:cs) = (c,4) : getIn cs                            -- 4=medial
+
+isReduced :: Char -> Bool
+isReduced c = c `elem` "UuWiYOaAdVrzwj"
+
+mkArabicChar ('*',p) | p > 4 && p < 10 = 
+  (map toEnum [0xfefb,0xfefc,0xfe80,0xfe8b,0xfe8c]) !! (p-5)  
+mkArabicChar cp@(c,p) = case lookup c cc of Just c' -> (c' !! (p-1)) ; _ -> c 
+ where 
+   cc = mkArabicTab allArabicCodes allArabic
+
+mkArabicTab (c:cs) as = (c,as1) : mkArabicTab cs as2 where
+  (as1,as2) = if isReduced c then splitAt 2 as else splitAt 4 as
+mkArabicTab [] _ = []
+
+allArabicCodes = "UuWiYOabAtvgHCdVrzscSDTZoxfqklmnhwjy" 
+
+allArabic :: String
+allArabic = (map toEnum [0xfe81 .. 0xfef4]) -- I=0xfe80
+
+
diff --git a/src/GF/Text/Greek.hs b/src/GF/Text/Greek.hs
new file mode 100644
index 000000000..8cbba8c54
--- /dev/null
+++ b/src/GF/Text/Greek.hs
@@ -0,0 +1,158 @@
+module Greek where
+
+mkGreek :: String -> String
+mkGreek = unwords . (map mkGreekWord) . mkGravis . words
+
+--- TODO : optimize character formation by factorizing the case expressions
+
+type GreekChar = Char
+
+mkGreekWord :: String -> [GreekChar]
+mkGreekWord = map (toEnum . mkGreekChar) . mkGreekSpec
+
+mkGravis :: [String] -> [String]
+mkGravis [] = []
+mkGravis [w] = [w]
+mkGravis (w1:w2:ws)
+ | stressed w2 = mkG w1 : mkGravis (w2:ws)
+ | otherwise = w1 : w2 : mkGravis ws
+  where
+    stressed w = any (`elem`  "'~`") w
+    mkG :: String -> String
+    mkG w = let (w1,w2) = span (/='\'') w in
+               case w2 of
+                 '\'':v:cs | not (any isVowel cs) -> w1 ++ "`" ++ [v] ++ cs
+                 '\'':'!':v:cs | not (any isVowel cs) -> w1 ++ "`!" ++ [v] ++ cs
+                 _ -> w
+    isVowel c = elem c "aehiouw"
+
+mkGreekSpec :: String -> [(Char,Int)]
+mkGreekSpec str = case str of
+  [] -> []
+  '(' :'\'': '!' : c : cs -> (c,25)  : mkGreekSpec cs
+  '(' :'~' : '!' : c : cs -> (c,27)  : mkGreekSpec cs
+  '(' :'`' : '!' : c : cs -> (c,23)  : mkGreekSpec cs
+  '(' :      '!' : c : cs -> (c,21)  : mkGreekSpec cs
+  ')' :'\'': '!' : c : cs -> (c,24)  : mkGreekSpec cs
+  ')' :'~' : '!' : c : cs -> (c,26)  : mkGreekSpec cs
+  ')' :'`' : '!' : c : cs -> (c,22)  : mkGreekSpec cs
+  ')' :      '!' : c : cs -> (c,20)  : mkGreekSpec cs
+  '\'':      '!' : c : cs -> (c,30)  : mkGreekSpec cs
+  '~' :      '!' : c : cs -> (c,31)  : mkGreekSpec cs
+  '`' :      '!' : c : cs -> (c,32)  : mkGreekSpec cs
+  '!' :            c : cs -> (c,33)  : mkGreekSpec cs
+  '(' :'\'': c : cs -> (c,5)  : mkGreekSpec cs
+  '(' :'~' : c : cs -> (c,7)  : mkGreekSpec cs
+  '(' :'`' : c : cs -> (c,3)  : mkGreekSpec cs
+  '(' :      c : cs -> (c,1)  : mkGreekSpec cs
+  ')' :'\'': c : cs -> (c,4)  : mkGreekSpec cs
+  ')' :'~' : c : cs -> (c,6)  : mkGreekSpec cs
+  ')' :'`' : c : cs -> (c,2)  : mkGreekSpec cs
+  ')' :      c : cs -> (c,0)  : mkGreekSpec cs
+  '\'':      c : cs -> (c,10) : mkGreekSpec cs
+  '~' :      c : cs -> (c,11) : mkGreekSpec cs
+  '`' :      c : cs -> (c,12) : mkGreekSpec cs
+  c   :          cs -> (c,-1) : mkGreekSpec cs
+
+mkGreekChar (c,-1) = case lookup c cc of Just c' -> c' ; _ -> fromEnum c 
+ where 
+   cc = zip "abgdezhqiklmnxoprjstyfcuw" allGreekMin
+mkGreekChar (c,n) = case (c,n) of
+  ('a',10)        -> 0x03ac 
+  ('a',11)        -> 0x1fb6
+  ('a',12)        -> 0x1f70
+  ('a',30)        -> 0x1fb4
+  ('a',31)        -> 0x1fb7
+  ('a',32)        -> 0x1fb2
+  ('a',33)        -> 0x1fb3
+  ('a',n) | n >19 -> 0x1f80 + n - 20
+  ('a',n)         -> 0x1f00 + n
+  ('e',10)        -> 0x03ad    -- ' 
+--  ('e',11)        -> 0x1fb6    -- ~ can't happen
+  ('e',12)        -> 0x1f72    -- `
+  ('e',n)         -> 0x1f10 + n
+  ('h',10)        -> 0x03ae    -- ' 
+  ('h',11)        -> 0x1fc6    -- ~
+  ('h',12)        -> 0x1f74    -- `
+
+  ('h',30)        -> 0x1fc4
+  ('h',31)        -> 0x1fc7
+  ('h',32)        -> 0x1fc2
+  ('h',33)        -> 0x1fc3
+  ('h',n) | n >19 -> 0x1f90 + n - 20
+
+  ('h',n)         -> 0x1f20 + n
+  ('i',10)        -> 0x03af    -- ' 
+  ('i',11)        -> 0x1fd6    -- ~
+  ('i',12)        -> 0x1f76    -- `
+  ('i',n)         -> 0x1f30 + n
+  ('o',10)        -> 0x03cc    -- ' 
+--  ('o',11)        -> 0x1fb6    -- ~ can't happen
+  ('o',12)        -> 0x1f78    -- `
+  ('o',n)         -> 0x1f40 + n
+  ('y',10)        -> 0x03cd    -- ' 
+  ('y',11)        -> 0x1fe6    -- ~
+  ('y',12)        -> 0x1f7a    -- `
+  ('y',n)         -> 0x1f50 + n
+  ('w',10)        -> 0x03ce    -- ' 
+  ('w',11)        -> 0x1ff6    -- ~
+  ('w',12)        -> 0x1f7c    -- `
+
+  ('w',30)        -> 0x1ff4
+  ('w',31)        -> 0x1ff7
+  ('w',32)        -> 0x1ff2
+  ('w',33)        -> 0x1ff3
+  ('w',n) | n >19 -> 0x1fa0 + n - 20
+
+  ('w',n)         -> 0x1f60 + n
+  ('r',1)         -> 0x1fe5
+  _               -> mkGreekChar (c,-1) --- should not happen
+
+allGreekMin :: [Int]
+allGreekMin = [0x03b1 .. 0x03c9]
+
+
+{-
+encoding of Greek writing. Those hard to guess are marked with ---
+
+               maj   min
+A a Alpha     0391  03b1  
+B b Beta      0392  03b2
+G g Gamma     0393  03b3
+D d Delta     0394  03b4
+E e Epsilon   0395  03b5
+Z z Zeta      0396  03b6
+H h Eta ---   0397  03b7
+Q q Theta --- 0398  03b8
+I i Iota      0399  03b9
+K k Kappa     039a  03ba
+L l Lambda    039b  03bb
+M m My        039c  03bc
+N n Ny        039d  03bd
+X x Xi        039e  03be
+O o Omikron   039f  03bf
+P p Pi        03a0  03c0
+R r Rho       03a1  03c1
+  j Sigma ---       03c2
+S s Sigma     03a3  03c3
+T t Tau       03a4  03c4
+Y y Ypsilon   03a5  03c5
+F f Phi       03a6  03c6
+C c Khi ---   03a7  03c7
+U u Psi       03a8  03c8
+W w Omega --- 03a9  03c9
+
+( spiritus asper
+) spiritus lenis
+! iota subscriptum
+
+' acutus
+~ circumflexus
+` gravis
+
+-}
+
+
+
+
+
diff --git a/src/GF/Text/Hebrew.hs b/src/GF/Text/Hebrew.hs
new file mode 100644
index 000000000..ebcc078e3
--- /dev/null
+++ b/src/GF/Text/Hebrew.hs
@@ -0,0 +1,21 @@
+module Hebrew where
+
+mkHebrew :: String -> String
+mkHebrew = reverse . unwords . (map mkHebrewWord) . words
+--- reverse : assumes everything's on same line
+
+type HebrewChar = Char
+
+mkHebrewWord :: String -> [HebrewChar]
+mkHebrewWord = map mkHebrewChar
+
+mkHebrewChar c = case lookup c cc of Just c' -> c' ; _ -> c 
+ where 
+   cc = zip allHebrewCodes allHebrew
+
+allHebrewCodes = "-abgdhwzHTyKklMmNnSoPpCcqrst"
+
+allHebrew :: String
+allHebrew = (map toEnum (0x05be : [0x05d0 .. 0x05ea]))
+
+
diff --git a/src/GF/Text/Russian.hs b/src/GF/Text/Russian.hs
new file mode 100644
index 000000000..07605a83a
--- /dev/null
+++ b/src/GF/Text/Russian.hs
@@ -0,0 +1,31 @@
+module Russian where
+
+-- an ad hoc ASCII encoding. Delimiters: /_ _/
+mkRussian :: String -> String
+mkRussian = unwords . (map mkRussianWord) . words
+
+-- the KOI8 encoding, incomplete. Delimiters: /* */
+mkRusKOI8 :: String -> String
+mkRusKOI8 = unwords . (map mkRussianKOI8) . words
+
+type RussianChar = Char
+
+mkRussianWord :: String -> [RussianChar]
+mkRussianWord = map (mkRussianChar allRussianCodes)
+
+mkRussianKOI8 :: String -> [RussianChar]
+mkRussianKOI8 = map (mkRussianChar allRussianKOI8)
+
+mkRussianChar chars c = case lookup c cc of Just c' -> c' ; _ -> c 
+ where 
+   cc = zip chars allRussian
+
+allRussianCodes = 
+ "��ABVGDEXZIJKLMNOPRSTUFHCQW�}!*�Y�abvgdexzijklmnoprstufhcqw#01'�y�"
+allRussianKOI8 =
+ "^@����������������������������������������������������������������"
+
+allRussian :: String
+allRussian = (map toEnum (0x0401:0x0451:[0x0410 .. 0x044f])) -- �� in odd places
+
+
diff --git a/src/GF/Text/Text.hs b/src/GF/Text/Text.hs
new file mode 100644
index 000000000..08e897a9b
--- /dev/null
+++ b/src/GF/Text/Text.hs
@@ -0,0 +1,56 @@
+module Text where
+
+import Operations
+import Char
+
+-- elementary text postprocessing. AR 21/11/2001
+-- This is very primitive indeed. The functions should work on
+-- token lists and not on strings. AR 5/12/2002
+
+
+formatAsTextLit :: String -> String
+formatAsTextLit = formatAsText . unwords . map unStringLit . words 
+--- hope that there will be deforestation...
+
+formatAsCodeLit :: String -> String
+formatAsCodeLit = formatAsCode . unwords . map unStringLit . words 
+
+formatAsText :: String -> String
+formatAsText = unwords . format . cap . words where
+  format ws = case ws of
+    w : c : ww | major c -> (w ++ c)      : format (cap ww)
+    w : c : ww | minor c -> (w ++ c)      : format ww
+    c     : ww | para  c -> "\n\n"        : format ww
+    w     : ww           -> w             : format ww
+    [] -> []
+  cap (p:(c:cs):ww) | para p = p : (toUpper c : cs) : ww
+  cap ((c:cs):ww) = (toUpper c : cs) : ww
+  cap [] = []
+  major = flip elem (map singleton ".!?") 
+  minor = flip elem (map singleton ",:;")
+  para  = (=="<p>") 
+
+formatAsCode :: String -> String
+formatAsCode = unwords . format . words where
+  format ws = case ws of
+    p : w : ww | parB p -> format ((p ++ w') : ww') where (w':ww') = format (w:ww)
+    w : p : ww | par  p -> format ((w ++ p') : ww') where (p':ww') = format (p:ww)
+    w     : ww           -> w        : format ww
+    [] -> []
+  parB = flip elem (map singleton "([{")
+  parE = flip elem (map singleton "}])")
+  par t = parB t || parE t
+
+performBinds :: String -> String
+performBinds = unwords . format . words where
+  format ws = case ws of
+    w : "&+" : u : ws -> format ((w ++ u) : ws)
+    w : ws            -> w : format ws
+    []                -> []
+
+unStringLit :: String -> String
+unStringLit s = case s of
+  c : cs | strlim c && strlim (last cs) -> init cs
+  _ -> s
+ where
+   strlim = (=='\'')
diff --git a/src/GF/Text/UTF8.hs b/src/GF/Text/UTF8.hs
new file mode 100644
index 000000000..57b711b4b
--- /dev/null
+++ b/src/GF/Text/UTF8.hs
@@ -0,0 +1,35 @@
+module UTF8 where
+
+-- From the Char module supplied with HBC.
+-- code by Thomas Hallgren (Jul 10 1999)
+
+-- Take a Unicode string and encode it as a string
+-- with the UTF8 method.
+decodeUTF8 :: String -> String
+decodeUTF8 "" = ""
+decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
+decodeUTF8 (c:c':cs) | '\xc0' <= c  && c  <= '\xdf' && 
+		      '\x80' <= c' && c' <= '\xbf' =
+	toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
+decodeUTF8 (c:c':c'':cs) | '\xe0' <= c   && c   <= '\xef' && 
+		          '\x80' <= c'  && c'  <= '\xbf' &&
+		          '\x80' <= c'' && c'' <= '\xbf' =
+	toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
+decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
+
+encodeUTF8 :: String -> String
+encodeUTF8 "" = ""
+encodeUTF8 (c:cs) =
+	if c > '\x0000' && c < '\x0080' then
+	    c : encodeUTF8 cs
+	else if c < toEnum 0x0800 then
+	    let i = fromEnum c
+	    in  toEnum (0xc0 + i `div` 0x40) : 
+	        toEnum (0x80 + i `mod` 0x40) : 
+		encodeUTF8 cs
+	else
+	    let i = fromEnum c
+	    in  toEnum (0xe0 + i `div` 0x1000) : 
+	        toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) : 
+		toEnum (0x80 + i `mod` 0x40) : 
+		encodeUTF8 cs
diff --git a/src/GF/Text/Unicode.hs b/src/GF/Text/Unicode.hs
new file mode 100644
index 000000000..78aba0461
--- /dev/null
+++ b/src/GF/Text/Unicode.hs
@@ -0,0 +1,24 @@
+module Unicode where
+
+import Greek (mkGreek)
+import Arabic (mkArabic)
+import Hebrew (mkHebrew)
+import Russian (mkRussian, mkRusKOI8)
+
+-- ad hoc Unicode conversions from different alphabets
+
+-- AR 12/4/2000, 18/9/2001, 30/5/2002
+
+mkUnicode s = case s of
+ '/':'/':cs -> mkGreek   (remClosing cs)
+ '/':'+':cs -> mkHebrew  (remClosing cs)
+ '/':'-':cs -> mkArabic  (remClosing cs)
+ '/':'_':cs -> mkRussian (remClosing cs)
+ '/':'*':cs -> mkRusKOI8 (remClosing cs)
+ _      -> s
+
+remClosing cs 
+ | lcs > 1 && last cs == '/' = take (lcs-2) cs
+ | otherwise = cs
+    where lcs = length cs
+
author	aarne <unknown>	2003-09-22 13:16:55 +0000
committer	aarne <unknown>	2003-09-22 13:16:55 +0000
commit	b1402e8bd6a68a891b00a214d6cf184d66defe19 (patch)
tree	90372ac4e53dce91cf949dbf8e93be06f1d9e8bd /src/GF/Text