From 6313244eacf992fb10a5091bee28582e84540809 Mon Sep 17 00:00:00 2001 From: krasimir Date: Mon, 19 Apr 2010 09:38:36 +0000 Subject: use the native unicode support from GHC 6.12 --- src/compiler/GF/Text/CP1250.hs | 91 -------------------------------- src/compiler/GF/Text/CP1251.hs | 86 ------------------------------ src/compiler/GF/Text/CP1252.hs | 17 ------ src/compiler/GF/Text/CP1254.hs | 84 ----------------------------- src/compiler/GF/Text/Coding.hs | 85 ++++++++++++++++++++++------- src/compiler/GF/Text/Lexing.hs | 6 --- src/compiler/GF/Text/Transliterations.hs | 2 - src/compiler/GF/Text/UTF8.hs | 48 ----------------- 8 files changed, 65 insertions(+), 354 deletions(-) delete mode 100644 src/compiler/GF/Text/CP1250.hs delete mode 100644 src/compiler/GF/Text/CP1251.hs delete mode 100644 src/compiler/GF/Text/CP1252.hs delete mode 100644 src/compiler/GF/Text/CP1254.hs delete mode 100644 src/compiler/GF/Text/UTF8.hs (limited to 'src/compiler/GF/Text') diff --git a/src/compiler/GF/Text/CP1250.hs b/src/compiler/GF/Text/CP1250.hs deleted file mode 100644 index 2ed263877..000000000 --- a/src/compiler/GF/Text/CP1250.hs +++ /dev/null @@ -1,91 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : GF.Text.CP1250 --- Maintainer : Krasimir Angelov --- --- cp1250 is a code page used under Microsoft Windows to represent texts --- in Central European and Eastern European languages that use Latin script, --- such as Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian, --- Serbian (Latin script), Romanian and Albanian. It may also be used with --- the German language; German-language texts encoded with cp1250 and cp1252 --- are identical. --- ------------------------------------------------------------------------------ - -module GF.Text.CP1250 where - -import Data.Char - -decodeCP1250 = map convert where - convert c - | c == '\x80' = chr 0x20AC - | c == '\x82' = chr 0x201A - | c == '\x84' = chr 0x201E - | c == '\x85' = chr 0x2026 - | c == '\x86' = chr 0x2020 - | c == '\x87' = chr 0x2021 - | c == '\x89' = chr 0x2030 - | c == '\x8A' = chr 0x0160 - | c == '\x8B' = chr 0x2039 - | c == '\x8C' = chr 0x015A - | c == '\x8D' = chr 0x0164 - | c == '\x8E' = chr 0x017D - | c == '\x8F' = chr 0x0179 - | c == '\x91' = chr 0x2018 - | c == '\x92' = chr 0x2019 - | c == '\x93' = chr 0x201C - | c == '\x94' = chr 0x201D - | c == '\x95' = chr 0x2022 - | c == '\x96' = chr 0x2013 - | c == '\x97' = chr 0x2014 - | c == '\x99' = chr 0x2122 - | c == '\x9A' = chr 0x0161 - | c == '\x9B' = chr 0x203A - | c == '\x9C' = chr 0x015B - | c == '\x9D' = chr 0x0165 - | c == '\x9E' = chr 0x017E - | c == '\x9F' = chr 0x017A - | c == '\xA1' = chr 0x02C7 - | c == '\xA5' = chr 0x0104 - | c == '\xB9' = chr 0x0105 - | c == '\xBC' = chr 0x013D - | c == '\xBE' = chr 0x013E - | otherwise = c - - -encodeCP1250 = map convert where - convert c - | oc == 0x20AC = '\x80' - | oc == 0x201A = '\x82' - | oc == 0x201E = '\x84' - | oc == 0x2026 = '\x85' - | oc == 0x2020 = '\x86' - | oc == 0x2021 = '\x87' - | oc == 0x2030 = '\x89' - | oc == 0x0160 = '\x8A' - | oc == 0x2039 = '\x8B' - | oc == 0x015A = '\x8C' - | oc == 0x0164 = '\x8D' - | oc == 0x017D = '\x8E' - | oc == 0x0179 = '\x8F' - | oc == 0x2018 = '\x91' - | oc == 0x2019 = '\x92' - | oc == 0x201C = '\x93' - | oc == 0x201D = '\x94' - | oc == 0x2022 = '\x95' - | oc == 0x2013 = '\x96' - | oc == 0x2014 = '\x97' - | oc == 0x2122 = '\x99' - | oc == 0x0161 = '\x9A' - | oc == 0x203A = '\x9B' - | oc == 0x015B = '\x9C' - | oc == 0x0165 = '\x9D' - | oc == 0x017E = '\x9E' - | oc == 0x017A = '\x9F' - | oc == 0x02C7 = '\xA1' - | oc == 0x0104 = '\xA5' - | oc == 0x0105 = '\xB9' - | oc == 0x013D = '\xBC' - | oc == 0x013E = '\xBE' - | otherwise = c - where oc = ord c diff --git a/src/compiler/GF/Text/CP1251.hs b/src/compiler/GF/Text/CP1251.hs deleted file mode 100644 index 8d8ceebf6..000000000 --- a/src/compiler/GF/Text/CP1251.hs +++ /dev/null @@ -1,86 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : GF.Text.CP1251 --- Maintainer : Krasimir Angelov --- --- cp1251 is a popular 8-bit character encoding, designed to cover languages --- that use the Cyrillic alphabet such as Russian, Bulgarian, Serbian Cyrillic --- and other languages. It is the most widely used for encoding the Bulgarian, --- Serbian and Macedonian languages. --- ------------------------------------------------------------------------------ - -module GF.Text.CP1251 where - -import Data.Char - -decodeCP1251 = map convert where - convert c - | c >= '\xC0' && c <= '\xFF' = chr (ord c + (0x410-0xC0)) - | c == '\xA8' = chr 0x401 -- cyrillic capital letter lo - | c == '\x80' = chr 0x402 - | c == '\x81' = chr 0x403 - | c == '\xAA' = chr 0x404 - | c == '\xBD' = chr 0x405 - | c == '\xB2' = chr 0x406 - | c == '\xAF' = chr 0x407 - | c == '\xA3' = chr 0x408 - | c == '\x8A' = chr 0x409 - | c == '\x8C' = chr 0x40A - | c == '\x8E' = chr 0x40B - | c == '\x8D' = chr 0x40C - | c == '\xA1' = chr 0x40E - | c == '\x8F' = chr 0x40F - | c == '\xB8' = chr 0x451 -- cyrillic small letter lo - | c == '\x90' = chr 0x452 - | c == '\x83' = chr 0x453 - | c == '\xBA' = chr 0x454 - | c == '\xBE' = chr 0x455 - | c == '\xB3' = chr 0x456 - | c == '\xBF' = chr 0x457 - | c == '\xBC' = chr 0x458 - | c == '\x9A' = chr 0x459 - | c == '\x9C' = chr 0x45A - | c == '\x9E' = chr 0x45B - | c == '\x9D' = chr 0x45C - | c == '\xA2' = chr 0x45E - | c == '\x9F' = chr 0x45F - | c == '\xA5' = chr 0x490 - | c == '\xB4' = chr 0x491 - | otherwise = c - -encodeCP1251 = map convert where - convert c - | oc >= 0x410 && oc <= 0x44F = chr (oc - (0x410-0xC0)) - | oc == 0x401 = '\xA8' -- cyrillic capital letter lo - | oc == 0x402 = '\x80' - | oc == 0x403 = '\x81' - | oc == 0x404 = '\xAA' - | oc == 0x405 = '\xBD' - | oc == 0x406 = '\xB2' - | oc == 0x407 = '\xAF' - | oc == 0x408 = '\xA3' - | oc == 0x409 = '\x8A' - | oc == 0x40A = '\x8C' - | oc == 0x40B = '\x8E' - | oc == 0x40C = '\x8D' - | oc == 0x40E = '\xA1' - | oc == 0x40F = '\x8F' - | oc == 0x451 = '\xB8' -- cyrillic small letter lo - | oc == 0x452 = '\x90' - | oc == 0x453 = '\x83' - | oc == 0x454 = '\xBA' - | oc == 0x455 = '\xBE' - | oc == 0x456 = '\xB3' - | oc == 0x457 = '\xBF' - | oc == 0x458 = '\xBC' - | oc == 0x459 = '\x9A' - | oc == 0x45A = '\x9C' - | oc == 0x45B = '\x9E' - | oc == 0x45C = '\x9D' - | oc == 0x45E = '\xA2' - | oc == 0x45F = '\x9F' - | oc == 0x490 = '\xA5' - | oc == 0x491 = '\xB4' - | otherwise = c - where oc = ord c diff --git a/src/compiler/GF/Text/CP1252.hs b/src/compiler/GF/Text/CP1252.hs deleted file mode 100644 index a1d8ab8f3..000000000 --- a/src/compiler/GF/Text/CP1252.hs +++ /dev/null @@ -1,17 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : GF.Text.CP1252 --- Maintainer : Krasimir Angelov --- --- cp1252 is a character encoding of the Latin alphabet, used by default in --- the legacy components of Microsoft Windows in English and some other --- Western languages. --- ------------------------------------------------------------------------------ - -module GF.Text.CP1252 where - -import Data.Char - -decodeCP1252 = map id -encodeCP1252 = map (\x -> if x <= '\255' then x else '?') diff --git a/src/compiler/GF/Text/CP1254.hs b/src/compiler/GF/Text/CP1254.hs deleted file mode 100644 index 488359d70..000000000 --- a/src/compiler/GF/Text/CP1254.hs +++ /dev/null @@ -1,84 +0,0 @@ ------------------------------------------------------------------------------ --- | --- Module : GF.Text.CP1254 --- Maintainer : Krasimir Angelov --- --- cp1254 is a code page used under Microsoft Windows to write Turkish. --- Characters with codepoints A0 through FF are compatible with ISO 8859-9. --- ------------------------------------------------------------------------------ - -module GF.Text.CP1254 where - -import Data.Char - -decodeCP1254 = map convert where - convert c - | c == '\x80' = chr 0x20AC - | c == '\x82' = chr 0x201A - | c == '\x83' = chr 0x192 - | c == '\x84' = chr 0x201E - | c == '\x85' = chr 0x2026 - | c == '\x86' = chr 0x2020 - | c == '\x87' = chr 0x2021 - | c == '\x88' = chr 0x2C6 - | c == '\x89' = chr 0x2030 - | c == '\x8A' = chr 0x160 - | c == '\x8B' = chr 0x2039 - | c == '\x8C' = chr 0x152 - | c == '\x91' = chr 0x2018 - | c == '\x92' = chr 0x2019 - | c == '\x93' = chr 0x201C - | c == '\x94' = chr 0x201D - | c == '\x95' = chr 0x2022 - | c == '\x96' = chr 0x2013 - | c == '\x97' = chr 0x2014 - | c == '\x98' = chr 0x2DC - | c == '\x99' = chr 0x2122 - | c == '\x9A' = chr 0x161 - | c == '\x9B' = chr 0x203A - | c == '\x9C' = chr 0x153 - | c == '\x9F' = chr 0x178 - | c == '\xD0' = chr 0x11E - | c == '\xDD' = chr 0x130 - | c == '\xDE' = chr 0x15E - | c == '\xF0' = chr 0x11F - | c == '\xFD' = chr 0x131 - | c == '\xFE' = chr 0x15F - | otherwise = c - -encodeCP1254 = map convert where - convert c - | oc == 0x20AC = '\x80' - | oc == 0x201A = '\x82' - | oc == 0x192 = '\x83' - | oc == 0x201E = '\x84' - | oc == 0x2026 = '\x85' - | oc == 0x2020 = '\x86' - | oc == 0x2021 = '\x87' - | oc == 0x2C6 = '\x88' - | oc == 0x2030 = '\x89' - | oc == 0x160 = '\x8A' - | oc == 0x2039 = '\x8B' - | oc == 0x152 = '\x8C' - | oc == 0x2018 = '\x91' - | oc == 0x2019 = '\x92' - | oc == 0x201C = '\x93' - | oc == 0x201D = '\x94' - | oc == 0x2022 = '\x95' - | oc == 0x2013 = '\x96' - | oc == 0x2014 = '\x97' - | oc == 0x2DC = '\x98' - | oc == 0x2122 = '\x99' - | oc == 0x161 = '\x9A' - | oc == 0x203A = '\x9B' - | oc == 0x153 = '\x9C' - | oc == 0x178 = '\x9F' - | oc == 0x11E = '\xD0' - | oc == 0x130 = '\xDD' - | oc == 0x15E = '\xDE' - | oc == 0x11F = '\xF0' - | oc == 0x131 = '\xFD' - | oc == 0x15F = '\xFE' - | otherwise = c - where oc = ord c diff --git a/src/compiler/GF/Text/Coding.hs b/src/compiler/GF/Text/Coding.hs index 3481b278d..a206bb4d2 100644 --- a/src/compiler/GF/Text/Coding.hs +++ b/src/compiler/GF/Text/Coding.hs @@ -1,24 +1,69 @@ module GF.Text.Coding where -import GF.Infra.Option -import GF.Text.UTF8 -import GF.Text.CP1250 -import GF.Text.CP1251 -import GF.Text.CP1252 -import GF.Text.CP1254 +import qualified Data.ByteString as BS +import Data.ByteString.Internal +import GHC.IO +import GHC.IO.Buffer +import GHC.IO.Encoding +import GHC.IO.Exception +import Control.Monad -encodeUnicode e = case e of - UTF_8 -> encodeUTF8 - CP_1250 -> encodeCP1250 - CP_1251 -> encodeCP1251 - CP_1252 -> encodeCP1252 - CP_1254 -> encodeCP1254 - _ -> id +encodeUnicode :: TextEncoding -> String -> ByteString +encodeUnicode enc s = + unsafePerformIO $ do + let len = length s + cbuf0 <- newCharBuffer (len*4) ReadBuffer + foldM (\i c -> writeCharBuf (bufRaw cbuf0) i c) 0 s + let cbuf = cbuf0{bufR=len} + case enc of + TextEncoding {mkTextEncoder=mk} -> do encoder <- mk + bss <- translate (encode encoder) cbuf + close encoder + return (BS.concat bss) + where + translate cod cbuf + | i < w = do bbuf <- newByteBuffer 128 WriteBuffer + (cbuf,bbuf) <- cod cbuf bbuf + if isEmptyBuffer bbuf + then ioe_invalidCharacter + else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf) + bss <- translate cod cbuf + return (bs:bss) + | otherwise = return [] + where + i = bufL cbuf + w = bufR cbuf -decodeUnicode e = case e of - UTF_8 -> decodeUTF8 - CP_1250 -> decodeCP1250 - CP_1251 -> decodeCP1251 - CP_1252 -> decodeCP1252 - CP_1254 -> decodeCP1254 - _ -> id +decodeUnicode :: TextEncoding -> ByteString -> String +decodeUnicode enc (PS fptr l len) = + unsafePerformIO $ do + let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len} + cbuf <- newCharBuffer 128 WriteBuffer + case enc of + TextEncoding {mkTextDecoder=mk} -> do decoder <- mk + s <- translate (encode decoder) bbuf cbuf + close decoder + return s + where + translate cod bbuf cbuf + | i < w = do (bbuf,cbuf) <- cod bbuf cbuf + if isEmptyBuffer cbuf + then ioe_invalidCharacter + else unpack cod bbuf cbuf + | otherwise = return [] + where + i = bufL bbuf + w = bufR bbuf + + unpack cod bbuf cbuf + | i < w = do (c,i') <- readCharBuf (bufRaw cbuf) i + cs <- unpack cod bbuf cbuf{bufL=i'} + return (c:cs) + | otherwise = translate cod bbuf cbuf{bufL=0,bufR=0} + where + i = bufL cbuf + w = bufR cbuf + +ioe_invalidCharacter = ioException + (IOError Nothing InvalidArgument "" + ("invalid byte sequence for this encoding") Nothing Nothing) diff --git a/src/compiler/GF/Text/Lexing.hs b/src/compiler/GF/Text/Lexing.hs index a5a2c71eb..ec030e158 100644 --- a/src/compiler/GF/Text/Lexing.hs +++ b/src/compiler/GF/Text/Lexing.hs @@ -1,8 +1,6 @@ module GF.Text.Lexing (stringOp,opInEnv) where import GF.Text.Transliterations -import GF.Text.UTF8 -import GF.Text.CP1251 import Data.Char import Data.List (intersperse) @@ -23,10 +21,6 @@ stringOp name = case name of "unlexmixed" -> Just $ capitInit . appUnlexer (unlexMixed . unquote) "unwords" -> Just $ appUnlexer unwords "to_html" -> Just wrapHTML - "to_utf8" -> Just encodeUTF8 - "from_utf8" -> Just decodeUTF8 - "to_cp1251" -> Just encodeCP1251 - "from_cp1251" -> Just decodeCP1251 _ -> transliterate name -- perform op in environments beg--end, t.ex. between "--" diff --git a/src/compiler/GF/Text/Transliterations.hs b/src/compiler/GF/Text/Transliterations.hs index bd56f5f89..cbe8baf15 100644 --- a/src/compiler/GF/Text/Transliterations.hs +++ b/src/compiler/GF/Text/Transliterations.hs @@ -5,8 +5,6 @@ module GF.Text.Transliterations ( transliterationPrintNames ) where -import GF.Text.UTF8 - import Data.Char import Numeric import qualified Data.Map as Map diff --git a/src/compiler/GF/Text/UTF8.hs b/src/compiler/GF/Text/UTF8.hs deleted file mode 100644 index 5e9687684..000000000 --- a/src/compiler/GF/Text/UTF8.hs +++ /dev/null @@ -1,48 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : UTF8 --- Maintainer : AR --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:42 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.5 $ --- --- From the Char module supplied with HBC. --- code by Thomas Hallgren (Jul 10 1999) ------------------------------------------------------------------------------ - -module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where - --- | Take a Unicode string and encode it as a string --- with the UTF8 method. -decodeUTF8 :: String -> String -decodeUTF8 "" = "" -decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs -decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' && - '\x80' <= c' && c' <= '\xbf' = - toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs -decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' && - '\x80' <= c' && c' <= '\xbf' && - '\x80' <= c'' && c'' <= '\xbf' = - toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs -decodeUTF8 s = s ---- AR workaround 22/6/2006 -----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data" - -encodeUTF8 :: String -> String -encodeUTF8 "" = "" -encodeUTF8 (c:cs) = - if c > '\x0000' && c < '\x0080' then - c : encodeUTF8 cs - else if c < toEnum 0x0800 then - let i = fromEnum c - in toEnum (0xc0 + i `div` 0x40) : - toEnum (0x80 + i `mod` 0x40) : - encodeUTF8 cs - else - let i = fromEnum c - in toEnum (0xe0 + i `div` 0x1000) : - toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) : - toEnum (0x80 + i `mod` 0x40) : - encodeUTF8 cs -- cgit v1.2.3