diff options
| author | krasimir <krasimir@chalmers.se> | 2010-04-19 09:38:36 +0000 |
|---|---|---|
| committer | krasimir <krasimir@chalmers.se> | 2010-04-19 09:38:36 +0000 |
| commit | 6313244eacf992fb10a5091bee28582e84540809 (patch) | |
| tree | 8208fb18a5e1ab9447bd060cf08a3d78ed0a8c0a /src/compiler/GF/Text | |
| parent | 8b5827fc892c2f395ae26f1811da2d4cc3b1669d (diff) | |
use the native unicode support from GHC 6.12
Diffstat (limited to 'src/compiler/GF/Text')
| -rw-r--r-- | src/compiler/GF/Text/CP1250.hs | 91 | ||||
| -rw-r--r-- | src/compiler/GF/Text/CP1251.hs | 86 | ||||
| -rw-r--r-- | src/compiler/GF/Text/CP1252.hs | 17 | ||||
| -rw-r--r-- | src/compiler/GF/Text/CP1254.hs | 84 | ||||
| -rw-r--r-- | src/compiler/GF/Text/Coding.hs | 85 | ||||
| -rw-r--r-- | src/compiler/GF/Text/Lexing.hs | 6 | ||||
| -rw-r--r-- | src/compiler/GF/Text/Transliterations.hs | 2 | ||||
| -rw-r--r-- | src/compiler/GF/Text/UTF8.hs | 48 |
8 files changed, 65 insertions, 354 deletions
diff --git a/src/compiler/GF/Text/CP1250.hs b/src/compiler/GF/Text/CP1250.hs deleted file mode 100644 index 2ed263877..000000000 --- a/src/compiler/GF/Text/CP1250.hs +++ /dev/null @@ -1,91 +0,0 @@ ------------------------------------------------------------------------------
--- |
--- Module : GF.Text.CP1250
--- Maintainer : Krasimir Angelov
---
--- cp1250 is a code page used under Microsoft Windows to represent texts
--- in Central European and Eastern European languages that use Latin script,
--- such as Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian,
--- Serbian (Latin script), Romanian and Albanian. It may also be used with
--- the German language; German-language texts encoded with cp1250 and cp1252
--- are identical.
---
------------------------------------------------------------------------------
-
-module GF.Text.CP1250 where
-
-import Data.Char
-
-decodeCP1250 = map convert where
- convert c
- | c == '\x80' = chr 0x20AC
- | c == '\x82' = chr 0x201A
- | c == '\x84' = chr 0x201E
- | c == '\x85' = chr 0x2026
- | c == '\x86' = chr 0x2020
- | c == '\x87' = chr 0x2021
- | c == '\x89' = chr 0x2030
- | c == '\x8A' = chr 0x0160
- | c == '\x8B' = chr 0x2039
- | c == '\x8C' = chr 0x015A
- | c == '\x8D' = chr 0x0164
- | c == '\x8E' = chr 0x017D
- | c == '\x8F' = chr 0x0179
- | c == '\x91' = chr 0x2018
- | c == '\x92' = chr 0x2019
- | c == '\x93' = chr 0x201C
- | c == '\x94' = chr 0x201D
- | c == '\x95' = chr 0x2022
- | c == '\x96' = chr 0x2013
- | c == '\x97' = chr 0x2014
- | c == '\x99' = chr 0x2122
- | c == '\x9A' = chr 0x0161
- | c == '\x9B' = chr 0x203A
- | c == '\x9C' = chr 0x015B
- | c == '\x9D' = chr 0x0165
- | c == '\x9E' = chr 0x017E
- | c == '\x9F' = chr 0x017A
- | c == '\xA1' = chr 0x02C7
- | c == '\xA5' = chr 0x0104
- | c == '\xB9' = chr 0x0105
- | c == '\xBC' = chr 0x013D
- | c == '\xBE' = chr 0x013E
- | otherwise = c
-
-
-encodeCP1250 = map convert where
- convert c
- | oc == 0x20AC = '\x80'
- | oc == 0x201A = '\x82'
- | oc == 0x201E = '\x84'
- | oc == 0x2026 = '\x85'
- | oc == 0x2020 = '\x86'
- | oc == 0x2021 = '\x87'
- | oc == 0x2030 = '\x89'
- | oc == 0x0160 = '\x8A'
- | oc == 0x2039 = '\x8B'
- | oc == 0x015A = '\x8C'
- | oc == 0x0164 = '\x8D'
- | oc == 0x017D = '\x8E'
- | oc == 0x0179 = '\x8F'
- | oc == 0x2018 = '\x91'
- | oc == 0x2019 = '\x92'
- | oc == 0x201C = '\x93'
- | oc == 0x201D = '\x94'
- | oc == 0x2022 = '\x95'
- | oc == 0x2013 = '\x96'
- | oc == 0x2014 = '\x97'
- | oc == 0x2122 = '\x99'
- | oc == 0x0161 = '\x9A'
- | oc == 0x203A = '\x9B'
- | oc == 0x015B = '\x9C'
- | oc == 0x0165 = '\x9D'
- | oc == 0x017E = '\x9E'
- | oc == 0x017A = '\x9F'
- | oc == 0x02C7 = '\xA1'
- | oc == 0x0104 = '\xA5'
- | oc == 0x0105 = '\xB9'
- | oc == 0x013D = '\xBC'
- | oc == 0x013E = '\xBE'
- | otherwise = c
- where oc = ord c
diff --git a/src/compiler/GF/Text/CP1251.hs b/src/compiler/GF/Text/CP1251.hs deleted file mode 100644 index 8d8ceebf6..000000000 --- a/src/compiler/GF/Text/CP1251.hs +++ /dev/null @@ -1,86 +0,0 @@ ------------------------------------------------------------------------------
--- |
--- Module : GF.Text.CP1251
--- Maintainer : Krasimir Angelov
---
--- cp1251 is a popular 8-bit character encoding, designed to cover languages
--- that use the Cyrillic alphabet such as Russian, Bulgarian, Serbian Cyrillic
--- and other languages. It is the most widely used for encoding the Bulgarian,
--- Serbian and Macedonian languages.
---
------------------------------------------------------------------------------
-
-module GF.Text.CP1251 where
-
-import Data.Char
-
-decodeCP1251 = map convert where
- convert c
- | c >= '\xC0' && c <= '\xFF' = chr (ord c + (0x410-0xC0))
- | c == '\xA8' = chr 0x401 -- cyrillic capital letter lo
- | c == '\x80' = chr 0x402
- | c == '\x81' = chr 0x403
- | c == '\xAA' = chr 0x404
- | c == '\xBD' = chr 0x405
- | c == '\xB2' = chr 0x406
- | c == '\xAF' = chr 0x407
- | c == '\xA3' = chr 0x408
- | c == '\x8A' = chr 0x409
- | c == '\x8C' = chr 0x40A
- | c == '\x8E' = chr 0x40B
- | c == '\x8D' = chr 0x40C
- | c == '\xA1' = chr 0x40E
- | c == '\x8F' = chr 0x40F
- | c == '\xB8' = chr 0x451 -- cyrillic small letter lo
- | c == '\x90' = chr 0x452
- | c == '\x83' = chr 0x453
- | c == '\xBA' = chr 0x454
- | c == '\xBE' = chr 0x455
- | c == '\xB3' = chr 0x456
- | c == '\xBF' = chr 0x457
- | c == '\xBC' = chr 0x458
- | c == '\x9A' = chr 0x459
- | c == '\x9C' = chr 0x45A
- | c == '\x9E' = chr 0x45B
- | c == '\x9D' = chr 0x45C
- | c == '\xA2' = chr 0x45E
- | c == '\x9F' = chr 0x45F
- | c == '\xA5' = chr 0x490
- | c == '\xB4' = chr 0x491
- | otherwise = c
-
-encodeCP1251 = map convert where
- convert c
- | oc >= 0x410 && oc <= 0x44F = chr (oc - (0x410-0xC0))
- | oc == 0x401 = '\xA8' -- cyrillic capital letter lo
- | oc == 0x402 = '\x80'
- | oc == 0x403 = '\x81'
- | oc == 0x404 = '\xAA'
- | oc == 0x405 = '\xBD'
- | oc == 0x406 = '\xB2'
- | oc == 0x407 = '\xAF'
- | oc == 0x408 = '\xA3'
- | oc == 0x409 = '\x8A'
- | oc == 0x40A = '\x8C'
- | oc == 0x40B = '\x8E'
- | oc == 0x40C = '\x8D'
- | oc == 0x40E = '\xA1'
- | oc == 0x40F = '\x8F'
- | oc == 0x451 = '\xB8' -- cyrillic small letter lo
- | oc == 0x452 = '\x90'
- | oc == 0x453 = '\x83'
- | oc == 0x454 = '\xBA'
- | oc == 0x455 = '\xBE'
- | oc == 0x456 = '\xB3'
- | oc == 0x457 = '\xBF'
- | oc == 0x458 = '\xBC'
- | oc == 0x459 = '\x9A'
- | oc == 0x45A = '\x9C'
- | oc == 0x45B = '\x9E'
- | oc == 0x45C = '\x9D'
- | oc == 0x45E = '\xA2'
- | oc == 0x45F = '\x9F'
- | oc == 0x490 = '\xA5'
- | oc == 0x491 = '\xB4'
- | otherwise = c
- where oc = ord c
diff --git a/src/compiler/GF/Text/CP1252.hs b/src/compiler/GF/Text/CP1252.hs deleted file mode 100644 index a1d8ab8f3..000000000 --- a/src/compiler/GF/Text/CP1252.hs +++ /dev/null @@ -1,17 +0,0 @@ ------------------------------------------------------------------------------
--- |
--- Module : GF.Text.CP1252
--- Maintainer : Krasimir Angelov
---
--- cp1252 is a character encoding of the Latin alphabet, used by default in
--- the legacy components of Microsoft Windows in English and some other
--- Western languages.
---
------------------------------------------------------------------------------
-
-module GF.Text.CP1252 where
-
-import Data.Char
-
-decodeCP1252 = map id
-encodeCP1252 = map (\x -> if x <= '\255' then x else '?')
diff --git a/src/compiler/GF/Text/CP1254.hs b/src/compiler/GF/Text/CP1254.hs deleted file mode 100644 index 488359d70..000000000 --- a/src/compiler/GF/Text/CP1254.hs +++ /dev/null @@ -1,84 +0,0 @@ ------------------------------------------------------------------------------
--- |
--- Module : GF.Text.CP1254
--- Maintainer : Krasimir Angelov
---
--- cp1254 is a code page used under Microsoft Windows to write Turkish.
--- Characters with codepoints A0 through FF are compatible with ISO 8859-9.
---
------------------------------------------------------------------------------
-
-module GF.Text.CP1254 where
-
-import Data.Char
-
-decodeCP1254 = map convert where
- convert c
- | c == '\x80' = chr 0x20AC
- | c == '\x82' = chr 0x201A
- | c == '\x83' = chr 0x192
- | c == '\x84' = chr 0x201E
- | c == '\x85' = chr 0x2026
- | c == '\x86' = chr 0x2020
- | c == '\x87' = chr 0x2021
- | c == '\x88' = chr 0x2C6
- | c == '\x89' = chr 0x2030
- | c == '\x8A' = chr 0x160
- | c == '\x8B' = chr 0x2039
- | c == '\x8C' = chr 0x152
- | c == '\x91' = chr 0x2018
- | c == '\x92' = chr 0x2019
- | c == '\x93' = chr 0x201C
- | c == '\x94' = chr 0x201D
- | c == '\x95' = chr 0x2022
- | c == '\x96' = chr 0x2013
- | c == '\x97' = chr 0x2014
- | c == '\x98' = chr 0x2DC
- | c == '\x99' = chr 0x2122
- | c == '\x9A' = chr 0x161
- | c == '\x9B' = chr 0x203A
- | c == '\x9C' = chr 0x153
- | c == '\x9F' = chr 0x178
- | c == '\xD0' = chr 0x11E
- | c == '\xDD' = chr 0x130
- | c == '\xDE' = chr 0x15E
- | c == '\xF0' = chr 0x11F
- | c == '\xFD' = chr 0x131
- | c == '\xFE' = chr 0x15F
- | otherwise = c
-
-encodeCP1254 = map convert where
- convert c
- | oc == 0x20AC = '\x80'
- | oc == 0x201A = '\x82'
- | oc == 0x192 = '\x83'
- | oc == 0x201E = '\x84'
- | oc == 0x2026 = '\x85'
- | oc == 0x2020 = '\x86'
- | oc == 0x2021 = '\x87'
- | oc == 0x2C6 = '\x88'
- | oc == 0x2030 = '\x89'
- | oc == 0x160 = '\x8A'
- | oc == 0x2039 = '\x8B'
- | oc == 0x152 = '\x8C'
- | oc == 0x2018 = '\x91'
- | oc == 0x2019 = '\x92'
- | oc == 0x201C = '\x93'
- | oc == 0x201D = '\x94'
- | oc == 0x2022 = '\x95'
- | oc == 0x2013 = '\x96'
- | oc == 0x2014 = '\x97'
- | oc == 0x2DC = '\x98'
- | oc == 0x2122 = '\x99'
- | oc == 0x161 = '\x9A'
- | oc == 0x203A = '\x9B'
- | oc == 0x153 = '\x9C'
- | oc == 0x178 = '\x9F'
- | oc == 0x11E = '\xD0'
- | oc == 0x130 = '\xDD'
- | oc == 0x15E = '\xDE'
- | oc == 0x11F = '\xF0'
- | oc == 0x131 = '\xFD'
- | oc == 0x15F = '\xFE'
- | otherwise = c
- where oc = ord c
diff --git a/src/compiler/GF/Text/Coding.hs b/src/compiler/GF/Text/Coding.hs index 3481b278d..a206bb4d2 100644 --- a/src/compiler/GF/Text/Coding.hs +++ b/src/compiler/GF/Text/Coding.hs @@ -1,24 +1,69 @@ module GF.Text.Coding where -import GF.Infra.Option -import GF.Text.UTF8 -import GF.Text.CP1250 -import GF.Text.CP1251 -import GF.Text.CP1252 -import GF.Text.CP1254 +import qualified Data.ByteString as BS +import Data.ByteString.Internal +import GHC.IO +import GHC.IO.Buffer +import GHC.IO.Encoding +import GHC.IO.Exception +import Control.Monad -encodeUnicode e = case e of - UTF_8 -> encodeUTF8 - CP_1250 -> encodeCP1250 - CP_1251 -> encodeCP1251 - CP_1252 -> encodeCP1252 - CP_1254 -> encodeCP1254 - _ -> id +encodeUnicode :: TextEncoding -> String -> ByteString +encodeUnicode enc s = + unsafePerformIO $ do + let len = length s + cbuf0 <- newCharBuffer (len*4) ReadBuffer + foldM (\i c -> writeCharBuf (bufRaw cbuf0) i c) 0 s + let cbuf = cbuf0{bufR=len} + case enc of + TextEncoding {mkTextEncoder=mk} -> do encoder <- mk + bss <- translate (encode encoder) cbuf + close encoder + return (BS.concat bss) + where + translate cod cbuf + | i < w = do bbuf <- newByteBuffer 128 WriteBuffer + (cbuf,bbuf) <- cod cbuf bbuf + if isEmptyBuffer bbuf + then ioe_invalidCharacter + else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf) + bss <- translate cod cbuf + return (bs:bss) + | otherwise = return [] + where + i = bufL cbuf + w = bufR cbuf -decodeUnicode e = case e of - UTF_8 -> decodeUTF8 - CP_1250 -> decodeCP1250 - CP_1251 -> decodeCP1251 - CP_1252 -> decodeCP1252 - CP_1254 -> decodeCP1254 - _ -> id +decodeUnicode :: TextEncoding -> ByteString -> String +decodeUnicode enc (PS fptr l len) = + unsafePerformIO $ do + let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len} + cbuf <- newCharBuffer 128 WriteBuffer + case enc of + TextEncoding {mkTextDecoder=mk} -> do decoder <- mk + s <- translate (encode decoder) bbuf cbuf + close decoder + return s + where + translate cod bbuf cbuf + | i < w = do (bbuf,cbuf) <- cod bbuf cbuf + if isEmptyBuffer cbuf + then ioe_invalidCharacter + else unpack cod bbuf cbuf + | otherwise = return [] + where + i = bufL bbuf + w = bufR bbuf + + unpack cod bbuf cbuf + | i < w = do (c,i') <- readCharBuf (bufRaw cbuf) i + cs <- unpack cod bbuf cbuf{bufL=i'} + return (c:cs) + | otherwise = translate cod bbuf cbuf{bufL=0,bufR=0} + where + i = bufL cbuf + w = bufR cbuf + +ioe_invalidCharacter = ioException + (IOError Nothing InvalidArgument "" + ("invalid byte sequence for this encoding") Nothing Nothing) diff --git a/src/compiler/GF/Text/Lexing.hs b/src/compiler/GF/Text/Lexing.hs index a5a2c71eb..ec030e158 100644 --- a/src/compiler/GF/Text/Lexing.hs +++ b/src/compiler/GF/Text/Lexing.hs @@ -1,8 +1,6 @@ module GF.Text.Lexing (stringOp,opInEnv) where import GF.Text.Transliterations -import GF.Text.UTF8 -import GF.Text.CP1251 import Data.Char import Data.List (intersperse) @@ -23,10 +21,6 @@ stringOp name = case name of "unlexmixed" -> Just $ capitInit . appUnlexer (unlexMixed . unquote) "unwords" -> Just $ appUnlexer unwords "to_html" -> Just wrapHTML - "to_utf8" -> Just encodeUTF8 - "from_utf8" -> Just decodeUTF8 - "to_cp1251" -> Just encodeCP1251 - "from_cp1251" -> Just decodeCP1251 _ -> transliterate name -- perform op in environments beg--end, t.ex. between "--" diff --git a/src/compiler/GF/Text/Transliterations.hs b/src/compiler/GF/Text/Transliterations.hs index bd56f5f89..cbe8baf15 100644 --- a/src/compiler/GF/Text/Transliterations.hs +++ b/src/compiler/GF/Text/Transliterations.hs @@ -5,8 +5,6 @@ module GF.Text.Transliterations ( transliterationPrintNames ) where -import GF.Text.UTF8 - import Data.Char import Numeric import qualified Data.Map as Map diff --git a/src/compiler/GF/Text/UTF8.hs b/src/compiler/GF/Text/UTF8.hs deleted file mode 100644 index 5e9687684..000000000 --- a/src/compiler/GF/Text/UTF8.hs +++ /dev/null @@ -1,48 +0,0 @@ ----------------------------------------------------------------------- --- | --- Module : UTF8 --- Maintainer : AR --- Stability : (stable) --- Portability : (portable) --- --- > CVS $Date: 2005/04/21 16:23:42 $ --- > CVS $Author: bringert $ --- > CVS $Revision: 1.5 $ --- --- From the Char module supplied with HBC. --- code by Thomas Hallgren (Jul 10 1999) ------------------------------------------------------------------------------ - -module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where - --- | Take a Unicode string and encode it as a string --- with the UTF8 method. -decodeUTF8 :: String -> String -decodeUTF8 "" = "" -decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs -decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' && - '\x80' <= c' && c' <= '\xbf' = - toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs -decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' && - '\x80' <= c' && c' <= '\xbf' && - '\x80' <= c'' && c'' <= '\xbf' = - toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs -decodeUTF8 s = s ---- AR workaround 22/6/2006 -----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data" - -encodeUTF8 :: String -> String -encodeUTF8 "" = "" -encodeUTF8 (c:cs) = - if c > '\x0000' && c < '\x0080' then - c : encodeUTF8 cs - else if c < toEnum 0x0800 then - let i = fromEnum c - in toEnum (0xc0 + i `div` 0x40) : - toEnum (0x80 + i `mod` 0x40) : - encodeUTF8 cs - else - let i = fromEnum c - in toEnum (0xe0 + i `div` 0x1000) : - toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) : - toEnum (0x80 + i `mod` 0x40) : - encodeUTF8 cs |
