From 6313244eacf992fb10a5091bee28582e84540809 Mon Sep 17 00:00:00 2001 From: krasimir Date: Mon, 19 Apr 2010 09:38:36 +0000 Subject: use the native unicode support from GHC 6.12 --- src/compiler/GF/Text/Coding.hs | 85 ++++++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 20 deletions(-) (limited to 'src/compiler/GF/Text/Coding.hs') diff --git a/src/compiler/GF/Text/Coding.hs b/src/compiler/GF/Text/Coding.hs index 3481b278d..a206bb4d2 100644 --- a/src/compiler/GF/Text/Coding.hs +++ b/src/compiler/GF/Text/Coding.hs @@ -1,24 +1,69 @@ module GF.Text.Coding where -import GF.Infra.Option -import GF.Text.UTF8 -import GF.Text.CP1250 -import GF.Text.CP1251 -import GF.Text.CP1252 -import GF.Text.CP1254 +import qualified Data.ByteString as BS +import Data.ByteString.Internal +import GHC.IO +import GHC.IO.Buffer +import GHC.IO.Encoding +import GHC.IO.Exception +import Control.Monad -encodeUnicode e = case e of - UTF_8 -> encodeUTF8 - CP_1250 -> encodeCP1250 - CP_1251 -> encodeCP1251 - CP_1252 -> encodeCP1252 - CP_1254 -> encodeCP1254 - _ -> id +encodeUnicode :: TextEncoding -> String -> ByteString +encodeUnicode enc s = + unsafePerformIO $ do + let len = length s + cbuf0 <- newCharBuffer (len*4) ReadBuffer + foldM (\i c -> writeCharBuf (bufRaw cbuf0) i c) 0 s + let cbuf = cbuf0{bufR=len} + case enc of + TextEncoding {mkTextEncoder=mk} -> do encoder <- mk + bss <- translate (encode encoder) cbuf + close encoder + return (BS.concat bss) + where + translate cod cbuf + | i < w = do bbuf <- newByteBuffer 128 WriteBuffer + (cbuf,bbuf) <- cod cbuf bbuf + if isEmptyBuffer bbuf + then ioe_invalidCharacter + else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf) + bss <- translate cod cbuf + return (bs:bss) + | otherwise = return [] + where + i = bufL cbuf + w = bufR cbuf -decodeUnicode e = case e of - UTF_8 -> decodeUTF8 - CP_1250 -> decodeCP1250 - CP_1251 -> decodeCP1251 - CP_1252 -> decodeCP1252 - CP_1254 -> decodeCP1254 - _ -> id +decodeUnicode :: TextEncoding -> ByteString -> String +decodeUnicode enc (PS fptr l len) = + unsafePerformIO $ do + let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len} + cbuf <- newCharBuffer 128 WriteBuffer + case enc of + TextEncoding {mkTextDecoder=mk} -> do decoder <- mk + s <- translate (encode decoder) bbuf cbuf + close decoder + return s + where + translate cod bbuf cbuf + | i < w = do (bbuf,cbuf) <- cod bbuf cbuf + if isEmptyBuffer cbuf + then ioe_invalidCharacter + else unpack cod bbuf cbuf + | otherwise = return [] + where + i = bufL bbuf + w = bufR bbuf + + unpack cod bbuf cbuf + | i < w = do (c,i') <- readCharBuf (bufRaw cbuf) i + cs <- unpack cod bbuf cbuf{bufL=i'} + return (c:cs) + | otherwise = translate cod bbuf cbuf{bufL=0,bufR=0} + where + i = bufL cbuf + w = bufR cbuf + +ioe_invalidCharacter = ioException + (IOError Nothing InvalidArgument "" + ("invalid byte sequence for this encoding") Nothing Nothing) -- cgit v1.2.3