From 9d7fdf7c9a525a3b5659a566f76d26d151dcd664 Mon Sep 17 00:00:00 2001
From: hallgren <hallgren@chalmers.se>
Date: Mon, 25 Nov 2013 21:12:11 +0000
Subject: Change how GF deals with character encodings in grammar files

1. The default encoding is changed from Latin-1 to UTF-8.

2. Alternate encodings should be specified as "--# -coding=enc". The old
   "flags coding=enc" declarations have no effect but are still checked for
   consistency.

3. A transitional warning is generated for files that contain non-ASCII
   characters without specifying a character encoding:

	"Warning: default encoding has changed from Latin-1 to UTF-8"

4. Conversion to Unicode is now done *before* lexing. This makes it possible
   to allow arbitrary Unicode characters in identifiers. But identifiers are
   still stored as ByteStrings, so they are limited to Latin-1 characters
   for now.

5. Lexer.hs is no longer part of the repository. We now generate the lexer
   from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were
   needed. These bugs might already be fixed in newer versions of alex, but
   we should be compatible with what is shipped in the Haskell Platform.
---
 src/compiler/GF/Text/Coding.hs | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'src/compiler/GF/Text')

diff --git a/src/compiler/GF/Text/Coding.hs b/src/compiler/GF/Text/Coding.hs
index e347730e0..3669733d0 100644
--- a/src/compiler/GF/Text/Coding.hs
+++ b/src/compiler/GF/Text/Coding.hs
@@ -31,7 +31,7 @@ encodeUnicode enc s =
                        (cbuf,bbuf) <- cod cbuf bbuf
 #endif
                        if isEmptyBuffer bbuf
-                         then ioe_invalidCharacter
+                         then ioe_invalidCharacter1
                          else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
                                  bss <- translate cod cbuf
                                  return (bs:bss)
@@ -41,8 +41,9 @@ encodeUnicode enc s =
         w = bufR cbuf
 
 decodeUnicode :: TextEncoding -> ByteString -> String
-decodeUnicode enc (PS fptr l len) =
-  unsafePerformIO $ do
+decodeUnicode enc bs = unsafePerformIO $ decodeUnicodeIO enc bs
+
+decodeUnicodeIO enc (PS fptr l len) = do
     let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len}
     cbuf <- newCharBuffer 128 WriteBuffer
     case enc of
@@ -59,7 +60,7 @@ decodeUnicode enc (PS fptr l len) =
                        (bbuf,cbuf) <- cod bbuf cbuf
 #endif
                        if isEmptyBuffer cbuf
-                         then ioe_invalidCharacter
+                         then ioe_invalidCharacter2
                          else unpack cod bbuf cbuf
       | otherwise = return []
       where
@@ -75,6 +76,10 @@ decodeUnicode enc (PS fptr l len) =
         i = bufL cbuf
         w = bufR cbuf
 
-ioe_invalidCharacter = ioException
+ioe_invalidCharacter1 = ioException
    (IOError Nothing InvalidArgument ""
         ("invalid byte sequence for this encoding") Nothing Nothing)
+
+ioe_invalidCharacter2 = ioException
+   (IOError Nothing InvalidArgument ""
+        ("invalid byte sequence for this decoding") Nothing Nothing)
-- 
cgit v1.2.3