summary refs log tree commit diff
path: root/src/compiler/GF/Text
diff options
context:
space:
mode:
author hallgren <hallgren@chalmers.se> 2013-11-25 21:12:11 +0000
committer hallgren <hallgren@chalmers.se> 2013-11-25 21:12:11 +0000
commit 9d7fdf7c9a525a3b5659a566f76d26d151dcd664 (patch)
tree 9ea97377d9938fc382c2036fa4c8fef9c33e33d8 /src/compiler/GF/Text
parent 3210a506484864430504ed1caf2f547bb674e701 (diff)
Change how GF deals with character encodings in grammar files
1. The default encoding is changed from Latin-1 to UTF-8. 2. Alternate encodings should be specified as "--# -coding=enc"; the old "flags coding=enc" declarations have no effect but are still checked for consistency. 3. A transitional warning is generated for files that contain non-ASCII characters without specifying a character encoding: "Warning: default encoding has changed from Latin-1 to UTF-8". 4. Conversion to Unicode is now done *before* lexing. This makes it possible to allow arbitrary Unicode characters in identifiers. But identifiers are still stored as ByteStrings, so they are limited to Latin-1 characters for now. 5. Lexer.hs is no longer part of the repository. We now generate the lexer from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were needed. These bugs might already be fixed in newer versions of alex, but we should be compatible with what is shipped in the Haskell Platform.
Diffstat (limited to 'src/compiler/GF/Text')
-rw-r--r-- src/compiler/GF/Text/Coding.hs 15
1 files changed, 10 insertions, 5 deletions
diff --git a/src/compiler/GF/Text/Coding.hs b/src/compiler/GF/Text/Coding.hs
index e347730e0..3669733d0 100644
--- a/src/compiler/GF/Text/Coding.hs
+++ b/src/compiler/GF/Text/Coding.hs
@@ -31,7 +31,7 @@ encodeUnicode enc s =
(cbuf,bbuf) <- cod cbuf bbuf
#endif
if isEmptyBuffer bbuf
- then ioe_invalidCharacter
+ then ioe_invalidCharacter1
else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
bss <- translate cod cbuf
return (bs:bss)
@@ -41,8 +41,9 @@ encodeUnicode enc s =
w = bufR cbuf
decodeUnicode :: TextEncoding -> ByteString -> String
-decodeUnicode enc (PS fptr l len) =
- unsafePerformIO $ do
+decodeUnicode enc bs = unsafePerformIO $ decodeUnicodeIO enc bs
+
+decodeUnicodeIO enc (PS fptr l len) = do
let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len}
cbuf <- newCharBuffer 128 WriteBuffer
case enc of
@@ -59,7 +60,7 @@ decodeUnicode enc (PS fptr l len) =
(bbuf,cbuf) <- cod bbuf cbuf
#endif
if isEmptyBuffer cbuf
- then ioe_invalidCharacter
+ then ioe_invalidCharacter2
else unpack cod bbuf cbuf
| otherwise = return []
where
@@ -75,6 +76,10 @@ decodeUnicode enc (PS fptr l len) =
i = bufL cbuf
w = bufR cbuf
-ioe_invalidCharacter = ioException
+ioe_invalidCharacter1 = ioException
(IOError Nothing InvalidArgument ""
("invalid byte sequence for this encoding") Nothing Nothing)
+
+ioe_invalidCharacter2 = ioException
+ (IOError Nothing InvalidArgument ""
+ ("invalid byte sequence for this decoding") Nothing Nothing)