diff options
| author | hallgren <hallgren@chalmers.se> | 2013-11-25 21:12:11 +0000 |
|---|---|---|
| committer | hallgren <hallgren@chalmers.se> | 2013-11-25 21:12:11 +0000 |
| commit | 9d7fdf7c9a525a3b5659a566f76d26d151dcd664 (patch) | |
| tree | 9ea97377d9938fc382c2036fa4c8fef9c33e33d8 /src/compiler/GF/Text | |
| parent | 3210a506484864430504ed1caf2f547bb674e701 (diff) | |
Change how GF deals with character encodings in grammar files
1. The default encoding is changed from Latin-1 to UTF-8.
2. Alternate encodings should be specified as "--# -coding=enc"; the old
"flags coding=enc" declarations have no effect but are still checked for
consistency.
3. A transitional warning is generated for files that contain non-ASCII
characters without specifying a character encoding:
"Warning: default encoding has changed from Latin-1 to UTF-8"
4. Conversion to Unicode is now done *before* lexing. This makes it possible
to allow arbitrary Unicode characters in identifiers. But identifiers are
still stored as ByteStrings, so they are limited to Latin-1 characters
for now.
5. Lexer.hs is no longer part of the repository. We now generate the lexer
from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were
needed. These bugs might already be fixed in newer versions of alex, but
we should be compatible with what is shipped in the Haskell Platform.
Diffstat (limited to 'src/compiler/GF/Text')
| -rw-r--r-- | src/compiler/GF/Text/Coding.hs | 15 |
1 files changed, 10 insertions, 5 deletions
diff --git a/src/compiler/GF/Text/Coding.hs b/src/compiler/GF/Text/Coding.hs index e347730e0..3669733d0 100644 --- a/src/compiler/GF/Text/Coding.hs +++ b/src/compiler/GF/Text/Coding.hs @@ -31,7 +31,7 @@ encodeUnicode enc s = (cbuf,bbuf) <- cod cbuf bbuf #endif if isEmptyBuffer bbuf - then ioe_invalidCharacter + then ioe_invalidCharacter1 else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf) bss <- translate cod cbuf return (bs:bss) @@ -41,8 +41,9 @@ encodeUnicode enc s = w = bufR cbuf decodeUnicode :: TextEncoding -> ByteString -> String -decodeUnicode enc (PS fptr l len) = - unsafePerformIO $ do +decodeUnicode enc bs = unsafePerformIO $ decodeUnicodeIO enc bs + +decodeUnicodeIO enc (PS fptr l len) = do let bbuf = Buffer{bufRaw=fptr, bufState=ReadBuffer, bufSize=len, bufL=l, bufR=l+len} cbuf <- newCharBuffer 128 WriteBuffer case enc of @@ -59,7 +60,7 @@ decodeUnicode enc (PS fptr l len) = (bbuf,cbuf) <- cod bbuf cbuf #endif if isEmptyBuffer cbuf - then ioe_invalidCharacter + then ioe_invalidCharacter2 else unpack cod bbuf cbuf | otherwise = return [] where @@ -75,6 +76,10 @@ decodeUnicode enc (PS fptr l len) = i = bufL cbuf w = bufR cbuf -ioe_invalidCharacter = ioException +ioe_invalidCharacter1 = ioException (IOError Nothing InvalidArgument "" ("invalid byte sequence for this encoding") Nothing Nothing) + +ioe_invalidCharacter2 = ioException + (IOError Nothing InvalidArgument "" + ("invalid byte sequence for this decoding") Nothing Nothing) |
