summary refs log tree commit diff
path: root/src/runtime
diff options
context:
space:
mode:
author krasimir <krasimir@chalmers.se> 2015-05-06 15:51:12 +0000
committer krasimir <krasimir@chalmers.se> 2015-05-06 15:51:12 +0000
commit 01c4836d4d2560347786175e8a4af5baa1bd23e2 (patch)
tree 7410e51d67256ff42b2981a78b4d83ba0f0b0d5a /src/runtime
parent c1265db3a71e7c2708c56a7748eea5490c11cd2c (diff)
fix the UTF8 implementation in libgu
Diffstat (limited to 'src/runtime')
-rw-r--r-- src/runtime/c/gu/utf8.c 148
-rw-r--r-- src/runtime/c/gu/utf8.h 5
2 files changed, 82 insertions, 71 deletions
diff --git a/src/runtime/c/gu/utf8.c b/src/runtime/c/gu/utf8.c
index 8f22e5823..cd198a83d 100644
--- a/src/runtime/c/gu/utf8.c
+++ b/src/runtime/c/gu/utf8.c
@@ -33,20 +33,21 @@ gu_in_utf8_(GuIn* in, GuExn* err)
if (!gu_ok(err)) {
return 0;
}
- int len = (c < 0x80 ? 0 :
- c < 0xc2 ? -1 :
- c < 0xe0 ? 1 :
- c < 0xf0 ? 2 :
- c < 0xf5 ? 3 :
- -1);
- if (len < 0) {
- goto fail;
- } else if (len == 0) {
+ if (c < 0x80) {
return c;
+ }
+ if (c < 0xc2) {
+ goto fail;
}
- static const uint8_t mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };
- uint32_t u = c & mask[len];
- uint8_t buf[3];
+ int len = (c < 0xe0 ? 1 :
+ c < 0xf0 ? 2 :
+ c < 0xf8 ? 3 :
+ c < 0xfc ? 4 :
+ 5
+ );
+ uint64_t mask = 0x0103070F1f7f;
+ uint32_t u = c & (mask >> (len * 8));
+ uint8_t buf[5];
// If reading the extra bytes causes EOF, it is an encoding
// error, not a legitimate end of character stream.
gu_in_bytes(in, buf, len, err);
@@ -78,55 +79,6 @@ fail:
extern inline GuUCS
gu_in_utf8(GuIn* in, GuExn* err);
-static size_t
-gu_advance_utf8(GuUCS ucs, uint8_t* buf)
-{
- gu_require(gu_ucs_valid(ucs));
- if (ucs < 0x80) {
- buf[0] = (uint8_t) ucs;
- return 1;
- } else if (ucs < 0x800) {
- buf[0] = 0xc0 | (ucs >> 6);
- buf[1] = 0x80 | (ucs & 0x3f);
- return 2;
- } else if (ucs < 0x10000) {
- buf[0] = 0xe0 | (ucs >> 12);
- buf[1] = 0x80 | ((ucs >> 6) & 0x3f);
- buf[2] = 0x80 | (ucs & 0x3f);
- return 3;
- } else {
- buf[0] = 0xf0 | (ucs >> 18);
- buf[1] = 0x80 | ((ucs >> 12) & 0x3f);
- buf[2] = 0x80 | ((ucs >> 6) & 0x3f);
- buf[3] = 0x80 | (ucs & 0x3f);
- return 4;
- }
-}
-
-
-void
-gu_out_utf8_(GuUCS ucs, GuOut* out, GuExn* err)
-{
- uint8_t buf[4];
- size_t sz = gu_advance_utf8(ucs, buf);
- switch (sz) {
- case 2:
- gu_out_bytes(out, buf, 2, err);
- break;
- case 3:
- gu_out_bytes(out, buf, 3, err);
- break;
- case 4:
- gu_out_bytes(out, buf, 4, err);
- break;
- default:
- gu_impossible();
- }
-}
-
-extern inline void
-gu_out_utf8(GuUCS ucs, GuOut* out, GuExn* err);
-
void
gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err)
{
@@ -137,18 +89,21 @@ gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err)
return;
}
*(p++) = c;
- int len = (c < 0x80 ? 0 :
- c < 0xc2 ? -1 :
- c < 0xe0 ? 1 :
- c < 0xf0 ? 2 :
- c < 0xf5 ? 3 :
- -1);
- if (len < 0) {
- goto fail;
- } else if (len == 0) {
+
+ if (c < 0x80) {
*buf = p;
return;
}
+ if (c < 0xc2) {
+ goto fail;
+ }
+
+ int len = (c < 0xe0 ? 1 :
+ c < 0xf0 ? 2 :
+ c < 0xf8 ? 3 :
+ c < 0xfc ? 4 :
+ 5
+ );
// If reading the extra bytes causes EOF, it is an encoding
// error, not a legitimate end of character stream.
gu_in_bytes(in, p, len, err);
@@ -166,3 +121,56 @@ fail:
gu_raise(err, GuUCSExn);
return;
}
+
+void // Encode `ucs` as a UTF-8 byte sequence at *buf, then advance *buf past the bytes written.
+gu_utf8_encode(GuUCS ucs, uint8_t** buf) // No length is passed in: the caller must leave room for up to 6 bytes.
+{
+ gu_require(gu_ucs_valid(ucs)); // precondition on ucs; gu_require/gu_ucs_valid are defined elsewhere (presumably an assertion -- confirm)
+ uint8_t* p = *buf;
+ if (ucs < 0x80) { // 1 byte: the value itself
+ p[0] = (uint8_t) ucs;
+ *buf = p+1;
+ } else if (ucs < 0x800) { // 2 bytes: 110xxxxx 10xxxxxx
+ p[0] = 0xc0 | (ucs >> 6);
+ p[1] = 0x80 | (ucs & 0x3f);
+ *buf = p+2;
+ } else if (ucs < 0x10000) { // 3 bytes: 1110xxxx + two continuation bytes
+ p[0] = 0xe0 | (ucs >> 12);
+ p[1] = 0x80 | ((ucs >> 6) & 0x3f);
+ p[2] = 0x80 | (ucs & 0x3f);
+ *buf = p+3;
+ } else if (ucs < 0x200000) { // 4 bytes: 11110xxx + three continuation bytes
+ p[0] = 0xf0 | (ucs >> 18);
+ p[1] = 0x80 | ((ucs >> 12) & 0x3f);
+ p[2] = 0x80 | ((ucs >> 6) & 0x3f);
+ p[3] = 0x80 | (ucs & 0x3f);
+ *buf = p+4;
+ } else if (ucs < 0x4000000) { // 5 bytes: 111110xx lead. NOTE(review): longer than RFC 3629 allows; reachability depends on gu_ucs_valid -- confirm
+ p[0] = 0xf8 | (ucs >> 24);
+ p[1] = 0x80 | ((ucs >> 18) & 0x3f);
+ p[2] = 0x80 | ((ucs >> 12) & 0x3f);
+ p[3] = 0x80 | ((ucs >> 6) & 0x3f);
+ p[4] = 0x80 | (ucs & 0x3f);
+ *buf = p+5;
+ } else { // 6 bytes: 1111110x lead; same RFC 3629 caveat as the 5-byte form
+ p[0] = 0xfc | (ucs >> 30);
+ p[1] = 0x80 | ((ucs >> 24) & 0x3f);
+ p[2] = 0x80 | ((ucs >> 18) & 0x3f);
+ p[3] = 0x80 | ((ucs >> 12) & 0x3f);
+ p[4] = 0x80 | ((ucs >> 6) & 0x3f);
+ p[5] = 0x80 | (ucs & 0x3f);
+ *buf = p+6;
+ }
+}
+
+void // Encode `ucs` as UTF-8 into a local buffer and hand the bytes to gu_out_bytes on `out`.
+gu_out_utf8_(GuUCS ucs, GuOut* out, GuExn* err)
+{
+ uint8_t buf[6]; // 6 = longest sequence gu_utf8_encode can write
+ uint8_t* p = buf;
+ gu_utf8_encode(ucs, &p); // leaves p pointing just past the encoded bytes
+ gu_out_bytes(out, buf, p-buf, err); // p-buf is the encoded length; err is forwarded unchanged
+}
+
+extern inline void
+gu_out_utf8(GuUCS ucs, GuOut* out, GuExn* err);
diff --git a/src/runtime/c/gu/utf8.h b/src/runtime/c/gu/utf8.h
index db7dccaf9..3ad28946d 100644
--- a/src/runtime/c/gu/utf8.h
+++ b/src/runtime/c/gu/utf8.h
@@ -32,7 +32,10 @@ gu_out_utf8(GuUCS ucs, GuOut* out, GuExn* err)
// Helper functions used in other modules
GuUCS
-gu_utf8_decode(const uint8_t** utf8);
+gu_utf8_decode(const uint8_t** buf);
+
+void
+gu_utf8_encode(GuUCS ucs, uint8_t** buf);
void
gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err);