summaryrefslogtreecommitdiff
path: root/src/runtime/c/pgf/parser.c
diff options
context:
space:
mode:
authorkrasimir <krasimir@chalmers.se>2015-05-08 09:23:29 +0000
committerkrasimir <krasimir@chalmers.se>2015-05-08 09:23:29 +0000
commit365c7bb1d867b6080f8f75ae6bd4e5f68522a1be (patch)
tree93df808b50d6d894d644060bd26e60838ee49822 /src/runtime/c/pgf/parser.c
parentb961e9a2557ad3cc6023252abff1d02f3d067834 (diff)
the parser is now fully Unicode compatible
Diffstat (limited to 'src/runtime/c/pgf/parser.c')
-rw-r--r--src/runtime/c/pgf/parser.c86
1 files changed, 41 insertions, 45 deletions
diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c
index be0b1b361..b2ad5374d 100644
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -145,41 +145,39 @@ pgf_prev_extern_sym(PgfSymbol sym)
}
}
-static void
-pgf_add_extern_tok(PgfSymbol* psym, PgfToken tok, GuPool* pool) {
- PgfSymbol new_sym;
- size_t tok_len = strlen(tok);
- PgfSymbolKS* sks = (PgfSymbolKS*)
- gu_alloc_variant(PGF_SYMBOL_KS,
- sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1,
- gu_alignof(PgfSymbolKS),
- &new_sym, pool);
- strcpy(sks->token, tok);
- *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = *psym;
- *psym = new_sym;
-}
-
-PgfSymbol
-pgf_collect_extern_tok(PgfParsing* ps, size_t start, size_t end)
+static PgfSymbol
+pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset)
{
PgfSymbol sym = gu_null_variant;
- size_t offset = start;
- while (offset < end) {
+ const uint8_t* start = (uint8_t*) ps->sentence+start_offset;
+ const uint8_t* end = (uint8_t*) ps->sentence+end_offset;
+
+ const uint8_t* p = start;
+ GuUCS ucs = gu_utf8_decode(&p);
+ while (start < end) {
size_t len = 0;
- while (!gu_is_space(ps->sentence[offset+len])) {
- len++;
+ while (p <= end && !gu_ucs_is_space(ucs)) {
+ len = (p - start);
+ ucs = gu_utf8_decode(&p);
}
- PgfToken tok = gu_malloc(ps->pool, len+1);
- memcpy((char*) tok, ps->sentence+offset, len);
- ((char*) tok)[len] = 0;
-
- pgf_add_extern_tok(&sym, tok, ps->pool);
-
- offset += len;
- while (gu_is_space(ps->sentence[offset]))
- offset++;
+ PgfSymbol new_sym;
+ PgfSymbolKS* sks = (PgfSymbolKS*)
+ gu_alloc_variant(PGF_SYMBOL_KS,
+ sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+len+1,
+ gu_alignof(PgfSymbolKS),
+ &new_sym, ps->pool);
+ memcpy((char*) sks->token, start, len);
+ ((char*) sks->token)[len] = 0;
+ *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+len+1)) = sym;
+ sym = new_sym;
+
+ start = p;
+ while (gu_ucs_is_space(ucs)) {
+ start = p;
+ ucs = gu_utf8_decode(&p);
+ }
}
return sym;
@@ -504,11 +502,11 @@ skip_space(GuString* psent, size_t* plen)
if (*plen == 0)
return false;
- char c = **psent;
- if (!gu_is_space(c))
+ const uint8_t* p = (uint8_t*) *psent;
+ if (!gu_ucs_is_space(gu_utf8_decode(&p)))
return false;
- (*psent)++;
+ *psent = (GuString) p;
return true;
}
@@ -2056,24 +2054,22 @@ pgf_parsing_last_token(PgfParsing* ps, GuPool* pool)
if (ps->before == NULL)
return "";
- size_t start = ps->before->end_offset;
- while (start > 0) {
- char c = ps->sentence[start-1];
- if (gu_is_space(c))
- break;
- start--;
+ const uint8_t* start = (uint8_t*) ps->sentence;
+ const uint8_t* end = (uint8_t*) ps->sentence + ps->before->end_offset;
+
+ const uint8_t* p = start;
+ while (p < end) {
+ if (gu_ucs_is_space(gu_utf8_decode(&p))) {
+ start = p;
+ }
}
- size_t end = ps->before->end_offset;
- while (ps->sentence[end] != 0) {
- char c = ps->sentence[end];
- if (gu_is_space(c))
- break;
- end++;
+ while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
+ end = p;
}
char* tok = gu_malloc(pool, end-start+1);
- memcpy(tok, ps->sentence+start, (end-start));
+ memcpy(tok, start, (end-start));
tok[end-start] = 0;
return tok;
}