summaryrefslogtreecommitdiff
path: root/src/runtime/c/pgf/parser.c
diff options
context:
space:
mode:
authorkr.angelov <kr.angelov@gmail.com>2013-09-27 15:09:48 +0000
committerkr.angelov <kr.angelov@gmail.com>2013-09-27 15:09:48 +0000
commit426bc49a52b4efa0ef0129d713842d8c9abdf0ff (patch)
treed9f5985559de0347448e77ff26ce5a2d3ee2f245 /src/runtime/c/pgf/parser.c
parentb138899512d9aea248160eb17df3007e55dd03da (diff)
a major refactoring in the C and the Haskell runtimes. Note incompatible change in the PGF format!!!
The following are the outcomes: - Predef.nonExist is fully supported by both the Haskell and the C runtimes - Predef.BIND is now an internal compiler defined token. For now it behaves just as usual for the Haskell runtime, i.e. it generates &+. However, the special treatment will let us to handle it properly in the C runtime. - This required a major change in the PGF format since both nonExist and BIND may appear inside 'pre' and this was not supported before.
Diffstat (limited to 'src/runtime/c/pgf/parser.c')
-rw-r--r--src/runtime/c/pgf/parser.c230
1 files changed, 98 insertions, 132 deletions
diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c
index 92c689fae..188672dd3 100644
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -133,8 +133,8 @@ struct PgfItem {
PgfPArgs* args;
PgfSymbol curr_sym;
uint16_t seq_idx;
- uint8_t tok_idx;
- uint8_t alt;
+ uint8_t alt_idx; // position in the pre alternative
+ uint8_t alt; // the number of the alternative
prob_t inside_prob;
};
@@ -694,7 +694,7 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod,
item->prod = prod;
item->curr_sym = gu_null_variant;
item->seq_idx = 0;
- item->tok_idx = 0;
+ item->alt_idx = 0;
item->alt = 0;
conts->ref_count++;
@@ -758,8 +758,12 @@ pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *new_ccat,
static void
pgf_item_advance(PgfItem* item, GuPool* pool)
{
- item->seq_idx++;
- pgf_item_set_curr_symbol(item, pool);
+ if (GU_LIKELY(item->alt == 0)) {
+ item->seq_idx++;
+ pgf_item_set_curr_symbol(item, pool);
+ }
+ else
+ item->alt_idx++;
}
static void
@@ -1133,8 +1137,7 @@ pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after,
gu_alignof(PgfSymbolKS),
&item->curr_sym, after->ps->pool);
*((PgfSymbol*)(sks+1)) = prev;
- sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool);
- gu_seq_set(sks->tokens, PgfToken, 0, tok);
+ sks->token = tok;
gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
}
@@ -1218,76 +1221,54 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
case PGF_SYMBOL_KS: {
if (after != NULL) {
PgfSymbolKS* sks = gu_variant_data(sym);
- gu_assert(item->tok_idx < gu_seq_length(sks->tokens));
- PgfToken tok =
- gu_seq_get(sks->tokens, PgfToken, item->tok_idx++);
- if (item->tok_idx == gu_seq_length(sks->tokens)) {
- item->tok_idx = 0;
- pgf_item_advance(item, after->ps->pool);
- }
- pgf_parsing_add_transition(before, after, tok, item);
+ pgf_item_advance(item, after->ps->pool);
+ pgf_parsing_add_transition(before, after, sks->token, item);
}
break;
}
case PGF_SYMBOL_KP: {
if (after != NULL) {
PgfSymbolKP* skp = gu_variant_data(sym);
- size_t idx = item->tok_idx;
- uint8_t alt = item->alt;
- gu_assert(idx < gu_seq_length(skp->default_form));
- if (idx == 0) {
- PgfToken tok;
+
+ PgfSymbol sym;
+ if (item->alt == 0) {
PgfItem* new_item;
-
- tok = gu_seq_get(skp->default_form, PgfToken, 0);
+
new_item = pgf_item_copy(item, after->ps->pool, after->ps);
- new_item->tok_idx++;
- if (new_item->tok_idx == gu_seq_length(skp->default_form)) {
- new_item->tok_idx = 0;
- pgf_item_advance(new_item, after->ps->pool);
- }
- pgf_parsing_add_transition(before, after, tok, new_item);
+ new_item->alt = 1;
+ new_item->alt_idx = 0;
+ sym = gu_seq_get(skp->default_form, PgfSymbol, new_item->alt_idx);
+ pgf_parsing_symbol(before, after, new_item, sym);
for (size_t i = 0; i < skp->n_forms; i++) {
- // XXX: do nubbing properly
- PgfTokens* toks = skp->forms[i].form;
- PgfTokens* toks2 = skp->default_form;
- bool skip = pgf_tokens_equal(toks, toks2);
+ PgfSequence* syms = skp->forms[i].form;
+ PgfSequence* syms2 = skp->default_form;
+ bool skip = false; /*pgf_tokens_equal(toks, toks2);
for (size_t j = 0; j < i; j++) {
PgfTokens* toks2 = skp->forms[j].form;
skip |= pgf_tokens_equal(toks, toks2);
- }
+ }*/
if (!skip) {
- tok = gu_seq_get(toks, PgfToken, 0);
new_item = pgf_item_copy(item, after->ps->pool, after->ps);
- new_item->tok_idx++;
- new_item->alt = i;
- if (new_item->tok_idx == gu_seq_length(toks)) {
- new_item->tok_idx = 0;
- pgf_item_advance(new_item, after->ps->pool);
- }
- pgf_parsing_add_transition(before, after, tok, new_item);
+ new_item->alt = i+2;
+ new_item->alt_idx = 0;
+ sym = gu_seq_get(syms, PgfSymbol, new_item->alt_idx);
+ pgf_parsing_symbol(before, after, new_item, sym);
}
}
- } else if (alt == 0) {
- PgfToken tok =
- gu_seq_get(skp->default_form, PgfToken, idx);
- item->tok_idx++;
- if (item->tok_idx == gu_seq_length(skp->default_form)) {
- item->tok_idx = 0;
- pgf_item_advance(item, after->ps->pool);
- }
- pgf_parsing_add_transition(before, after, tok, item);
} else {
- gu_assert(alt <= skp->n_forms);
- PgfTokens* toks = skp->forms[alt - 1].form;
- PgfToken tok = gu_seq_get(toks, PgfToken, idx);
- item->tok_idx++;
- if (item->tok_idx == gu_seq_length(toks)) {
- item->tok_idx = 0;
+ PgfSequence* syms =
+ (item->alt == 1) ? skp->default_form :
+ skp->forms[item->alt-2].form;
+
+ if (item->alt_idx < gu_seq_length(syms)) {
+ sym = gu_seq_get(syms, PgfSymbol, item->alt_idx);
+ pgf_parsing_symbol(before, after, item, sym);
+ } else {
+ item->alt = 0;
pgf_item_advance(item, after->ps->pool);
+ gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
}
- pgf_parsing_add_transition(before, after, tok, item);
}
}
break;
@@ -1357,7 +1338,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
// XXX TODO proper support
break;
case PGF_SYMBOL_NE: {
- // Nothing to be done here
+ pgf_item_free(before, after, item);
break;
}
default:
@@ -1450,8 +1431,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
gu_alignof(PgfSymbolKS),
&item->curr_sym, after->ps->pool);
*((PgfSymbol*)(sks+1)) = prev;
- sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool);
- gu_seq_set(sks->tokens, PgfToken, 0, tok);
+ sks->token = tok;
item->seq_idx++;
pgf_parsing_add_transition(before, after, tok, item);
@@ -1755,9 +1735,7 @@ typedef struct {
} PgfPrefixTokenState;
static GuString
-pgf_get_tokens(PgfSequence* seq,
- uint16_t seq_idx, uint8_t tok_idx,
- GuPool* pool)
+pgf_get_tokens(PgfSequence* seq, uint16_t seq_idx, GuPool* pool)
{
GuPool* tmp_pool = gu_new_pool();
GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
@@ -1773,17 +1751,7 @@ pgf_get_tokens(PgfSequence* seq,
switch (i.tag) {
case PGF_SYMBOL_KS: {
PgfSymbolKS* symks = i.data;
- size_t len = gu_seq_length(symks->tokens);
- for (size_t i = tok_idx; i < len; i++) {
- if (i > 0) {
- gu_putc(' ', out, err);
- }
-
- PgfToken tok = gu_seq_get(symks->tokens, PgfToken, i);
- gu_string_write(tok, out, err);
- }
-
- tok_idx = 0;
+ gu_string_write(symks->token, out, err);
}
default:
goto end;
@@ -1809,18 +1777,9 @@ pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item)
PgfSequence* seq;
pgf_item_sequence(item, &lin_idx, &seq, ts->pool);
- uint16_t seq_idx = item->seq_idx;
- uint8_t tok_idx = item->tok_idx;
-
- // go one token back
- if (tok_idx > 0)
- tok_idx--;
- else
- seq_idx--;
-
ts->tp = gu_new(PgfTokenProb, ts->pool);
ts->tp->tok =
- pgf_get_tokens(seq, seq_idx, tok_idx, ts->pool);
+ pgf_get_tokens(seq, item->seq_idx-1, ts->pool);
ts->tp->cat = item->conts->ccat->cnccat->abscat->name;
ts->tp->prob = item->inside_prob+item->conts->outside_prob;
}
@@ -2346,17 +2305,15 @@ pgf_morpho_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
switch (i.tag) {
case PGF_SYMBOL_KS: {
PgfSymbolKS* symks = i.data;
- size_t len = gu_seq_length(symks->tokens);
- for (size_t i = 0; i < len; i++) {
- if (pos >= gu_seq_length(clo->tokens))
- goto cont;
-
- PgfToken tok1 = gu_seq_get(symks->tokens, PgfToken, i);
- PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++);
-
- if (!gu_string_eq(tok1, tok2))
- goto cont;
- }
+
+ if (pos >= gu_seq_length(clo->tokens))
+ goto cont;
+
+ PgfToken tok1 = symks->token;
+ PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++);
+
+ if (!gu_string_eq(tok1, tok2))
+ goto cont;
}
default:
continue;
@@ -2443,7 +2400,7 @@ pgf_fullform_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
PgfProductionApply* papp = i.data;
PgfSequence* seq = papp->fun->lins[cfc.lin_idx];
- GuString tokens = pgf_get_tokens(seq, 0, 0, st->pool);
+ GuString tokens = pgf_get_tokens(seq, 0, st->pool);
// create a new production index with keys that
// are multiword units
@@ -2531,12 +2488,10 @@ pgf_fullform_get_analyses(PgfFullFormEntry* entry,
static void
pgf_parser_index_token(PgfConcr* concr,
- PgfTokens* tokens,
+ PgfToken tok,
PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
GuPool *pool)
{
- PgfToken tok = gu_seq_get(tokens, PgfToken, 0);
-
PgfProductionIdx* set =
gu_map_get(concr->leftcorner_tok_idx, &tok, PgfProductionIdx*);
if (set == NULL) {
@@ -2570,6 +2525,47 @@ pgf_parser_index_epsilon(PgfConcr* concr,
gu_buf_push(prods, PgfProduction, prod);
}
+static void
+pgf_parser_index_symbol(PgfConcr* concr, PgfSymbol sym,
+ PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
+ GuPool *pool)
+{
+ GuVariantInfo i = gu_variant_open(sym);
+ switch (i.tag) {
+ case PGF_SYMBOL_KS: {
+ PgfSymbolKS* sks = i.data;
+ pgf_parser_index_token(concr,
+ sks->token,
+ ccat, lin_idx, prod,
+ pool);
+ break;
+ }
+ case PGF_SYMBOL_KP: {
+ PgfSymbolKP* skp = i.data;
+ PgfSymbol sym =
+ gu_seq_get(skp->default_form, PgfSymbol, 0);
+ pgf_parser_index_symbol(concr, sym,
+ ccat, lin_idx, prod,
+ pool);
+ for (size_t i = 0; i < skp->n_forms; i++) {
+ sym = gu_seq_get(skp->forms[i].form, PgfSymbol, 0);
+ pgf_parser_index_symbol(concr, sym,
+ ccat, lin_idx, prod,
+ pool);
+ }
+ break;
+ }
+ case PGF_SYMBOL_CAT:
+ case PGF_SYMBOL_LIT:
+ case PGF_SYMBOL_NE:
+ case PGF_SYMBOL_VAR:
+ // Nothing to be done here
+ break;
+ default:
+ gu_impossible();
+ }
+}
+
void
pgf_parser_index(PgfConcr* concr,
PgfCCat* ccat, PgfProduction prod,
@@ -2586,39 +2582,9 @@ pgf_parser_index(PgfConcr* concr,
PgfSequence* seq = papp->fun->lins[lin_idx];
if (gu_seq_length(seq) > 0) {
- GuVariantInfo i = gu_variant_open(gu_seq_get(seq, PgfSymbol, 0));
- switch (i.tag) {
- case PGF_SYMBOL_KS: {
- PgfSymbolKS* sks = i.data;
- pgf_parser_index_token(concr,
- sks->tokens,
- ccat, lin_idx, prod,
- pool);
- break;
- }
- case PGF_SYMBOL_KP: {
- PgfSymbolKP* skp = i.data;
- pgf_parser_index_token(concr,
- skp->default_form,
- ccat, lin_idx, prod,
- pool);
- for (size_t i = 0; i < skp->n_forms; i++) {
- pgf_parser_index_token(concr,
- skp->forms[i].form,
- ccat, lin_idx, prod,
- pool);
- }
- break;
- }
- case PGF_SYMBOL_CAT:
- case PGF_SYMBOL_LIT:
- case PGF_SYMBOL_NE:
- case PGF_SYMBOL_VAR:
- // Nothing to be done here
- break;
- default:
- gu_impossible();
- }
+ pgf_parser_index_symbol(concr, gu_seq_get(seq, PgfSymbol, 0),
+ ccat, lin_idx, prod,
+ pool);
} else {
pgf_parser_index_epsilon(concr,
ccat, lin_idx, prod,