From 426bc49a52b4efa0ef0129d713842d8c9abdf0ff Mon Sep 17 00:00:00 2001 From: "kr.angelov" Date: Fri, 27 Sep 2013 15:09:48 +0000 Subject: a major refactoring in the C and the Haskell runtimes. Note incompatible change in the PGF format!!! The following are the outcomes: - Predef.nonExist is fully supported by both the Haskell and the C runtimes - Predef.BIND is now an internal compiler defined token. For now it behaves just as usual for the Haskell runtime, i.e. it generates &+. However, the special treatment will let us to handle it properly in the C runtime. - This required a major change in the PGF format since both nonExist and BIND may appear inside 'pre' and this was not supported before. --- src/runtime/c/pgf/parser.c | 230 +++++++++++++++++++-------------------------- 1 file changed, 98 insertions(+), 132 deletions(-) (limited to 'src/runtime/c/pgf/parser.c') diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index 92c689fae..188672dd3 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -133,8 +133,8 @@ struct PgfItem { PgfPArgs* args; PgfSymbol curr_sym; uint16_t seq_idx; - uint8_t tok_idx; - uint8_t alt; + uint8_t alt_idx; // position in the pre alternative + uint8_t alt; // the number of the alternative prob_t inside_prob; }; @@ -694,7 +694,7 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod, item->prod = prod; item->curr_sym = gu_null_variant; item->seq_idx = 0; - item->tok_idx = 0; + item->alt_idx = 0; item->alt = 0; conts->ref_count++; @@ -758,8 +758,12 @@ pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *new_ccat, static void pgf_item_advance(PgfItem* item, GuPool* pool) { - item->seq_idx++; - pgf_item_set_curr_symbol(item, pool); + if (GU_LIKELY(item->alt == 0)) { + item->seq_idx++; + pgf_item_set_curr_symbol(item, pool); + } + else + item->alt_idx++; } static void @@ -1133,8 +1137,7 @@ pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after, gu_alignof(PgfSymbolKS), &item->curr_sym, after->ps->pool); *((PgfSymbol*)(sks+1)) = prev; - sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool); - gu_seq_set(sks->tokens, PgfToken, 0, tok); + sks->token = tok; gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item); } @@ -1218,76 +1221,54 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after, case PGF_SYMBOL_KS: { if (after != NULL) { PgfSymbolKS* sks = gu_variant_data(sym); - gu_assert(item->tok_idx < gu_seq_length(sks->tokens)); - PgfToken tok = - gu_seq_get(sks->tokens, PgfToken, item->tok_idx++); - if (item->tok_idx == gu_seq_length(sks->tokens)) { - item->tok_idx = 0; - pgf_item_advance(item, after->ps->pool); - } - pgf_parsing_add_transition(before, after, tok, item); + pgf_item_advance(item, after->ps->pool); + pgf_parsing_add_transition(before, after, sks->token, item); } break; } case PGF_SYMBOL_KP: { if (after != NULL) { PgfSymbolKP* skp = gu_variant_data(sym); - size_t idx = item->tok_idx; - uint8_t alt = item->alt; - gu_assert(idx < gu_seq_length(skp->default_form)); - if (idx == 0) { - PgfToken tok; + + PgfSymbol sym; + if (item->alt == 0) { PgfItem* new_item; - - tok = gu_seq_get(skp->default_form, PgfToken, 0); + new_item = pgf_item_copy(item, after->ps->pool, after->ps); - new_item->tok_idx++; - if (new_item->tok_idx == gu_seq_length(skp->default_form)) { - new_item->tok_idx = 0; - pgf_item_advance(new_item, after->ps->pool); - } - pgf_parsing_add_transition(before, after, tok, new_item); + new_item->alt = 1; + new_item->alt_idx = 0; + sym = gu_seq_get(skp->default_form, PgfSymbol, new_item->alt_idx); + pgf_parsing_symbol(before, after, new_item, sym); for (size_t i = 0; i < skp->n_forms; i++) { - // XXX: do nubbing properly - PgfTokens* toks = skp->forms[i].form; - PgfTokens* toks2 = skp->default_form; - bool skip = pgf_tokens_equal(toks, toks2); + PgfSequence* syms = skp->forms[i].form; + PgfSequence* syms2 = skp->default_form; + bool skip = false; /*pgf_tokens_equal(toks, toks2); for (size_t j = 0; j < i; j++) { PgfTokens* toks2 = skp->forms[j].form; skip |= pgf_tokens_equal(toks, toks2); - } + }*/ if (!skip) { - tok = gu_seq_get(toks, PgfToken, 0); new_item = pgf_item_copy(item, after->ps->pool, after->ps); - new_item->tok_idx++; - new_item->alt = i; - if (new_item->tok_idx == gu_seq_length(toks)) { - new_item->tok_idx = 0; - pgf_item_advance(new_item, after->ps->pool); - } - pgf_parsing_add_transition(before, after, tok, new_item); + new_item->alt = i+2; + new_item->alt_idx = 0; + sym = gu_seq_get(syms, PgfSymbol, new_item->alt_idx); + pgf_parsing_symbol(before, after, new_item, sym); } } - } else if (alt == 0) { - PgfToken tok = - gu_seq_get(skp->default_form, PgfToken, idx); - item->tok_idx++; - if (item->tok_idx == gu_seq_length(skp->default_form)) { - item->tok_idx = 0; - pgf_item_advance(item, after->ps->pool); - } - pgf_parsing_add_transition(before, after, tok, item); } else { - gu_assert(alt <= skp->n_forms); - PgfTokens* toks = skp->forms[alt - 1].form; - PgfToken tok = gu_seq_get(toks, PgfToken, idx); - item->tok_idx++; - if (item->tok_idx == gu_seq_length(toks)) { - item->tok_idx = 0; + PgfSequence* syms = + (item->alt == 1) ? skp->default_form : + skp->forms[item->alt-2].form; + + if (item->alt_idx < gu_seq_length(syms)) { + sym = gu_seq_get(syms, PgfSymbol, item->alt_idx); + pgf_parsing_symbol(before, after, item, sym); + } else { + item->alt = 0; pgf_item_advance(item, after->ps->pool); + gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item); } - pgf_parsing_add_transition(before, after, tok, item); } } break; @@ -1357,7 +1338,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after, // XXX TODO proper support break; case PGF_SYMBOL_NE: { - // Nothing to be done here + pgf_item_free(before, after, item); break; } default: @@ -1450,8 +1431,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item) gu_alignof(PgfSymbolKS), &item->curr_sym, after->ps->pool); *((PgfSymbol*)(sks+1)) = prev; - sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool); - gu_seq_set(sks->tokens, PgfToken, 0, tok); + sks->token = tok; item->seq_idx++; pgf_parsing_add_transition(before, after, tok, item); @@ -1755,9 +1735,7 @@ typedef struct { } PgfPrefixTokenState; static GuString -pgf_get_tokens(PgfSequence* seq, - uint16_t seq_idx, uint8_t tok_idx, - GuPool* pool) +pgf_get_tokens(PgfSequence* seq, uint16_t seq_idx, GuPool* pool) { GuPool* tmp_pool = gu_new_pool(); GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool); @@ -1773,17 +1751,7 @@ pgf_get_tokens(PgfSequence* seq, switch (i.tag) { case PGF_SYMBOL_KS: { PgfSymbolKS* symks = i.data; - size_t len = gu_seq_length(symks->tokens); - for (size_t i = tok_idx; i < len; i++) { - if (i > 0) { - gu_putc(' ', out, err); - } - - PgfToken tok = gu_seq_get(symks->tokens, PgfToken, i); - gu_string_write(tok, out, err); - } - - tok_idx = 0; + gu_string_write(symks->token, out, err); } default: goto end; @@ -1809,18 +1777,9 @@ pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item) PgfSequence* seq; pgf_item_sequence(item, &lin_idx, &seq, ts->pool); - uint16_t seq_idx = item->seq_idx; - uint8_t tok_idx = item->tok_idx; - - // go one token back - if (tok_idx > 0) - tok_idx--; - else - seq_idx--; - ts->tp = gu_new(PgfTokenProb, ts->pool); ts->tp->tok = - pgf_get_tokens(seq, seq_idx, tok_idx, ts->pool); + pgf_get_tokens(seq, item->seq_idx-1, ts->pool); ts->tp->cat = item->conts->ccat->cnccat->abscat->name; ts->tp->prob = item->inside_prob+item->conts->outside_prob; } @@ -2346,17 +2305,15 @@ pgf_morpho_iter(GuMapItor* fn, const void* key, void* value, GuExn* err) switch (i.tag) { case PGF_SYMBOL_KS: { PgfSymbolKS* symks = i.data; - size_t len = gu_seq_length(symks->tokens); - for (size_t i = 0; i < len; i++) { - if (pos >= gu_seq_length(clo->tokens)) - goto cont; - - PgfToken tok1 = gu_seq_get(symks->tokens, PgfToken, i); - PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++); - - if (!gu_string_eq(tok1, tok2)) - goto cont; - } + + if (pos >= gu_seq_length(clo->tokens)) + goto cont; + + PgfToken tok1 = symks->token; + PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++); + + if (!gu_string_eq(tok1, tok2)) + goto cont; } default: continue; @@ -2443,7 +2400,7 @@ pgf_fullform_iter(GuMapItor* fn, const void* key, void* value, GuExn* err) PgfProductionApply* papp = i.data; PgfSequence* seq = papp->fun->lins[cfc.lin_idx]; - GuString tokens = pgf_get_tokens(seq, 0, 0, st->pool); + GuString tokens = pgf_get_tokens(seq, 0, st->pool); // create a new production index with keys that // are multiword units @@ -2531,12 +2488,10 @@ pgf_fullform_get_analyses(PgfFullFormEntry* entry, static void pgf_parser_index_token(PgfConcr* concr, - PgfTokens* tokens, + PgfToken tok, PgfCCat* ccat, size_t lin_idx, PgfProduction prod, GuPool *pool) { - PgfToken tok = gu_seq_get(tokens, PgfToken, 0); - PgfProductionIdx* set = gu_map_get(concr->leftcorner_tok_idx, &tok, PgfProductionIdx*); if (set == NULL) { @@ -2570,6 +2525,47 @@ pgf_parser_index_epsilon(PgfConcr* concr, gu_buf_push(prods, PgfProduction, prod); } +static void +pgf_parser_index_symbol(PgfConcr* concr, PgfSymbol sym, + PgfCCat* ccat, size_t lin_idx, PgfProduction prod, + GuPool *pool) +{ + GuVariantInfo i = gu_variant_open(sym); + switch (i.tag) { + case PGF_SYMBOL_KS: { + PgfSymbolKS* sks = i.data; + pgf_parser_index_token(concr, + sks->token, + ccat, lin_idx, prod, + pool); + break; + } + case PGF_SYMBOL_KP: { + PgfSymbolKP* skp = i.data; + PgfSymbol sym = + gu_seq_get(skp->default_form, PgfSymbol, 0); + pgf_parser_index_symbol(concr, sym, + ccat, lin_idx, prod, + pool); + for (size_t i = 0; i < skp->n_forms; i++) { + sym = gu_seq_get(skp->forms[i].form, PgfSymbol, 0); + pgf_parser_index_symbol(concr, sym, + ccat, lin_idx, prod, + pool); + } + break; + } + case PGF_SYMBOL_CAT: + case PGF_SYMBOL_LIT: + case PGF_SYMBOL_NE: + case PGF_SYMBOL_VAR: + // Nothing to be done here + break; + default: + gu_impossible(); + } +} + void pgf_parser_index(PgfConcr* concr, PgfCCat* ccat, PgfProduction prod, @@ -2586,39 +2582,9 @@ pgf_parser_index(PgfConcr* concr, PgfSequence* seq = papp->fun->lins[lin_idx]; if (gu_seq_length(seq) > 0) { - GuVariantInfo i = gu_variant_open(gu_seq_get(seq, PgfSymbol, 0)); - switch (i.tag) { - case PGF_SYMBOL_KS: { - PgfSymbolKS* sks = i.data; - pgf_parser_index_token(concr, - sks->tokens, - ccat, lin_idx, prod, - pool); - break; - } - case PGF_SYMBOL_KP: { - PgfSymbolKP* skp = i.data; - pgf_parser_index_token(concr, - skp->default_form, - ccat, lin_idx, prod, - pool); - for (size_t i = 0; i < skp->n_forms; i++) { - pgf_parser_index_token(concr, - skp->forms[i].form, - ccat, lin_idx, prod, - pool); - } - break; - } - case PGF_SYMBOL_CAT: - case PGF_SYMBOL_LIT: - case PGF_SYMBOL_NE: - case PGF_SYMBOL_VAR: - // Nothing to be done here - break; - default: - gu_impossible(); - } + pgf_parser_index_symbol(concr, gu_seq_get(seq, PgfSymbol, 0), + ccat, lin_idx, prod, + pool); } else { pgf_parser_index_epsilon(concr, ccat, lin_idx, prod, -- cgit v1.2.3