From 2483dc772897eb0909664f1a88cc7f8ec50ebd5b Mon Sep 17 00:00:00 2001 From: "kr.angelov" Date: Wed, 6 Nov 2013 10:21:46 +0000 Subject: the content of ParseEngAbs3.probs is now merged with ParseEngAbs.probs. The latter is now retrained. Once the grammar is compiled with the .probs file, it doesn't need anything more to do robust parsing. The robustness itself is controlled by the flags 'heuristic_search_factor', 'meta_prob' and 'meta_token_prob' in ParseEngAbs.gf --- src/runtime/c/pgf/data.h | 5 +- src/runtime/c/pgf/parser.c | 92 +++++++++++++++++++++++-------------- src/runtime/c/pgf/pgf.c | 57 ----------------------- src/runtime/c/pgf/pgf.h | 5 -- src/runtime/c/pgf/printer.c | 2 +- src/runtime/c/pgf/reader.c | 8 ++-- src/runtime/c/utils/pgf-translate.c | 19 ++------ 7 files changed, 69 insertions(+), 119 deletions(-) (limited to 'src/runtime/c') diff --git a/src/runtime/c/pgf/data.h b/src/runtime/c/pgf/data.h index 5b0401764..9bc73dd0a 100644 --- a/src/runtime/c/pgf/data.h +++ b/src/runtime/c/pgf/data.h @@ -87,9 +87,7 @@ typedef struct { PgfCId name; PgfHypos* context; - prob_t meta_prob; - prob_t meta_token_prob; - PgfMetaChildMap* meta_child_probs; + prob_t prob; void* predicate; } PgfAbsCat; @@ -230,6 +228,7 @@ typedef GuSeq PgfCncFuns; struct PgfConcr { PgfCId name; + PgfAbstr* abstr; PgfFlags* cflags; PgfPrintNames* printnames; GuMap* ccats; diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index 4e4724c75..ec21fc84e 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -63,7 +63,10 @@ typedef struct { int prod_full_count; #endif PgfItem* free_item; - prob_t beam_size; + + prob_t heuristic_factor; + prob_t meta_prob; + prob_t meta_token_prob; } PgfParsing; typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE; @@ -1389,12 +1392,14 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err { (void) (err); - PgfAbsCat* abscat = (PgfAbsCat*) key; - prob_t meta_prob = *((prob_t*) value); + 
PgfAbsCat* abscat = *((PgfAbsCat**) value); PgfMetaPredictFn* clo = (PgfMetaPredictFn*) fn; PgfParsing* ps = clo->ps; PgfItem* meta_item = clo->meta_item; + if (abscat->prob == INFINITY) + return; + PgfCncCat* cnccat = gu_map_get(ps->concr->cnccats, abscat->name, PgfCncCat*); if (cnccat == NULL) @@ -1412,7 +1417,7 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err PgfItem* item = pgf_item_copy(meta_item, ps); item->inside_prob += - ccat->viterbi_prob+meta_prob; + ccat->viterbi_prob+abscat->prob; size_t nargs = gu_seq_length(meta_item->args); item->args = gu_new_seq(PgfPArg, nargs+1, ps->pool); @@ -1698,18 +1703,14 @@ pgf_parsing_item(PgfParsing* ps, PgfItem* item) } pgf_parsing_complete(ps, item, ep); } else { - prob_t meta_token_prob = - item->conts->ccat->cnccat->abscat->meta_token_prob; + prob_t meta_token_prob = + ps->meta_token_prob; if (meta_token_prob != INFINITY) { pgf_parsing_meta_scan(ps, item, meta_token_prob); } - PgfCIdMap* meta_child_probs = - item->conts->ccat->cnccat->abscat->meta_child_probs; - if (meta_child_probs != NULL) { - PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, ps, item }; - gu_map_iter(meta_child_probs, &clo.fn, NULL); - } + PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, ps, item }; + gu_map_iter(ps->concr->abstr->cats, &clo.fn, NULL); } } else { pgf_parsing_symbol(ps, item, item->curr_sym); @@ -1721,22 +1722,38 @@ pgf_parsing_item(PgfParsing* ps, PgfItem* item) } } -static prob_t -pgf_parsing_default_beam_size(PgfConcr* concr) +static void +pgf_parsing_set_default_factors(PgfParsing* ps, PgfAbstr* abstr) { - PgfLiteral lit = gu_map_get(concr->cflags, "beam_size", PgfLiteral); + PgfLiteral lit; - if (gu_variant_is_null(lit)) - return 0; + lit = + gu_map_get(abstr->aflags, "heuristic_search_factor", PgfLiteral); + if (!gu_variant_is_null(lit)) { + GuVariantInfo pi = gu_variant_open(lit); + gu_assert (pi.tag == PGF_LITERAL_FLT); + ps->heuristic_factor = ((PgfLiteralFlt*) pi.data)->val; + 
} + + lit = + gu_map_get(abstr->aflags, "meta_prob", PgfLiteral); + if (!gu_variant_is_null(lit)) { + GuVariantInfo pi = gu_variant_open(lit); + gu_assert (pi.tag == PGF_LITERAL_FLT); + ps->meta_prob = - log(((PgfLiteralFlt*) pi.data)->val); + } - GuVariantInfo pi = gu_variant_open(lit); - gu_assert (pi.tag == PGF_LITERAL_FLT); - return ((PgfLiteralFlt*) pi.data)->val; + lit = + gu_map_get(abstr->aflags, "meta_token_prob", PgfLiteral); + if (!gu_variant_is_null(lit)) { + GuVariantInfo pi = gu_variant_open(lit); + gu_assert (pi.tag == PGF_LITERAL_FLT); + ps->meta_token_prob = - log(((PgfLiteralFlt*) pi.data)->val); + } } static PgfParsing* -pgf_new_parsing(PgfConcr* concr, - GuString sentence, double heuristics, +pgf_new_parsing(PgfConcr* concr, GuString sentence, GuPool* pool, GuPool* out_pool) { PgfParsing* ps = gu_new(PgfParsing, pool); @@ -1756,7 +1773,11 @@ pgf_new_parsing(PgfConcr* concr, ps->prod_full_count = 0; #endif ps->free_item = NULL; - ps->beam_size = heuristics; + ps->heuristic_factor = 0; + ps->meta_prob = INFINITY; + ps->meta_token_prob = INFINITY; + + pgf_parsing_set_default_factors(ps, concr->abstr); PgfExprMeta *expr_meta = gu_new_variant(PGF_EXPR_META, @@ -2107,7 +2128,7 @@ pgf_parse_result_is_new(PgfExprState* st) // TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat static PgfParsing* pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx, - GuString sentence, double heuristics, + GuString sentence, double heuristic_factor, GuExn* err, GuPool* pool, GuPool* out_pool) { @@ -2121,12 +2142,13 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx, gu_assert(lin_idx < cnccat->n_lins); - if (heuristics < 0) { - heuristics = pgf_parsing_default_beam_size(concr); + PgfParsing* ps = + pgf_new_parsing(concr, sentence, pool, out_pool); + + if (heuristic_factor >= 0) { + ps->heuristic_factor = heuristic_factor; } - PgfParsing* ps = - pgf_new_parsing(concr, sentence, heuristics, pool, out_pool); PgfParseState* state = 
pgf_new_parse_state(ps, 0, BIND_SOFT); @@ -2156,11 +2178,13 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx, gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); } - PgfItem *item = - pgf_new_item(ps, conts, ps->meta_prod); - item->inside_prob = - ccat->cnccat->abscat->meta_prob; - gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + if (ps->meta_prob != INFINITY) { + PgfItem *item = + pgf_new_item(ps, conts, ps->meta_prod); + item->inside_prob = + ps->meta_prob; + gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + } } } @@ -2200,7 +2224,7 @@ pgf_parsing_proceed(PgfParsing* ps) prob_t state_delta = (st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))* - ps->beam_size; + ps->heuristic_factor; delta_prob += state_delta; st = st->next; } diff --git a/src/runtime/c/pgf/pgf.c b/src/runtime/c/pgf/pgf.c index e804f5ce7..93dea300a 100644 --- a/src/runtime/c/pgf/pgf.c +++ b/src/runtime/c/pgf/pgf.c @@ -35,63 +35,6 @@ pgf_read(const char* fpath, return pgf; } -void -pgf_load_meta_child_probs(PgfPGF* pgf, const char* fpath, - GuPool* pool, GuExn* err) -{ - FILE *fp = fopen(fpath, "r"); - if (!fp) { - gu_raise_errno(err); - return; - } - - GuPool* tmp_pool = gu_new_pool(); - - for (;;) { - char cat1[21]; - char cat2[21]; - prob_t prob; - - if (fscanf(fp, "%20s\t%20s\t%f", cat1, cat2, &prob) < 3) - break; - - prob = - log(prob); - - PgfAbsCat* abscat1 = - gu_map_get(pgf->abstract.cats, cat1, PgfAbsCat*); - if (abscat1 == NULL) { - GuExnData* exn = gu_raise(err, PgfExn); - exn->data = "Unknown category name"; - goto close; - } - - if (strcmp(cat2, "*") == 0) { - abscat1->meta_prob = prob; - } else if (strcmp(cat2, "_") == 0) { - abscat1->meta_token_prob = prob; - } else { - PgfAbsCat* abscat2 = gu_map_get(pgf->abstract.cats, cat2, PgfAbsCat*); - if (abscat2 == NULL) { - gu_raise(err, PgfExn); - GuExnData* exn = gu_raise(err, PgfExn); - exn->data = "Unknown category name"; - goto close; - } - - if (abscat1->meta_child_probs == 
NULL) { - abscat1->meta_child_probs = - gu_map_type_new(PgfMetaChildMap, pool); - } - - gu_map_put(abscat1->meta_child_probs, abscat2, prob_t, prob); - } - } - -close: - gu_pool_free(tmp_pool); - fclose(fp); -} - GuString pgf_abstract_name(PgfPGF* pgf) { diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index 61b8bea6c..ffc293306 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -80,11 +80,6 @@ pgf_read(const char* fpath, * */ - -void -pgf_load_meta_child_probs(PgfPGF*, const char* fpath, - GuPool* pool, GuExn* err); - GuString pgf_abstract_name(PgfPGF*); diff --git a/src/runtime/c/pgf/printer.c b/src/runtime/c/pgf/printer.c index da7c70d7c..78c2b74db 100644 --- a/src/runtime/c/pgf/printer.c +++ b/src/runtime/c/pgf/printer.c @@ -48,7 +48,7 @@ pgf_print_cat(GuMapItor* fn, const void* key, void* value, ctxt = next; } - gu_printf(out, err, " ; -- %f\n",cat->meta_prob); + gu_printf(out, err, " ; -- %f\n", cat->prob); } void diff --git a/src/runtime/c/pgf/reader.c b/src/runtime/c/pgf/reader.c index d215f25e1..12605b89a 100644 --- a/src/runtime/c/pgf/reader.c +++ b/src/runtime/c/pgf/reader.c @@ -516,10 +516,6 @@ pgf_read_abscat(PgfReader* rdr, PgfAbstr* abstr, PgfCIdMap* abscats) gu_return_on_exn(rdr->err, NULL); } - abscat->meta_prob = INFINITY; - abscat->meta_token_prob = INFINITY; - abscat->meta_child_probs = NULL; - GuBuf* functions = gu_new_buf(PgfAbsFun*, rdr->tmp_pool); size_t n_functions = pgf_read_len(rdr); @@ -538,6 +534,8 @@ pgf_read_abscat(PgfReader* rdr, PgfAbstr* abstr, PgfCIdMap* abscats) gu_buf_push(functions, PgfAbsFun*, absfun); } + abscat->prob = - log(gu_in_f64be(rdr->in, rdr->err)); + pgf_jit_predicate(rdr->jit_state, abscats, abscat, functions); return abscat; @@ -1155,6 +1153,8 @@ pgf_read_concrete(PgfReader* rdr, PgfAbstr* abstr, PgfAbsFun* abs_lin_fun) pgf_read_cid(rdr, rdr->opool); gu_return_on_exn(rdr->err, NULL); + concr->abstr = abstr; + concr->cflags = pgf_read_flags(rdr); gu_return_on_exn(rdr->err, NULL); 
diff --git a/src/runtime/c/utils/pgf-translate.c b/src/runtime/c/utils/pgf-translate.c index 79a4fdd42..32f8323ab 100644 --- a/src/runtime/c/utils/pgf-translate.c +++ b/src/runtime/c/utils/pgf-translate.c @@ -53,18 +53,17 @@ int main(int argc, char* argv[]) { // Create the pool that is used to allocate everything GuPool* pool = gu_new_pool(); int status = EXIT_SUCCESS; - if (argc < 5 || argc > 6) { - fprintf(stderr, "usage: %s pgf cat from-lang to-lang [probs-file]\n", argv[0]); + if (argc < 5) { + fprintf(stderr, "usage: %s pgf cat from-lang to-lang\n", argv[0]); status = EXIT_FAILURE; goto fail; } - char* filename = argv[1]; + GuString filename = argv[1]; GuString cat = argv[2]; - GuString from_lang = argv[3]; GuString to_lang = argv[4]; - + // Create an exception frame that catches all errors. GuExn* err = gu_new_exn(NULL, gu_kind(type), pool); @@ -78,16 +77,6 @@ int main(int argc, char* argv[]) { goto fail; } - if (argc == 6) { - char* meta_probs_filename = argv[5]; - pgf_load_meta_child_probs(pgf, meta_probs_filename, pool, err); - if (!gu_ok(err)) { - fprintf(stderr, "Loading meta child probs failed\n"); - status = EXIT_FAILURE; - goto fail; - } - } - // Look up the source and destination concrete categories PgfConcr* from_concr = pgf_get_language(pgf, from_lang); PgfConcr* to_concr = pgf_get_language(pgf, to_lang); -- cgit v1.2.3