summaryrefslogtreecommitdiff
path: root/src/runtime/c
diff options
context:
space:
mode:
authorkr.angelov <kr.angelov@gmail.com>2012-05-08 12:13:28 +0000
committerkr.angelov <kr.angelov@gmail.com>2012-05-08 12:13:28 +0000
commita6800fc0da1d90dad0362c806037f9a92ab3e813 (patch)
treead383d165e5d2fe36fe10729d83ff5aa201b0f6c /src/runtime/c
parent931066f6fc004c7a193e5200d13ea651c7e02fd1 (diff)
a new unbiased statistical parser. it is still far from perfect use it on your own risk.
Diffstat (limited to 'src/runtime/c')
-rw-r--r--src/runtime/c/pgf/data.h6
-rw-r--r--src/runtime/c/pgf/lexer.c2
-rw-r--r--src/runtime/c/pgf/literals.c2
-rw-r--r--src/runtime/c/pgf/parser.c1350
-rw-r--r--src/runtime/c/pgf/parser.h16
-rw-r--r--src/runtime/c/pgf/printer.c4
-rw-r--r--src/runtime/c/pgf/reader.c45
-rw-r--r--src/runtime/c/utils/pgf-translate.c158
8 files changed, 822 insertions, 761 deletions
diff --git a/src/runtime/c/pgf/data.h b/src/runtime/c/pgf/data.h
index 8063c4ff3..cdbbf8d6b 100644
--- a/src/runtime/c/pgf/data.h
+++ b/src/runtime/c/pgf/data.h
@@ -125,7 +125,7 @@ struct PgfPGF {
extern GU_DECLARE_TYPE(PgfPGF, struct);
typedef struct {
- double prob;
+ float prob;
PgfExpr expr;
} PgfExprProb;
@@ -148,6 +148,9 @@ struct PgfCatFun {
struct PgfCat {
// TODO: Add cid here
PgfHypos context;
+
+ float meta_prob;
+
GuLength n_functions;
PgfCatFun functions[]; // XXX: resolve to PgfFunDecl*?
};
@@ -189,6 +192,7 @@ struct PgfCCat {
PgfFunIds* lindefs;
size_t n_synprods;
PgfProductionSeq prods;
+ float viterbi_prob;
int fid;
};
diff --git a/src/runtime/c/pgf/lexer.c b/src/runtime/c/pgf/lexer.c
index 27fe97e34..05372eca0 100644
--- a/src/runtime/c/pgf/lexer.c
+++ b/src/runtime/c/pgf/lexer.c
@@ -46,7 +46,7 @@ pgf_lexer_next_token(PgfLexer *lexer, GuExn* err, GuPool *pool)
if (gu_exn_is_raised(err))
goto stop;
- if (lexer->ucs == '.' && counter < 3) {
+ if (lexer->ucs == '.' && counter < 4) {
// perhaps an abreviation
gu_ucs_write(lexer->ucs, wtr, err);
if (gu_exn_is_raised(err))
diff --git a/src/runtime/c/pgf/literals.c b/src/runtime/c/pgf/literals.c
index 985e29f1b..ea082e28a 100644
--- a/src/runtime/c/pgf/literals.c
+++ b/src/runtime/c/pgf/literals.c
@@ -235,6 +235,8 @@ pgf_match_name_lit(PgfConcr* concr, PgfItem* item, PgfToken tok,
lit_str->val = gu_string_buf_freeze(sbuf, pool);
*out_ep = ep;
+ } else {
+ *out_ep = NULL;
}
gu_pool_free(tmp_pool);
diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c
index b110caf67..4cb1c1b4d 100644
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -1,45 +1,83 @@
#include <pgf/parser.h>
-#include <gu/choice.h>
#include <gu/seq.h>
#include <gu/assert.h>
#include <gu/log.h>
#include <gu/file.h>
#include <math.h>
-#include <stdlib.h>
+
+//#define PGF_PARSER_DEBUG
typedef GuBuf PgfItemBuf;
+static GU_DEFINE_TYPE(PgfItemBuf, abstract, _);
+
typedef GuList(PgfItemBuf*) PgfItemBufs;
+static GU_DEFINE_TYPE(PgfItemBufs, abstract, _);
+
+typedef GuMap PgfContsMap;
+GU_DEFINE_TYPE(PgfContsMap, GuMap,
+ gu_type(PgfCCat), NULL,
+ gu_ptr_type(PgfItemBufs), &gu_null_struct);
+
+typedef GuMap PgfGenCatMap;
+GU_DEFINE_TYPE(PgfGenCatMap, GuMap,
+ gu_type(PgfItemBuf), NULL,
+ gu_ptr_type(PgfCCat), &gu_null_struct);
typedef GuBuf PgfCCatBuf;
-struct PgfParse {
+typedef struct {
PgfConcr* concr;
- PgfItemBuf* agenda;
+ PgfCCat* completed;
+ PgfItem* target;
+ PgfProduction meta_prod;
int max_fid;
-};
+} PgfParsing;
typedef struct {
- PgfExprProb ep;
+ PgfToken tok;
+ PgfItemBuf* lexicon_idx;
+} PgfTokenState;
+
+struct PgfParseState {
+ GuPool* pool;
+ PgfParseState* next;
+
+ PgfItemBuf* agenda;
+ PgfContsMap* conts_map;
+ PgfGenCatMap* generated_cats;
+ PgfExprProb* meta_ep;
+ int offset;
+
+ PgfParsing* ps;
+ PgfTokenState* ts;
+};
+
+typedef struct PgfExprState PgfExprState;
+
+struct PgfExprState {
+ PgfExprState* cont;
+ PgfExpr expr;
PgfPArgs args;
size_t arg_idx;
-} PgfExprState;
+};
-typedef GuMap PgfExprCache;
-static GU_DEFINE_TYPE(PgfExprCache, GuMap,
- gu_type(PgfCCat), NULL,
- gu_ptr_type(PgfExprProb), &gu_null_struct);
+typedef struct {
+ PgfExprState *st;
+ float prob;
+} PgfExprQState;
typedef struct PgfParseResult PgfParseResult;
struct PgfParseResult {
- PgfConcr* concr;
- PgfCCatBuf* completed;
- GuChoice* choice;
+ GuPool* tmp_pool;
+ PgfParseState* state;
+ GuBuf* pqueue;
PgfExprEnum en;
};
typedef struct PgfItemBase PgfItemBase;
+
struct PgfItemBase {
PgfItemBuf* conts;
PgfCCat* ccat;
@@ -48,12 +86,17 @@ struct PgfItemBase {
};
struct PgfItem {
+#ifdef PGF_PARSER_DEBUG
+ int start,end;
+#endif
PgfItemBase* base;
PgfPArgs args;
PgfSymbol curr_sym;
uint16_t seq_idx;
uint8_t tok_idx;
uint8_t alt;
+ float inside_prob;
+ float outside_prob;
};
typedef struct {
@@ -67,50 +110,16 @@ static GU_DEFINE_TYPE(PgfCFCat, struct,
extern GuHasher pgf_cfcat_hasher;
-static GU_DEFINE_TYPE(PgfItemBuf, abstract, _);
-static GU_DEFINE_TYPE(PgfItemBufs, abstract, _);
-
-typedef GuMap PgfContsMap;
-GU_DEFINE_TYPE(PgfContsMap, GuMap,
- gu_type(PgfCCat), NULL,
- gu_ptr_type(PgfItemBufs), &gu_null_struct);
-
typedef GuMap PgfEpsilonIdx;
GU_DEFINE_TYPE(PgfEpsilonIdx, GuMap,
gu_type(PgfCFCat), &pgf_cfcat_hasher,
gu_ptr_type(PgfCCat), &gu_null_struct);
-typedef GuMap PgfGenCatMap;
-GU_DEFINE_TYPE(PgfGenCatMap, GuMap,
- gu_type(PgfItemBuf), NULL,
- gu_ptr_type(PgfCCat), &gu_null_struct);
-
-// GuString -> PgfItemBuf*
+// GuString -> PgfItemBuf*
typedef GuStringMap PgfTransitions;
GU_DEFINE_TYPE(PgfTransitions, GuStringMap,
gu_ptr_type(PgfItemBuf), &gu_null_struct);
-typedef const struct PgfLexCallback PgfLexCallback;
-
-struct PgfLexCallback {
- void (*lex)(PgfLexCallback* self, PgfToken tok, PgfItem* item);
-};
-
-typedef struct {
- GuPool* pool;
- GuPool* tmp_pool;
- PgfConcr* concr;
- PgfContsMap* conts_map;
- PgfGenCatMap* generated_cats;
- PgfCCatBuf* completed;
- PgfLexCallback* lex_callback;
- PgfItemBuf* lexicon_idx;
- PgfExprProb* meta_ep;
- PgfItemBuf *metas;
- PgfToken tok;
- int max_fid;
-} PgfParsing;
-
static PgfSymbol
pgf_prev_extern_sym(PgfSymbol sym)
{
@@ -316,7 +325,10 @@ pgf_print_item_seq(PgfItem *item,
static void
pgf_print_item(PgfItem* item, GuWriter* wtr, GuExn* err, GuPool* pool)
{
- gu_printf(wtr, err, "[C%d -> ",item->base->ccat->fid);
+ gu_printf(wtr, err, "[%d-%d; C%d -> ",
+ item->start,
+ item->end,
+ item->base->ccat->fid);
GuVariantInfo i = gu_variant_open(item->base->prod);
switch (i.tag) {
@@ -352,15 +364,44 @@ pgf_print_item(PgfItem* item, GuWriter* wtr, GuExn* err, GuPool* pool)
gu_impossible();
}
- pgf_print_item_seq(item, wtr, err, pool);
- gu_printf(wtr, err, "]\n");
+ pgf_print_item_seq(item, wtr, err, pool);
+ gu_printf(wtr, err, "; %f+%f=%f]\n",
+ item->inside_prob,
+ item->outside_prob,
+ item->inside_prob+item->outside_prob);
}
#endif
+static int
+cmp_item_prob(GuOrder* self, const void* a, const void* b)
+{
+ PgfItem *item1 = *((PgfItem **) a);
+ PgfItem *item2 = *((PgfItem **) b);
+
+ float prob1 = item1->inside_prob + item1->outside_prob;
+ float prob2 = item2->inside_prob + item2->outside_prob;
+
+ if (prob1 < prob2)
+ return -1;
+ else if (prob1 > prob2)
+ return 1;
+ else
+ return 0;
+}
+
+static GuOrder
+pgf_item_prob_order = { cmp_item_prob };
+
static void
-pgf_parsing_add_transition(PgfParsing* parsing, PgfToken tok, PgfItem* item)
+pgf_parsing_add_transition(PgfParseState* before, PgfParseState* after,
+ PgfToken tok, PgfItem* item)
{
- parsing->lex_callback->lex(parsing->lex_callback, tok, item);
+ if (gu_string_eq(tok, after->ts->tok)) {
+ if (after->next == NULL)
+ after->ps->target = item;
+
+ gu_buf_heap_push(after->agenda, &pgf_item_prob_order, &item);
+ }
}
static PgfItemBufs*
@@ -394,35 +435,24 @@ pgf_parsing_get_conts(PgfContsMap* conts_map,
return conts;
}
-static bool
-pgf_parsing_has_conts(PgfContsMap* conts_map,
- PgfCCat* ccat, size_t lin_idx)
-{
- gu_require(lin_idx < ccat->cnccat->n_lins);
- PgfItemBufs* contss = gu_map_get(conts_map, ccat, PgfItemBufs*);
- if (contss == NULL)
- return false;
-
- return (gu_list_index(contss, lin_idx) != NULL);
-}
-
static PgfCCat*
-pgf_parsing_create_completed(PgfParsing* parsing, PgfItemBuf* conts,
- PgfCncCat* cnccat)
+pgf_parsing_create_completed(PgfParseState* state, PgfItemBuf* conts,
+ float viterbi_prob, PgfCncCat* cnccat)
{
- PgfCCat* cat = gu_new(PgfCCat, parsing->pool);
+ PgfCCat* cat = gu_new(PgfCCat, state->pool);
cat->cnccat = cnccat;
- cat->fid = parsing->max_fid++;
- cat->prods = gu_buf_seq(gu_new_buf(PgfProduction, parsing->pool));
+ cat->viterbi_prob = viterbi_prob;
+ cat->fid = state->ps->max_fid++;
+ cat->prods = gu_buf_seq(gu_new_buf(PgfProduction, state->pool));
cat->n_synprods = 0;
- gu_map_put(parsing->generated_cats, conts, PgfCCat*, cat);
+ gu_map_put(state->generated_cats, conts, PgfCCat*, cat);
return cat;
}
static PgfCCat*
-pgf_parsing_get_completed(PgfParsing* parsing, PgfItemBuf* conts)
+pgf_parsing_get_completed(PgfParseState* state, PgfItemBuf* conts)
{
- return gu_map_get(parsing->generated_cats, conts, PgfCCat*);
+ return gu_map_get(state->generated_cats, conts, PgfCCat*);
}
static void
@@ -463,8 +493,8 @@ pgf_item_set_curr_symbol(PgfItem* item, GuPool* pool)
}
static PgfItem*
-pgf_new_item(PgfCCat* ccat, size_t lin_idx,
- PgfProduction prod, PgfItemBuf* conts, GuPool* pool)
+pgf_new_item(int pos, PgfCCat* ccat, size_t lin_idx,
+ PgfProduction prod, PgfItemBuf* conts, GuPool* pool)
{
PgfItemBase* base = gu_new(PgfItemBase, pool);
base->ccat = ccat;
@@ -478,6 +508,13 @@ pgf_new_item(PgfCCat* ccat, size_t lin_idx,
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = pi.data;
item->args = papp->args;
+ item->inside_prob = papp->fun->ep->prob;
+
+ int n_args = gu_seq_length(item->args);
+ for (int i = 0; i < n_args; i++) {
+ PgfPArg *arg = gu_seq_index(item->args, PgfPArg, i);
+ item->inside_prob += arg->ccat->viterbi_prob;
+ }
break;
}
case PGF_PRODUCTION_COERCE: {
@@ -486,22 +523,43 @@ pgf_new_item(PgfCCat* ccat, size_t lin_idx,
PgfPArg* parg = gu_seq_index(item->args, PgfPArg, 0);
parg->hypos = NULL;
parg->ccat = pcoerce->coerce;
+ item->inside_prob = pcoerce->coerce->viterbi_prob;
break;
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = pi.data;
item->args = pext->args;
+ item->inside_prob = pext->fun ? pext->fun->ep->prob : 0;
+
+ int n_args = gu_seq_length(item->args);
+ for (int i = 0; i < n_args; i++) {
+ PgfPArg *arg = gu_seq_index(item->args, PgfPArg, i);
+ item->inside_prob += arg->ccat->viterbi_prob;
+ }
break;
}
default:
gu_impossible();
}
+#ifdef PGF_PARSER_DEBUG
+ item->start = pos;
+ item->end = pos;
+#endif
item->base = base;
item->curr_sym = gu_null_variant;
item->seq_idx = 0;
item->tok_idx = 0;
item->alt = 0;
-
+
+ item->outside_prob = 0;
+ if (gu_buf_length(conts) > 0) {
+ PgfItem* best_cont = gu_buf_get(conts, PgfItem*, 0);
+ if (best_cont != NULL)
+ item->outside_prob =
+ best_cont->inside_prob-ccat->viterbi_prob+
+ best_cont->outside_prob;
+ }
+
pgf_item_set_curr_symbol(item, pool);
return item;
}
@@ -523,16 +581,22 @@ pgf_item_copy(PgfItem* item, GuPool* pool)
}
static PgfItem*
-pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *ccat, GuPool* pool)
+pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *new_ccat,
+ GuPool* pool)
{
+ PgfCCat *old_ccat =
+ gu_seq_index(item->args, PgfPArg, d)->ccat;
+
PgfItem* new_item = pgf_item_copy(item, pool);
size_t nargs = gu_seq_length(item->args);
new_item->args = gu_new_seq(PgfPArg, nargs, pool);
memcpy(gu_seq_data(new_item->args), gu_seq_data(item->args),
nargs * sizeof(PgfPArg));
gu_seq_set(new_item->args, PgfPArg, d,
- ((PgfPArg) { .hypos = NULL, .ccat = ccat }));
-
+ ((PgfPArg) { .hypos = NULL, .ccat = new_ccat }));
+ new_item->inside_prob +=
+ new_ccat->viterbi_prob - old_ccat->viterbi_prob;
+
return new_item;
}
@@ -544,14 +608,12 @@ pgf_item_advance(PgfItem* item, GuPool* pool)
}
static void
-pgf_parsing_item(PgfParsing* parsing, PgfItem* item);
-
-static void
-pgf_parsing_combine(PgfParsing* parsing, PgfItem* cont,
- PgfCCat* cat, int lin_idx)
+pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
+ PgfItem* cont, PgfCCat* cat, int lin_idx)
{
if (cont == NULL) {
- gu_buf_push(parsing->completed, PgfCCat*, cat);
+ if (after == NULL)
+ before->ps->completed = cat;
return;
}
@@ -571,48 +633,53 @@ pgf_parsing_combine(PgfParsing* parsing, PgfItem* cont,
switch (gu_variant_tag(cont->curr_sym)) {
case PGF_SYMBOL_CAT: {
PgfSymbolCat* scat = gu_variant_data(cont->curr_sym);
- item = pgf_item_update_arg(cont, scat->d, cat, parsing->pool);
+ item = pgf_item_update_arg(cont, scat->d, cat, before->pool);
break;
}
case PGF_SYMBOL_LIT: {
PgfSymbolLit* slit = gu_variant_data(cont->curr_sym);
- item = pgf_item_update_arg(cont, slit->d, cat, parsing->pool);
+ item = pgf_item_update_arg(cont, slit->d, cat, before->pool);
break;
}
default:
gu_impossible();
}
} else {
- item = pgf_item_copy(cont, parsing->pool);
+ item = pgf_item_copy(cont, before->pool);
size_t nargs = gu_seq_length(cont->args);
- item->args = gu_new_seq(PgfPArg, nargs+1, parsing->pool);
+ item->args = gu_new_seq(PgfPArg, nargs+1, before->pool);
memcpy(gu_seq_data(item->args), gu_seq_data(cont->args),
nargs * sizeof(PgfPArg));
gu_seq_set(item->args, PgfPArg, nargs,
((PgfPArg) { .hypos = NULL, .ccat = cat }));
-
+ item->inside_prob += cat->viterbi_prob;
+
PgfSymbol prev = item->curr_sym;
PgfSymbolCat* scat = (PgfSymbolCat*)
gu_alloc_variant(PGF_SYMBOL_CAT,
sizeof(PgfSymbolCat)+sizeof(PgfSymbol),
gu_alignof(PgfSymbolCat),
- &item->curr_sym, parsing->pool);
+ &item->curr_sym, before->pool);
*((PgfSymbol*)(scat+1)) = prev;
scat->d = nargs;
scat->r = lin_idx;
}
- pgf_item_advance(item, parsing->pool);
- pgf_parsing_item(parsing, item);
+#ifdef PGF_PARSER_DEBUG
+ item->end = before->offset;
+#endif
+ pgf_item_advance(item, before->pool);
+ gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
}
static void
-pgf_parsing_production(PgfParsing* parsing, PgfCCat* ccat, size_t lin_idx,
- PgfProduction prod, PgfItemBuf* conts)
+pgf_parsing_production(PgfParseState* state,
+ PgfCCat* ccat, size_t lin_idx,
+ PgfProduction prod, PgfItemBuf* conts)
{
PgfItem* item =
- pgf_new_item(ccat, lin_idx, prod, conts, parsing->pool);
- pgf_parsing_item(parsing, item);
+ pgf_new_item(state->offset, ccat, lin_idx, prod, conts, state->pool);
+ gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
}
static PgfProduction
@@ -686,19 +753,20 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
}
static void
-pgf_parsing_complete(PgfParsing* parsing, PgfItem* item, PgfExprProb *ep)
+pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
+ PgfItem* item, PgfExprProb *ep)
{
PgfProduction prod =
- pgf_parsing_new_production(item, ep, parsing->pool);
+ pgf_parsing_new_production(item, ep, before->pool);
PgfItemBuf* conts = item->base->conts;
- PgfCCat* tmp_cat = pgf_parsing_get_completed(parsing, conts);
+ PgfCCat* tmp_cat = pgf_parsing_get_completed(before, conts);
PgfCCat* cat = tmp_cat;
if (cat == NULL) {
- cat = pgf_parsing_create_completed(parsing, conts,
- item->base->ccat->cnccat);
+ cat = pgf_parsing_create_completed(before, conts,
+ item->inside_prob, item->base->ccat->cnccat);
}
-
+
GuBuf* prodbuf = gu_seq_buf(cat->prods);
gu_buf_push(prodbuf, PgfProduction, prod);
cat->n_synprods++;
@@ -709,7 +777,9 @@ pgf_parsing_complete(PgfParsing* parsing, PgfItem* item, PgfExprProb *ep)
GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
if (tmp_cat == NULL)
- gu_printf(wtr, err, "[C%d; %d; C%d]\n",
+ gu_printf(wtr, err, "[%d-%d; C%d; %d; C%d]\n",
+ item->start,
+ item->end,
item->base->ccat->fid,
item->base->lin_idx,
cat->fid);
@@ -718,11 +788,8 @@ pgf_parsing_complete(PgfParsing* parsing, PgfItem* item, PgfExprProb *ep)
#endif
if (tmp_cat != NULL) {
- // The category has already been created. If it has also been
- // predicted already, then process a new item for this production.
- PgfItemBufs* contss =
- pgf_parsing_get_contss(parsing->conts_map, cat,
- parsing->tmp_pool);
+ PgfItemBufs* contss =
+ pgf_parsing_get_contss(before->conts_map, cat, before->pool);
size_t n_contss = gu_list_length(contss);
for (size_t i = 0; i < n_contss; i++) {
PgfItemBuf* conts2 = gu_list_index(contss, i);
@@ -732,64 +799,97 @@ pgf_parsing_complete(PgfParsing* parsing, PgfItem* item, PgfExprProb *ep)
* production immediately to the agenda,
* i.e. process it. */
if (conts2) {
- pgf_parsing_production(parsing, cat, i,
- prod, conts2);
+ pgf_parsing_production(before, cat, i,
+ prod, conts2);
}
}
+
+ // The category has already been created. If it has also been
+ // predicted already, then process a new item for this production.
+ PgfParseState* state = after;
+ while (state != NULL) {
+ PgfItemBufs* contss =
+ pgf_parsing_get_contss(state->conts_map, cat, state->pool);
+ size_t n_contss = gu_list_length(contss);
+ for (size_t i = 0; i < n_contss; i++) {
+ PgfItemBuf* conts2 = gu_list_index(contss, i);
+ /* If there are continuations for
+ * linearization index i, then (cat, i) has
+ * already been predicted. Add the new
+ * production immediately to the agenda,
+ * i.e. process it. */
+ if (conts2) {
+ pgf_parsing_production(state, cat, i,
+ prod, conts2);
+ }
+ }
+
+ state = state->next;
+ }
} else {
size_t n_conts = gu_buf_length(conts);
for (size_t i = 0; i < n_conts; i++) {
PgfItem* cont = gu_buf_get(conts, PgfItem*, i);
- pgf_parsing_combine(parsing, cont, cat, item->base->lin_idx);
+ pgf_parsing_combine(before, after, cont, cat, item->base->lin_idx);
}
}
}
static void
-pgf_parsing_td_predict(PgfParsing* parsing, PgfItem* item,
- PgfCCat* ccat, size_t lin_idx)
+pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
+ PgfItem* item, PgfCCat* ccat, size_t lin_idx)
{
gu_enter("-> cat: %d", ccat->fid);
if (gu_seq_is_null(ccat->prods)) {
// Empty category
return;
}
+
PgfItemBuf* conts =
- pgf_parsing_get_conts(parsing->conts_map, ccat, lin_idx,
- parsing->pool, parsing->tmp_pool);
+ pgf_parsing_get_conts(before->conts_map, ccat, lin_idx,
+ before->pool, before->pool);
gu_buf_push(conts, PgfItem*, item);
if (gu_buf_length(conts) == 1) {
/* First time we encounter this linearization
* of this category at the current position,
* so predict it. */
-
+
// Top-down prediction for syntactic rules
PgfProductionSeq prods = ccat->prods;
for (size_t i = 0; i < ccat->n_synprods; i++) {
PgfProduction prod =
gu_seq_get(prods, PgfProduction, i);
- pgf_parsing_production(parsing, ccat, lin_idx,
- prod, conts);
+ pgf_parsing_production(before, ccat, lin_idx, prod, conts);
+ }
+
+ if (ccat->cnccat->abscat->meta_prob != INFINITY &&
+ ccat->fid < before->ps->concr->total_cats) {
+ // Top-down prediction for meta rules
+ PgfItem *item =
+ pgf_new_item(before->offset, ccat, lin_idx, before->ps->meta_prod, conts, before->pool);
+ item->inside_prob =
+ 1000000 + ccat->cnccat->abscat->meta_prob * 1000;
+ gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
}
// Bottom-up prediction for lexical rules
- if (parsing->lexicon_idx != NULL) {
- size_t n_items = gu_buf_length(parsing->lexicon_idx);
+ if (after != NULL && after->ts->lexicon_idx != NULL) {
+ size_t n_items = gu_buf_length(after->ts->lexicon_idx);
for (size_t i = 0; i < n_items; i++) {
- PgfItem* item = gu_buf_get(parsing->lexicon_idx, PgfItem*, i);
+ PgfItem* new_item = gu_buf_get(after->ts->lexicon_idx, PgfItem*, i);
- if (item->base->ccat == ccat &&
- item->base->lin_idx == lin_idx &&
- gu_seq_length(item->args) == 0) {
- pgf_parsing_production(parsing, ccat, lin_idx,
- item->base->prod, conts);
+ if (new_item->base->ccat == ccat &&
+ new_item->base->lin_idx == lin_idx &&
+ gu_seq_length(new_item->args) == 0) {
+ pgf_parsing_production(before, ccat, lin_idx,
+ new_item->base->prod, conts);
}
}
}
// Bottom-up prediction for epsilon rules
PgfCFCat cfc = {ccat->fid, lin_idx};
- PgfCCat* eps_ccat = gu_map_get(parsing->concr->epsilon_idx,
+ PgfCCat* eps_ccat = gu_map_get(before->ps->concr->epsilon_idx,
&cfc, PgfCCat*);
if (eps_ccat != NULL) {
size_t n_prods = gu_seq_length(eps_ccat->prods);
@@ -802,7 +902,7 @@ pgf_parsing_td_predict(PgfParsing* parsing, PgfItem* item,
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
if (gu_seq_length(papp->args) == 0) {
- pgf_parsing_production(parsing, ccat, lin_idx,
+ pgf_parsing_production(before, ccat, lin_idx,
prod, conts);
}
break;
@@ -812,153 +912,207 @@ pgf_parsing_td_predict(PgfParsing* parsing, PgfItem* item,
}
} else {
/* If it has already been completed, combine. */
- PgfCCat* completed =
- pgf_parsing_get_completed(parsing, conts);
+
+ PgfCCat* completed =
+ pgf_parsing_get_completed(before, conts);
if (completed) {
- pgf_parsing_combine(parsing, item, completed, lin_idx);
+ pgf_parsing_combine(before, after, item, completed, lin_idx);
+ }
+
+ PgfParseState* state = after;
+ while (state != NULL) {
+ PgfCCat* completed =
+ pgf_parsing_get_completed(state, conts);
+ if (completed) {
+ pgf_parsing_combine(state, state->next, item, completed, lin_idx);
+ }
+
+ state = state->next;
}
}
- gu_exit(NULL);
+ gu_exit("<-");
}
static void
-pgf_parsing_bu_predict(PgfParsing* parsing, PgfItem* item,
- PgfItemBuf* metas, PgfItemBuf* agenda)
+pgf_parsing_bu_predict(PgfParseState* before, PgfParseState* after,
+ PgfItem* item, PgfItem* meta_item, PgfItemBuf* agenda,
+ bool print)
{
PgfItemBuf* conts =
- pgf_parsing_get_conts(parsing->conts_map,
+ pgf_parsing_get_conts(before->conts_map,
item->base->ccat, item->base->lin_idx,
- parsing->pool, parsing->tmp_pool);
-
- PgfItem* copy = pgf_item_copy(item, parsing->pool);
- copy->base = pgf_item_base_copy(item->base, parsing->pool);
- copy->base->conts = conts;
- gu_buf_push(agenda, PgfItem*, copy);
-
- if (gu_buf_length(conts) == 0) {
- size_t n_items;
-
- n_items = gu_buf_length(item->base->conts);
- for (size_t i = 0; i < n_items; i++) {
- PgfItem *item_ = gu_buf_get(item->base->conts, PgfItem*, i);
-
+ before->pool, before->pool);
+ gu_buf_push(conts, PgfItem*, meta_item);
+ if (gu_buf_length(conts) == 1) {
+ PgfItem* copy = pgf_item_copy(item, after->pool);
+ copy->base = pgf_item_base_copy(item->base, after->pool);
+ copy->base->conts = conts;
+ copy->outside_prob =
+ meta_item->inside_prob+meta_item->outside_prob;
+
#ifdef PGF_PARSER_DEBUG
+ copy->start = before->offset;
+ copy->end = before->offset;
+
+ if (print) {
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
- pgf_print_item(item_, wtr, err, tmp_pool);
+ pgf_print_item(copy, wtr, err, tmp_pool);
gu_pool_free(tmp_pool);
+ } else {
+ copy->end = after->offset;
+ }
#endif
- pgf_parsing_bu_predict(parsing, item_, metas, conts);
- }
+ gu_buf_push(agenda, PgfItem*, copy);
- n_items = gu_buf_length(metas);
+ size_t n_items = gu_buf_length(item->base->conts);
for (size_t i = 0; i < n_items; i++) {
- PgfItem *item_ = gu_buf_get(metas, PgfItem*, i);
- gu_buf_push(conts, PgfItem*, item_);
+ PgfItem *item_ = gu_buf_get(item->base->conts, PgfItem*, i);
+ pgf_parsing_bu_predict(before, after, item_, meta_item, conts, true);
+ }
+ } else {
+ /* If it has already been completed, combine. */
+
+ /*PgfCCat* completed =
+ pgf_parsing_get_completed(before, conts);
+ if (completed) {
+ pgf_parsing_combine(before, after, meta_item, completed, item->base->lin_idx);
+ }*/
+
+ PgfParseState* state = after;
+ while (state != NULL) {
+ PgfCCat* completed =
+ pgf_parsing_get_completed(state, conts);
+ if (completed) {
+ pgf_parsing_combine(state, state->next, meta_item, completed, item->base->lin_idx);
+ }
+
+ state = state->next;
}
}
+
}
static void
-pgf_parsing_symbol(PgfParsing* parsing, PgfItem* item, PgfSymbol sym) {
+pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
+ PgfItem* item, PgfSymbol sym) {
switch (gu_variant_tag(sym)) {
case PGF_SYMBOL_CAT: {
PgfSymbolCat* scat = gu_variant_data(sym);
PgfPArg* parg = gu_seq_index(item->args, PgfPArg, scat->d);
gu_assert(!parg->hypos || !parg->hypos->len);
- pgf_parsing_td_predict(parsing, item, parg->ccat, scat->r);
+ pgf_parsing_td_predict(before, after, item, parg->ccat, scat->r);
break;
}
case PGF_SYMBOL_KS: {
- PgfSymbolKS* sks = gu_variant_data(sym);
- gu_assert(item->tok_idx < gu_seq_length(sks->tokens));
- PgfToken tok =
- gu_seq_get(sks->tokens, PgfToken, item->tok_idx++);
- if (item->tok_idx == gu_seq_length(sks->tokens)) {
- item->tok_idx = 0;
- pgf_item_advance(item, parsing->pool);
- }
- pgf_parsing_add_transition(parsing, tok, item);
+ if (after != NULL) {
+ PgfSymbolKS* sks = gu_variant_data(sym);
+ gu_assert(item->tok_idx < gu_seq_length(sks->tokens));
+ PgfToken tok =
+ gu_seq_get(sks->tokens, PgfToken, item->tok_idx++);
+ if (item->tok_idx == gu_seq_length(sks->tokens)) {
+ item->tok_idx = 0;
+ pgf_item_advance(item, after->pool);
+ }
+#ifdef PGF_PARSER_DEBUG
+ item->end++;
+#endif
+ pgf_parsing_add_transition(before, after, tok, item);
+ }
break;
}
case PGF_SYMBOL_KP: {
- PgfSymbolKP* skp = gu_variant_data(sym);
- size_t idx = item->tok_idx;
- uint8_t alt = item->alt;
- gu_assert(idx < gu_seq_length(skp->default_form));
- if (idx == 0) {
- PgfToken tok;
- PgfItem* new_item;
-
- tok = gu_seq_get(skp->default_form, PgfToken, 0);
- new_item = pgf_item_copy(item, parsing->pool);
- new_item->tok_idx++;
- if (new_item->tok_idx == gu_seq_length(skp->default_form)) {
- new_item->tok_idx = 0;
- pgf_item_advance(new_item, parsing->pool);
- }
- pgf_parsing_add_transition(parsing, tok, new_item);
-
- for (size_t i = 0; i < skp->n_forms; i++) {
- // XXX: do nubbing properly
- PgfTokens toks = skp->forms[i].form;
- PgfTokens toks2 = skp->default_form;
- bool skip = pgf_tokens_equal(toks, toks2);
- for (size_t j = 0; j < i; j++) {
- PgfTokens toks2 = skp->forms[j].form;
- skip |= pgf_tokens_equal(toks, toks2);
+ if (after != NULL) {
+ PgfSymbolKP* skp = gu_variant_data(sym);
+ size_t idx = item->tok_idx;
+ uint8_t alt = item->alt;
+ gu_assert(idx < gu_seq_length(skp->default_form));
+ if (idx == 0) {
+ PgfToken tok;
+ PgfItem* new_item;
+
+ tok = gu_seq_get(skp->default_form, PgfToken, 0);
+ new_item = pgf_item_copy(item, after->pool);
+ new_item->tok_idx++;
+#ifdef PGF_PARSER_DEBUG
+ new_item->end++;
+#endif
+ if (new_item->tok_idx == gu_seq_length(skp->default_form)) {
+ new_item->tok_idx = 0;
+ pgf_item_advance(new_item, after->pool);
}
- if (!skip) {
- tok = gu_seq_get(toks, PgfToken, 0);
- new_item = pgf_item_copy(item, parsing->pool);
- new_item->tok_idx++;
- new_item->alt = i;
- if (new_item->tok_idx == gu_seq_length(toks)) {
- new_item->tok_idx = 0;
- pgf_item_advance(new_item, parsing->pool);
- }
- pgf_parsing_add_transition(parsing, tok, new_item);
+ pgf_parsing_add_transition(before, after, tok, new_item);
+
+ for (size_t i = 0; i < skp->n_forms; i++) {
+ // XXX: do nubbing properly
+ PgfTokens toks = skp->forms[i].form;
+ PgfTokens toks2 = skp->default_form;
+ bool skip = pgf_tokens_equal(toks, toks2);
+ for (size_t j = 0; j < i; j++) {
+ PgfTokens toks2 = skp->forms[j].form;
+ skip |= pgf_tokens_equal(toks, toks2);
+ }
+ if (!skip) {
+ tok = gu_seq_get(toks, PgfToken, 0);
+ new_item = pgf_item_copy(item, after->pool);
+ new_item->tok_idx++;
+#ifdef PGF_PARSER_DEBUG
+ new_item->end++;
+#endif
+ new_item->alt = i;
+ if (new_item->tok_idx == gu_seq_length(toks)) {
+ new_item->tok_idx = 0;
+ pgf_item_advance(new_item, after->pool);
+ }
+ pgf_parsing_add_transition(before, after, tok, new_item);
+ }
}
+ } else if (alt == 0) {
+ PgfToken tok =
+ gu_seq_get(skp->default_form, PgfToken, idx);
+ item->tok_idx++;
+#ifdef PGF_PARSER_DEBUG
+ item->end++;
+#endif
+ if (item->tok_idx == gu_seq_length(skp->default_form)) {
+ item->tok_idx = 0;
+ pgf_item_advance(item, after->pool);
+ }
+ pgf_parsing_add_transition(before, after, tok, item);
+ } else {
+ gu_assert(alt <= skp->n_forms);
+ PgfTokens toks = skp->forms[alt - 1].form;
+ PgfToken tok = gu_seq_get(toks, PgfToken, idx);
+ item->tok_idx++;
+#ifdef PGF_PARSER_DEBUG
+ item->end++;
+#endif
+ if (item->tok_idx == gu_seq_length(toks)) {
+ item->tok_idx = 0;
+ pgf_item_advance(item, after->pool);
+ }
+ pgf_parsing_add_transition(before, after, tok, item);
}
- } else if (alt == 0) {
- PgfToken tok =
- gu_seq_get(skp->default_form, PgfToken, idx);
- item->tok_idx++;
- if (item->tok_idx == gu_seq_length(skp->default_form)) {
- item->tok_idx = 0;
- pgf_item_advance(item, parsing->pool);
- }
- pgf_parsing_add_transition(parsing, tok, item);
- } else {
- gu_assert(alt <= skp->n_forms);
- PgfTokens toks = skp->forms[alt - 1].form;
- PgfToken tok = gu_seq_get(toks, PgfToken, idx);
- item->tok_idx++;
- if (item->tok_idx == gu_seq_length(toks)) {
- item->tok_idx = 0;
- pgf_item_advance(item, parsing->pool);
- }
- pgf_parsing_add_transition(parsing, tok, item);
}
break;
}
case PGF_SYMBOL_LIT: {
- if (!gu_string_eq(parsing->tok, gu_empty_string)) {
+ if (after != NULL) {
PgfSymbolLit* slit = gu_variant_data(sym);
PgfPArg* parg = gu_seq_index(item->args, PgfPArg, slit->d);
gu_assert(!parg->hypos || !parg->hypos->len);
if (parg->ccat->fid > 0 &&
- parg->ccat->fid >= parsing->concr->total_cats)
- pgf_parsing_td_predict(parsing, item, parg->ccat, slit->r);
+ parg->ccat->fid >= before->ps->concr->total_cats)
+ pgf_parsing_td_predict(before, after, item, parg->ccat, slit->r);
else {
PgfItemBuf* conts =
- pgf_parsing_get_conts(parsing->conts_map,
+ pgf_parsing_get_conts(before->conts_map,
parg->ccat, slit->r,
- parsing->pool, parsing->tmp_pool);
+ before->pool, before->pool);
gu_buf_push(conts, PgfItem*, item);
if (gu_buf_length(conts) == 1) {
@@ -966,7 +1120,7 @@ pgf_parsing_symbol(PgfParsing* parsing, PgfItem* item, PgfSymbol sym) {
* literal category so we must call the callback */
PgfLiteralCallback* callback =
- gu_map_get(parsing->concr->callbacks,
+ gu_map_get(before->ps->concr->callbacks,
parg->ccat->cnccat,
PgfLiteralCallback*);
@@ -975,14 +1129,33 @@ pgf_parsing_symbol(PgfParsing* parsing, PgfItem* item, PgfSymbol sym) {
PgfProductionExtern* pext =
gu_new_variant(PGF_PRODUCTION_EXTERN,
PgfProductionExtern,
- &prod, parsing->pool);
+ &prod, before->pool);
pext->fun = NULL;
- pext->args = gu_new_seq(PgfPArg, 0, parsing->pool);
+ pext->args = gu_new_seq(PgfPArg, 0, before->pool);
pext->callback = callback;
- pgf_parsing_production(parsing, parg->ccat, slit->r,
+ pgf_parsing_production(before, parg->ccat, slit->r,
prod, conts);
}
+ } else {
+ /* If it has already been completed, combine. */
+
+ PgfCCat* completed =
+ pgf_parsing_get_completed(before, conts);
+ if (completed) {
+ pgf_parsing_combine(before, after, item, completed, slit->r);
+ }
+
+ PgfParseState* state = after;
+ while (state != NULL) {
+ PgfCCat* completed =
+ pgf_parsing_get_completed(state, conts);
+ if (completed) {
+ pgf_parsing_combine(state, state->next, item, completed, slit->r);
+ }
+
+ state = state->next;
+ }
}
}
}
@@ -996,8 +1169,10 @@ pgf_parsing_symbol(PgfParsing* parsing, PgfItem* item, PgfSymbol sym) {
}
}
+PgfParseState *meta_after = NULL;
+
static void
-pgf_parsing_item(PgfParsing* parsing, PgfItem* item)
+pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
{
#ifdef PGF_PARSER_DEBUG
GuPool* tmp_pool = gu_new_pool();
@@ -1015,11 +1190,11 @@ pgf_parsing_item(PgfParsing* parsing, PgfItem* item)
PgfCncFun* fun = papp->fun;
PgfSequence seq = fun->lins[item->base->lin_idx];
if (item->seq_idx == gu_seq_length(seq)) {
- pgf_parsing_complete(parsing, item, NULL);
+ pgf_parsing_complete(before, after, item, NULL);
} else {
PgfSymbol sym =
gu_seq_get(seq, PgfSymbol, item->seq_idx);
- pgf_parsing_symbol(parsing, item, sym);
+ pgf_parsing_symbol(before, after, item, sym);
}
break;
}
@@ -1027,12 +1202,12 @@ pgf_parsing_item(PgfParsing* parsing, PgfItem* item)
PgfProductionCoerce* pcoerce = i.data;
switch (item->seq_idx) {
case 0:
- pgf_parsing_td_predict(parsing, item,
+ pgf_parsing_td_predict(before, after, item,
pcoerce->coerce,
item->base->lin_idx);
break;
case 1:
- pgf_parsing_complete(parsing, item, NULL);
+ pgf_parsing_complete(before, after, item, NULL);
break;
default:
gu_impossible();
@@ -1047,33 +1222,42 @@ pgf_parsing_item(PgfParsing* parsing, PgfItem* item)
if (fun != NULL &&
!gu_seq_is_null(seq = fun->lins[item->base->lin_idx])) {
if (item->seq_idx == gu_seq_length(seq)) {
- pgf_parsing_complete(parsing, item, NULL);
+ pgf_parsing_complete(before, after, item, NULL);
} else {
PgfSymbol sym =
gu_seq_get(seq, PgfSymbol, item->seq_idx);
- pgf_parsing_symbol(parsing, item, sym);
+ pgf_parsing_symbol(before, after, item, sym);
}
} else {
+ PgfToken tok = (after != NULL) ? after->ts->tok
+ : gu_empty_string;
+
+ meta_after = after;
bool accepted =
- pext->callback->match(parsing->concr, item, parsing->tok,
- &parsing->meta_ep, parsing->pool);
+ pext->callback->match(before->ps->concr, item,
+ tok,
+ &before->meta_ep, before->pool);
+ meta_after = NULL;
- if (parsing->meta_ep != NULL)
- pgf_parsing_complete(parsing, item, parsing->meta_ep);
+ if (before->meta_ep != NULL)
+ pgf_parsing_complete(before, after, item, before->meta_ep);
- if (accepted) {
+ if (accepted && after != NULL) {
PgfSymbol prev = item->curr_sym;
PgfSymbolKS* sks = (PgfSymbolKS*)
gu_alloc_variant(PGF_SYMBOL_KS,
sizeof(PgfSymbolKS)+sizeof(PgfSymbol),
gu_alignof(PgfSymbolKS),
- &item->curr_sym, parsing->pool);
+ &item->curr_sym, after->pool);
*((PgfSymbol*)(sks+1)) = prev;
- sks->tokens = gu_new_seq(PgfToken, 1, parsing->pool);
- gu_seq_set(sks->tokens, PgfToken, 0, parsing->tok);
+ sks->tokens = gu_new_seq(PgfToken, 1, after->pool);
+ gu_seq_set(sks->tokens, PgfToken, 0, tok);
item->seq_idx++;
- pgf_parsing_add_transition(parsing, parsing->tok, item);
+#ifdef PGF_PARSER_DEBUG
+ item->end++;
+#endif
+ pgf_parsing_add_transition(before, after, tok, item);
}
}
break;
@@ -1083,80 +1267,6 @@ pgf_parsing_item(PgfParsing* parsing, PgfItem* item)
}
}
-static PgfParsing*
-pgf_new_parsing(PgfConcr* concr, PgfLexCallback* callback, int max_fid,
- GuPool* parse_pool, GuPool* out_pool)
-{
- PgfParsing* parsing = gu_new(PgfParsing, out_pool);
- parsing->concr = concr;
- parsing->generated_cats = gu_map_type_new(PgfGenCatMap, out_pool);
- parsing->conts_map = gu_map_type_new(PgfContsMap, out_pool);
- parsing->completed = gu_new_buf(PgfCCat*, parse_pool);
- parsing->lex_callback = callback;
- parsing->lexicon_idx = NULL;
- parsing->meta_ep = NULL;
- parsing->metas = gu_new_buf(PgfItem*, out_pool);
- parsing->pool = parse_pool;
- parsing->tmp_pool = out_pool;
- parsing->tok = gu_empty_string;
- parsing->max_fid = max_fid;
- return parsing;
-}
-
-static PgfParse*
-pgf_new_parse(PgfConcr* concr, int max_fid, GuPool* pool)
-{
- PgfParse* parse = gu_new(PgfParse, pool);
- parse->concr = concr;
- parse->agenda = NULL;
- parse->max_fid = max_fid;
- return parse;
-}
-
-typedef struct {
- PgfLexCallback fn;
- PgfToken tok;
- PgfItemBuf* agenda;
-} PgfParseTokenCallback;
-
-static void
-pgf_match_token(PgfLexCallback* self, PgfToken tok, PgfItem* item)
-{
- PgfParseTokenCallback *clo = (PgfParseTokenCallback *) self;
-
- if (gu_string_eq(tok, clo->tok)) {
- gu_buf_push(clo->agenda, PgfItem*, item);
- }
-}
-
-typedef struct {
- GuMapItor fn;
- PgfProduction prod;
- PgfParsing *parsing;
-} PgfGetMetaFn;
-
-static void
-pgf_parsing_get_metas(GuMapItor* fn, const void* key, void* value,
- GuExn* err)
-{
- PgfGetMetaFn* clo = (PgfGetMetaFn*) fn;
- PgfCCat *ccat = (PgfCCat *) key;
- PgfItemBufs* contss = *((PgfItemBufs **) value);
- PgfProduction prod = clo->prod;
- PgfParsing *parsing = clo->parsing;
-
- size_t n_lins = gu_list_length(contss);
- for (size_t lin_idx = 0; lin_idx < n_lins; lin_idx++) {
- PgfItemBuf* conts = gu_list_index(contss, lin_idx);
-
- if (conts != NULL) {
- PgfItem *item =
- pgf_new_item(ccat, lin_idx, prod, conts, parsing->pool);
- gu_buf_push(parsing->metas, PgfItem*, item);
- }
- }
-}
-
static bool
pgf_match_meta(PgfConcr* concr, PgfItem *item, PgfToken tok,
PgfExprProb** out_ep, GuPool *pool)
@@ -1165,7 +1275,12 @@ pgf_match_meta(PgfConcr* concr, PgfItem *item, PgfToken tok,
if (n_syms > 0) {
PgfExprProb *ep = gu_new(PgfExprProb, pool);
- ep->prob = 100000000000 + rand();
+ ep->prob = item->inside_prob;
+ size_t n_args = gu_seq_length(item->args);
+ for (size_t i = 0; i < n_args; i++) {
+ PgfPArg* arg = gu_seq_index(item->args, PgfPArg, i);
+ ep->prob -= arg->ccat->viterbi_prob;
+ }
PgfExprMeta *expr_meta =
gu_new_variant(PGF_EXPR_META,
PgfExprMeta,
@@ -1176,15 +1291,25 @@ pgf_match_meta(PgfConcr* concr, PgfItem *item, PgfToken tok,
} else {
*out_ep = NULL;
}
-
- if (gu_map_get(concr->lexicon_idx, &tok, GuBuf*) == NULL)
- return true;
- else {
- PgfParsing* parsing =
- gu_container(out_ep, PgfParsing, meta_ep);
-
- gu_buf_push(parsing->metas, PgfItem*, item);
- return false;
+
+ PgfParseState* after = meta_after;
+ if (after != NULL) {
+ if (after->ts->lexicon_idx == NULL)
+ return true;
+ else {
+ PgfParseState* before =
+ gu_container(out_ep, PgfParseState, meta_ep);
+
+ size_t n_items = gu_buf_length(after->ts->lexicon_idx);
+ for (size_t i = 0; i < n_items; i++) {
+ PgfItem* item_ =
+ gu_buf_get(after->ts->lexicon_idx, PgfItem*, i);
+ pgf_parsing_bu_predict(before, after,
+ item_, item, after->agenda, false);
+ after->ps->target = item_;
+ }
+ return false;
+ }
}
return false;
@@ -1193,260 +1318,125 @@ pgf_match_meta(PgfConcr* concr, PgfItem *item, PgfToken tok,
static PgfLiteralCallback pgf_meta_callback =
{ pgf_match_meta } ;
-static void
-pgf_parsing_collect_metas(PgfParsing* parsing, bool print)
-{
- PgfProduction prod;
- PgfProductionExtern* pext =
- gu_new_variant(PGF_PRODUCTION_EXTERN,
- PgfProductionExtern,
- &prod, parsing->pool);
- pext->fun = NULL;
- pext->args = gu_new_seq(PgfPArg, 0, parsing->pool);
- pext->callback = &pgf_meta_callback;
-#ifdef PGF_PARSER_DEBUG
- int n_zero_metas = gu_buf_length(parsing->metas);
-#endif
-
- PgfGetMetaFn clo2 = { { pgf_parsing_get_metas }, prod, parsing };
- gu_map_iter(parsing->conts_map, &clo2.fn, NULL);
-
-#ifdef PGF_PARSER_DEBUG
- if (print) {
- GuPool* tmp_pool = gu_new_pool();
- GuOut* out = gu_file_out(stderr, tmp_pool);
- GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
- GuExn* err = gu_exn(NULL, type, tmp_pool);
-
- size_t n_items = gu_buf_length(parsing->metas);
- for (size_t i = n_zero_metas; i < n_items; i++) {
- PgfItem* item = gu_buf_get(parsing->metas, PgfItem*, i);
- pgf_print_item(item, wtr, err, tmp_pool);
- }
- gu_pool_free(tmp_pool);
- }
-#endif
-}
-
-PgfParse*
-pgf_parse_token(PgfParse* parse, PgfToken tok, bool robust, GuPool* pool)
-{
- PgfItemBuf* agenda = gu_new_buf(PgfItem*, pool);
-
- PgfParseTokenCallback clo1 = {{ pgf_match_token }, tok, agenda };
-
- GuPool* tmp_pool = gu_new_pool();
- PgfParsing* parsing = pgf_new_parsing(parse->concr, &clo1.fn, parse->max_fid, pool, tmp_pool);
- parsing->tok = tok;
- parsing->lexicon_idx = gu_map_get(parse->concr->lexicon_idx, &tok, GuBuf*);
-
- size_t n_items = gu_buf_length(parse->agenda);
- for (size_t i = 0; i < n_items; i++) {
- PgfItem* item = gu_buf_get(parse->agenda, PgfItem*, i);
- pgf_parsing_item(parsing, item);
- }
-
- if (robust) {
- if (parsing->lexicon_idx != NULL) {
- bool flag = false;
-
- size_t n_items = gu_buf_length(parsing->lexicon_idx);
- for (size_t i = 0; i < n_items; i++) {
- PgfItem* item = gu_buf_get(parsing->lexicon_idx, PgfItem*, i);
-
- if (!pgf_parsing_has_conts(parsing->conts_map,
- item->base->ccat, item->base->lin_idx)) {
- if (!flag) {
- pgf_parsing_collect_metas(parsing, true);
- flag = true;
- }
- pgf_parsing_bu_predict(parsing, item, parsing->metas, agenda);
+static void
+pgf_parsing_proceed(PgfParseState* state, void** output) {
+ while (*output == NULL) {
+ float best_prob = INFINITY;
+ PgfParseState* before = NULL;
+
+ PgfParseState* st = state;
+ while (st != NULL) {
+ if (gu_buf_length(st->agenda) > 0) {
+ PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0);
+ float item_prob = item->inside_prob+item->outside_prob;
+ if (item_prob < best_prob) {
+ best_prob = item_prob;
+ before = st;
}
}
- } else {
- // We have unknown word
- pgf_parsing_collect_metas(parsing, false);
-
- size_t n_items = gu_buf_length(parsing->metas);
- for (size_t i = 0; i < n_items; i++) {
- PgfItem* item = gu_buf_get(parsing->metas, PgfItem*, i);
- pgf_parsing_item(parsing, item);
- }
+ st = st->next;
}
- }
-
- PgfParse* next_parse = NULL;
- if (gu_buf_length(agenda) > 0) {
- next_parse = pgf_new_parse(parse->concr, parse->max_fid, pool);
- next_parse->agenda = agenda;
- next_parse->max_fid= parsing->max_fid;
- }
- gu_pool_free(tmp_pool);
- return next_parse;
-}
+ if (before == NULL)
+ break;
-static PgfExpr
-pgf_cat_to_expr(PgfConcr* concr, PgfCCat* cat,
- PgfCCatBuf* visited, GuChoice* choice,
- GuPool* pool);
+ PgfParseState* after = NULL;
-static PgfExpr
-pgf_production_to_expr(PgfConcr* concr, PgfProduction prod,
- PgfCCatBuf* visited, GuChoice* choice,
- GuPool* pool)
-{
- GuVariantInfo pi = gu_variant_open(prod);
- switch (pi.tag) {
- case PGF_PRODUCTION_APPLY: {
- PgfProductionApply* papp = pi.data;
- PgfExpr expr = papp->fun->ep->expr;
- size_t n_args = gu_seq_length(papp->args);
- for (size_t i = 0; i < n_args; i++) {
- PgfPArg* parg = gu_seq_index(papp->args, PgfPArg, i);
- gu_assert(!parg->hypos || !parg->hypos->len);
- PgfExpr earg = pgf_cat_to_expr(concr, parg->ccat, visited, choice, pool);
- if (gu_variant_is_null(earg))
- return gu_null_variant;
- expr = gu_new_variant_i(pool, PGF_EXPR_APP,
- PgfExprApp,
- .fun = expr, .arg = earg);
- }
- return expr;
- }
- case PGF_PRODUCTION_COERCE: {
- PgfProductionCoerce* pcoerce = pi.data;
- return pgf_cat_to_expr(concr, pcoerce->coerce, visited, choice, pool);
- }
- case PGF_PRODUCTION_EXTERN: {
- PgfProductionExtern* pext = pi.data;
- PgfExpr expr = pext->fun->ep->expr;
- size_t n_args = gu_seq_length(pext->args);
- for (size_t i = 0; i < n_args; i++) {
- PgfPArg* parg = gu_seq_index(pext->args, PgfPArg, i);
- gu_assert(!parg->hypos || !parg->hypos->len);
- PgfExpr earg = pgf_cat_to_expr(concr, parg->ccat, visited, choice, pool);
- if (gu_variant_is_null(earg))
- return gu_null_variant;
- expr = gu_new_variant_i(pool, PGF_EXPR_APP,
- PgfExprApp,
- .fun = expr, .arg = earg);
+ st = state;
+ while (st != before) {
+ PgfParseState* tmp = st->next;
+ st->next = after;
+ after = st;
+ st = tmp;
}
- return expr;
- }
- default:
- gu_impossible();
- }
- return gu_null_variant;
-}
-static PgfExpr
-pgf_cat_to_expr(PgfConcr* concr, PgfCCat* cat,
- PgfCCatBuf* visited, GuChoice* choice,
- GuPool* pool)
-{
- if (cat->fid < concr->total_cats) {
- // XXX: What should the PgfMetaId be?
- return gu_new_variant_i(pool, PGF_EXPR_META,
- PgfExprMeta,
- .id = 0);
- }
+ PgfItem* item;
+ gu_buf_heap_pop(before->agenda, &pgf_item_prob_order, &item);
+ pgf_parsing_item(before, after, item);
- size_t n_prods = gu_seq_length(cat->prods);
- for (size_t i = 0; i < gu_buf_length(visited); i++) {
- if (gu_buf_get(visited, PgfCCat*, i) == cat) {
- n_prods = 0;
- break;
+ while (after != NULL) {
+ PgfParseState* tmp = after->next;
+ after->next = before;
+ before = after;
+ after = tmp;
}
+ state = before;
}
- gu_buf_push(visited, PgfCCat*, cat);
-
- int i = gu_choice_next(choice, n_prods);
- if (i == -1) {
- return gu_null_variant;
- }
- PgfProduction prod = gu_seq_get(cat->prods, PgfProduction, i);
- return pgf_production_to_expr(concr, prod, visited, choice, pool);
}
-static PgfExpr
-pgf_parse_result_next(PgfParseResult* pr, GuPool* pool)
+static PgfParsing*
+pgf_new_parsing(PgfConcr* concr, GuPool* pool)
{
- if (pr->choice == NULL) {
- return gu_null_variant;
- }
-
- PgfExpr ret = gu_null_variant;
-
- do {
- size_t n_results = gu_buf_length(pr->completed);
- GuChoiceMark mark = gu_choice_mark(pr->choice);
- int i = gu_choice_next(pr->choice, n_results);
- if (i == -1) {
- return gu_null_variant;
- }
- PgfCCat* cat = gu_buf_get(pr->completed, PgfCCat*, i);
-
- GuPool* tmp_pool = gu_new_pool();
- PgfCCatBuf* visited = gu_new_buf(PgfCCat*, tmp_pool);
- ret = pgf_cat_to_expr(pr->concr, cat, visited, pr->choice, pool);
- gu_pool_free(tmp_pool);
-
- gu_choice_reset(pr->choice, mark);
- if (!gu_choice_advance(pr->choice)) {
- pr->choice = NULL;
- };
- } while (gu_variant_is_null(ret));
-
- return ret;
+ PgfParsing* ps = gu_new(PgfParsing, pool);
+ ps->concr = concr;
+ ps->completed = NULL;
+ ps->target = NULL;
+ ps->max_fid = concr->max_fid;
+
+ PgfProductionExtern* pext =
+ gu_new_variant(PGF_PRODUCTION_EXTERN,
+ PgfProductionExtern,
+ &ps->meta_prod, pool);
+ pext->fun = NULL;
+ pext->args = gu_new_seq(PgfPArg, 0, pool);
+ pext->callback = &pgf_meta_callback;
+
+ return ps;
}
-static void
-pgf_parse_result_enum_next(GuEnum* self, void* to, GuPool* pool)
+static PgfParseState*
+pgf_new_parse_state(PgfParsing* ps,
+ PgfParseState* next,
+ PgfTokenState* ts, int offset,
+ GuPool* pool)
{
- PgfParseResult* pr = gu_container(self, PgfParseResult, en);
- *(PgfExpr*)to = pgf_parse_result_next(pr, pool);
+ PgfParseState* state = gu_new(PgfParseState, pool);
+ state->pool = pool;
+ state->next = next;
+ state->agenda = gu_new_buf(PgfItem*, pool);
+ state->generated_cats = gu_map_type_new(PgfGenCatMap, pool);
+ state->conts_map = gu_map_type_new(PgfContsMap, pool);
+ state->meta_ep = NULL;
+ state->offset = offset;
+ state->ps = ps;
+ state->ts = ts;
+ return state;
}
-static void
-pgf_lex_noop(PgfLexCallback* self, PgfToken tok, PgfItem* item)
+static PgfTokenState*
+pgf_new_token_state(PgfConcr *concr, PgfToken tok, GuPool* pool)
{
+ PgfTokenState* ts = gu_new(PgfTokenState, pool);
+ ts->tok = tok;
+ ts->lexicon_idx = gu_map_get(concr->lexicon_idx, &tok, GuBuf*);
+ return ts;
}
-static PgfLexCallback lex_callback_noop =
- { pgf_lex_noop };
-
-PgfExprEnum*
-pgf_parse_result(PgfParse* parse, GuPool* pool)
+PgfParseState*
+pgf_parser_next_state(PgfParseState* prev, PgfToken tok, GuPool* pool)
{
- GuPool* tmp_pool = gu_new_pool();
- PgfParsing* parsing =
- pgf_new_parsing(parse->concr,
- &lex_callback_noop,
- parse->max_fid,
- pool, tmp_pool);
- size_t n_items = gu_buf_length(parse->agenda);
- for (size_t i = 0; i < n_items; i++) {
- PgfItem* item = gu_buf_get(parse->agenda, PgfItem*, i);
- pgf_parsing_item(parsing, item);
- }
-
- PgfExprEnum* en =
- &gu_new_i(pool, PgfParseResult,
- .concr = parse->concr,
- .completed = parsing->completed,
- .choice = gu_new_choice(pool),
- .en.next = pgf_parse_result_enum_next)->en;
+ PgfTokenState* ts =
+ pgf_new_token_state(prev->ps->concr,tok,pool);
+ PgfParseState* state =
+ pgf_new_parse_state(prev->ps, prev,
+ ts, prev->offset+1,
+ pool);
+
+ state->ps->target = NULL;
+ pgf_parsing_proceed(state, (void**) &state->ps->target);
+ if (state->ps->target != NULL) {
+ return state;
+ }
- gu_pool_free(tmp_pool);
- return en;
+ return NULL;
}
-int cmp_expr_prob(GuOrder* self, const void* a, const void* b)
+static int
+cmp_expr_qstate(GuOrder* self, const void* a, const void* b)
{
- PgfExprProb *s1 = *((PgfExprProb **) a);
- PgfExprProb *s2 = *((PgfExprProb **) b);
+ PgfExprQState *s1 = (PgfExprQState *) a;
+ PgfExprQState *s2 = (PgfExprQState *) b;
if (s1->prob < s2->prob)
return -1;
@@ -1457,12 +1447,28 @@ int cmp_expr_prob(GuOrder* self, const void* a, const void* b)
}
static GuOrder
-pgf_expr_prob_order = { cmp_expr_prob };
+pgf_expr_qstate_order = { cmp_expr_qstate };
static void
-pgf_parse_best_result_init(PgfCCat *ccat, GuBuf *pqueue,
- GuPool *tmp_pool, GuPool *out_pool)
+pgf_result_cat_init(PgfParseResult* pr,
+ PgfExprState* cont, float cont_prob, PgfCCat* ccat)
{
+ // Checking for loops in the chart
+ if (cont != NULL) {
+ PgfExprState* st = cont->cont;
+ while (st != NULL) {
+ PgfPArg* arg = gu_seq_index(st->args, PgfPArg, st->arg_idx);
+ if (arg->ccat == ccat)
+ return;
+
+ st = st->cont;
+ }
+ }
+
+ if (gu_seq_is_null(ccat->prods))
+ return;
+
+ // Generation
size_t n_prods = gu_seq_length(ccat->prods);
for (size_t i = 0; i < n_prods; i++) {
PgfProduction prod =
@@ -1473,27 +1479,44 @@ pgf_parse_best_result_init(PgfCCat *ccat, GuBuf *pqueue,
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = pi.data;
- PgfExprState *st = gu_new(PgfExprState, tmp_pool);
- st->ep = *papp->fun->ep;
+ PgfExprState *st = gu_new(PgfExprState, pr->tmp_pool);
+ st->cont = cont;
+ st->expr = papp->fun->ep->expr;
st->args = papp->args;
st->arg_idx = 0;
- gu_buf_heap_push(pqueue, &pgf_expr_prob_order, &st);
+
+ PgfExprQState q = {st, cont_prob + papp->fun->ep->prob};
+ size_t n_args = gu_seq_length(st->args);
+ for (size_t k = 0; k < n_args; k++) {
+ PgfPArg* parg = gu_seq_index(st->args, PgfPArg, k);
+ q.prob += parg->ccat->viterbi_prob;
+ }
+
+ gu_buf_heap_push(pr->pqueue, &pgf_expr_qstate_order, &q);
break;
}
case PGF_PRODUCTION_COERCE: {
PgfProductionCoerce* pcoerce = pi.data;
- pgf_parse_best_result_init(pcoerce->coerce, pqueue,
- tmp_pool, out_pool);
+ pgf_result_cat_init(pr, cont, cont_prob, pcoerce->coerce);
break;
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = pi.data;
- PgfExprState *st = gu_new(PgfExprState, tmp_pool);
- st->ep = *pext->fun->ep;
+ PgfExprState *st = gu_new(PgfExprState, pr->tmp_pool);
+ st->cont = cont;
+ st->expr = pext->fun->ep->expr;
st->args = pext->args;
st->arg_idx = 0;
- gu_buf_heap_push(pqueue, &pgf_expr_prob_order, &st);
+
+ PgfExprQState q = {st, cont_prob + pext->fun->ep->prob};
+ size_t n_args = gu_seq_length(st->args);
+ for (size_t k = 0; k < n_args; k++) {
+ PgfPArg* parg = gu_seq_index(st->args, PgfPArg, k);
+ q.prob += parg->ccat->viterbi_prob;
+ }
+
+ gu_buf_heap_push(pr->pqueue, &pgf_expr_qstate_order, &q);
break;
}
}
@@ -1501,115 +1524,103 @@ pgf_parse_best_result_init(PgfCCat *ccat, GuBuf *pqueue,
}
static PgfExprProb*
-pgf_parse_best_ccat_result(
- PgfExprCache *cache, PgfConcr *concr,
- PgfCCat *ccat, GuPool *pool)
+pgf_parse_result_next(PgfParseResult* pr, GuPool* pool)
{
- PgfExprProb* ep = NULL;
- if (gu_map_has(cache, ccat)) {
- ep = gu_map_get(cache, ccat, PgfExprProb*);
- return ep;
- }
-
- gu_map_put(cache, ccat, PgfExprProb*, NULL);
-
- GuPool *tmp_pool = gu_new_pool();
-
- GuBuf* pqueue = gu_new_buf(PgfExprState*, tmp_pool);
- pgf_parse_best_result_init(ccat, pqueue, tmp_pool, pool);
-
- while (gu_buf_length(pqueue) > 0) {
- PgfExprState *st;
- gu_buf_heap_pop(pqueue, &pgf_expr_prob_order, &st);
-
- if (st->arg_idx >= gu_seq_length(st->args)) {
- ep = gu_new(PgfExprProb, pool);
- *ep = st->ep;
- gu_map_put(cache, ccat, PgfExprProb*, ep);
- break;
+ if (pr->pqueue == NULL)
+ return NULL;
+
+ for (;;) {
+ if (pr->state->ps->completed == NULL) {
+ pgf_parsing_proceed(pr->state,
+ (void**) &pr->state->ps->completed);
+ if (pr->state->ps->completed == NULL)
+ return NULL;
+ pgf_result_cat_init(pr, NULL, 0, pr->state->ps->completed);
}
- PgfPArg *parg = gu_seq_index(st->args, PgfPArg, st->arg_idx++);
-
- double prob = 0;
- PgfExpr fun = st->ep.expr;
- PgfExpr arg;
+ while (gu_buf_length(pr->pqueue) > 0) {
+ PgfExprQState q;
+ gu_buf_heap_pop(pr->pqueue, &pgf_expr_qstate_order, &q);
+
+ if (q.st->arg_idx < gu_seq_length(q.st->args)) {
+ PgfPArg* arg = gu_seq_index(q.st->args, PgfPArg, q.st->arg_idx);
+ float cont_prob = q.prob - arg->ccat->viterbi_prob;
+ if (arg->ccat->fid < pr->state->ps->concr->total_cats) {
+ PgfExpr arg_meta;
+ PgfExprMeta *expr_meta =
+ gu_new_variant(PGF_EXPR_META,
+ PgfExprMeta,
+ &arg_meta, pool);
+ expr_meta->id = 0;
+
+ q.st->expr =
+ gu_new_variant_i(pool, PGF_EXPR_APP,
+ PgfExprApp,
+ .fun = q.st->expr,
+ .arg = arg_meta);
+ q.st->arg_idx++;
+ gu_buf_heap_push(pr->pqueue, &pgf_expr_qstate_order, &q);
+ } else {
+ pgf_result_cat_init(pr, q.st, cont_prob, arg->ccat);
+ }
+ } else {
+ if (q.st->cont == NULL) {
+ PgfExprProb* ep = gu_new(PgfExprProb, pool);
+ ep->expr = q.st->expr;
+ ep->prob = q.prob;
+ return ep;
+ }
- if (parg->ccat->fid < concr->total_cats) {
- PgfExprMeta *expr_meta =
- gu_new_variant(PGF_EXPR_META,
- PgfExprMeta,
- &arg, pool);
- expr_meta->id = 0;
- } else {
- PgfExprProb *ep_arg =
- pgf_parse_best_ccat_result(cache, concr,
- parg->ccat, pool);
- if (ep_arg == NULL)
- continue;
+ PgfExprState* st = gu_new(PgfExprState, pr->tmp_pool);
+ st->cont = q.st->cont->cont;
+ st->expr =
+ gu_new_variant_i(pool, PGF_EXPR_APP,
+ PgfExprApp,
+ .fun = q.st->cont->expr, .arg = q.st->expr);
+ st->args = q.st->cont->args;
+ st->arg_idx = q.st->cont->arg_idx+1;
- arg = ep_arg->expr;
- prob = ep_arg->prob;
+ PgfExprQState q2 = {st, q.prob};
+ gu_buf_heap_push(pr->pqueue, &pgf_expr_qstate_order, &q2);
+ }
}
- PgfExprApp *expr_apply =
- gu_new_variant(PGF_EXPR_APP,
- PgfExprApp,
- &st->ep.expr, pool);
- expr_apply->fun = fun;
- expr_apply->arg = arg;
- st->ep.prob += prob;
- gu_buf_heap_push(pqueue, &pgf_expr_prob_order, &st);
+
+ pr->state->ps->completed = NULL;
}
- gu_pool_free(tmp_pool);
+ gu_pool_free(pr->tmp_pool);
+ pr->tmp_pool = NULL;
+ pr->pqueue = NULL;
+ return NULL;
+}
- return ep;
+static void
+pgf_parse_result_enum_next(GuEnum* self, void* to, GuPool* pool)
+{
+ PgfParseResult* pr = gu_container(self, PgfParseResult, en);
+ *(PgfExprProb**)to = pgf_parse_result_next(pr, pool);
}
-PgfExpr
-pgf_parse_best_result(PgfParse* parse, GuPool* pool)
+PgfExprEnum*
+pgf_parse_result(PgfParseState* state, GuPool* pool)
{
GuPool* tmp_pool = gu_new_pool();
- PgfParsing* parsing =
- pgf_new_parsing(parse->concr,
- &lex_callback_noop,
- parse->max_fid,
- pool, tmp_pool);
- size_t n_items = gu_buf_length(parse->agenda);
- for (size_t i = 0; i < n_items; i++) {
- PgfItem* item = gu_buf_get(parse->agenda, PgfItem*, i);
- pgf_parsing_item(parsing, item);
- }
-
- PgfExprCache *cache = gu_map_type_new(PgfExprCache, tmp_pool);
-
- GuBuf* pqueue = gu_new_buf(PgfExprProb*, tmp_pool);
+ GuBuf* pqueue = gu_new_buf(PgfExprQState, tmp_pool);
- size_t n_completed = gu_buf_length(parsing->completed);
- for (size_t i = 0; i < n_completed; i++) {
- PgfCCat *ccat = gu_buf_get(parsing->completed, PgfCCat*, i);
-
- PgfExprProb *ep =
- pgf_parse_best_ccat_result(cache, parse->concr, ccat, pool);
- if (ep != NULL)
- gu_buf_heap_push(pqueue, &pgf_expr_prob_order, &ep);
- }
-
- PgfExpr expr = gu_null_variant;
- if (gu_buf_length(pqueue) > 0) {
- PgfExprProb* ep = NULL;
- gu_buf_heap_pop(pqueue, &pgf_expr_prob_order, &ep);
- expr = ep->expr;
- }
-
- gu_pool_free(tmp_pool);
+ PgfExprEnum* en =
+ &gu_new_i(pool, PgfParseResult,
+ .tmp_pool = tmp_pool,
+ .state = state,
+ .pqueue = pqueue,
+ .en.next = pgf_parse_result_enum_next)->en;
- return expr;
+ return en;
}
+
// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat
-PgfParse*
-pgf_parser_parse(PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool)
+PgfParseState*
+pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool)
{
PgfCncCat* cnccat =
gu_map_get(concr->cnccats, &cat, PgfCncCat*);
@@ -1619,8 +1630,10 @@ pgf_parser_parse(PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool)
}
gu_assert(lin_idx < cnccat->n_lins);
- PgfParse* parse = pgf_new_parse(concr, concr->max_fid, pool);
- parse->agenda = gu_new_buf(PgfItem*, pool);
+ PgfParsing* ps =
+ pgf_new_parsing(concr, pool);
+ PgfParseState* state =
+ pgf_new_parse_state(ps, NULL, NULL, 0, pool);
PgfItemBuf* conts = gu_new_buf(PgfItem*, pool);
gu_buf_push(conts, PgfItem*, NULL);
@@ -1640,12 +1653,18 @@ pgf_parser_parse(PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool)
PgfProduction prod =
gu_seq_get(prods, PgfProduction, i);
PgfItem* item =
- pgf_new_item(ccat, lin_idx, prod, conts, pool);
- gu_buf_push(parse->agenda, PgfItem*, item);
+ pgf_new_item(0, ccat, lin_idx, prod, conts, pool);
+ gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
}
+
+ PgfItem *item =
+ pgf_new_item(0, ccat, lin_idx, ps->meta_prod, conts, pool);
+ item->inside_prob =
+ 1000000 + ccat->cnccat->abscat->meta_prob * 1000;
+ gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
}
}
- return parse;
+ return state;
}
void
@@ -1767,6 +1786,7 @@ pgf_parser_bu_item(PgfConcr* concr, PgfItem* item,
if (eps_ccat == NULL) {
eps_ccat = gu_new(PgfCCat, pool);
eps_ccat->cnccat = item->base->ccat->cnccat;
+ eps_ccat->viterbi_prob = INFINITY;
eps_ccat->fid = concr->max_fid++;
eps_ccat->prods =
gu_buf_seq(gu_new_buf(PgfProduction, pool));
@@ -1875,7 +1895,7 @@ pgf_parser_bu_index(PgfConcr* concr, PgfCCat* ccat, PgfProduction prod,
pgf_parsing_get_conts(conts_map, ccat, lin_idx,
pool, tmp_pool);
PgfItem* item =
- pgf_new_item(ccat, lin_idx, prod, conts, pool);
+ pgf_new_item(0, ccat, lin_idx, prod, conts, pool);
pgf_parser_bu_item(concr, item, conts_map, pool, tmp_pool);
}
diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h
index 4e42fbaa0..b1dc3f32b 100644
--- a/src/runtime/c/pgf/parser.h
+++ b/src/runtime/c/pgf/parser.h
@@ -15,7 +15,7 @@
* @todo HOAS, dependent types...
*/
-typedef struct PgfParse PgfParse;
+typedef struct PgfParseState PgfParseState;
/** @}
*
@@ -32,8 +32,9 @@ typedef struct PgfParse PgfParse;
*/
/// Begin parsing
-PgfParse*
-pgf_parser_parse(PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool);
+PgfParseState*
+pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx,
+ GuPool* pool);
/**<
* @param parser The parser to use
*
@@ -48,8 +49,9 @@ pgf_parser_parse(PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool);
/// Feed a token to the parser
-PgfParse*
-pgf_parse_token(PgfParse* parse, PgfToken tok, bool robust, GuPool* pool);
+PgfParseState*
+pgf_parser_next_state(PgfParseState* prev, PgfToken tok,
+ GuPool* pool);
/**<
* @param parse The current parse state
*
@@ -87,7 +89,7 @@ typedef GuEnum PgfExprEnum;
/// Retrieve the current parses from the parse state.
PgfExprEnum*
-pgf_parse_result(PgfParse* parse, GuPool* pool);
+pgf_parse_result(PgfParseState* state, GuPool* pool);
/**<
* @param parse A parse state
*
@@ -101,7 +103,7 @@ pgf_parse_result(PgfParse* parse, GuPool* pool);
*/
PgfExpr
-pgf_parse_best_result(PgfParse* parse, GuPool* pool);
+pgf_parse_best_result(PgfParseState* state, GuPool* pool);
int
diff --git a/src/runtime/c/pgf/printer.c b/src/runtime/c/pgf/printer.c
index 2b1729749..140688eb5 100644
--- a/src/runtime/c/pgf/printer.c
+++ b/src/runtime/c/pgf/printer.c
@@ -40,7 +40,7 @@ pgf_print_cat(GuMapItor* fn, const void* key, void* value,
pgf_print_hypo(hypo, 4, wtr, err);
}
- gu_puts(" ;\n", wtr, err);
+ gu_printf(wtr, err, " ; -- %f\n",cat->meta_prob);
}
void
@@ -56,7 +56,7 @@ pgf_print_absfun(GuMapItor* fn, const void* key, void* value,
gu_string_write(name, wtr, err);
gu_puts(" : ", wtr, err);
pgf_print_type(fun->type, 0, wtr, err);
- gu_puts(" ;\n", wtr, err);
+ gu_printf(wtr, err, " ; -- %f\n", fun->ep.prob);
}
static void
pgf_print_abstract(PgfCId absname, PgfAbstr* abstr,
diff --git a/src/runtime/c/pgf/reader.c b/src/runtime/c/pgf/reader.c
index 6b287f68e..1fee45f83 100644
--- a/src/runtime/c/pgf/reader.c
+++ b/src/runtime/c/pgf/reader.c
@@ -34,7 +34,6 @@
#define GU_LOG_ENABLE
#include <gu/log.h>
-typedef struct PgfIdContext PgfIdContext;
typedef GuMap PgfContsMap;
@@ -443,6 +442,7 @@ pgf_read_to_PgfCCatId(GuType* type, PgfReader* rdr, void* to)
ccat->lindefs = gu_map_get(rdr->curr_lindefs, &fid, PgfFunIds*);
ccat->n_synprods = 0;
ccat->prods = gu_null_seq;
+ ccat->viterbi_prob = 0;
ccat->fid = fid;
gu_map_put(rdr->curr_concr->ccats, &fid, PgfCCat*, ccat);
@@ -465,6 +465,7 @@ pgf_read_to_PgfCCat(GuType* type, PgfReader* rdr, void* to)
ccat->cnccat = NULL;
ccat->lindefs = gu_map_get(rdr->curr_lindefs, fidp, PgfFunIds*);
ccat->prods = gu_new_seq(PgfProduction, n_prods, rdr->opool);
+ ccat->viterbi_prob = 0;
ccat->fid = *fidp;
size_t top = 0;
@@ -600,7 +601,7 @@ pgf_read_new_PgfFunDecl(GuType* type, PgfReader* rdr, GuPool* pool, size_t* size
}
absfun->ep.prob = - log(gu_in_f64be(rdr->in, rdr->err));
-
+
PgfExprFun* expr_fun =
gu_new_variant(PGF_EXPR_FUN,
PgfExprFun,
@@ -638,11 +639,33 @@ pgf_read_to_PgfFunId(GuType* type, PgfReader* rdr, void* to)
*(PgfFunId*) to = gu_list_elems(rdr->curr_concr->cncfuns)[id];
}
+typedef struct {
+ GuMapItor fn;
+ PgfReader* rdr;
+} PgfIndexFn;
+
+static void
+pgf_compute_meta_probs(GuMapItor* fn, const void* key, void* value, GuExn* err)
+{
+ (void) (key && err);
+
+ PgfCat* cat = *((PgfCat**) value);
+
+ double mass = 0;
+ for (size_t i = 0; i < cat->n_functions; i++) {
+ mass += cat->functions[i].prob;
+ }
+ cat->meta_prob = - log(fabs(1 - mass));
+}
+
static void
pgf_read_to_PgfAbstr(GuType* type, PgfReader* rdr, void* to)
{
rdr->curr_abstr = to;
pgf_read_to_struct(type, rdr, to);
+
+ PgfIndexFn clo = { { pgf_compute_meta_probs }, rdr };
+ gu_map_iter(rdr->curr_abstr->cats, &clo.fn, NULL);
}
static GU_DEFINE_TYPE(PgfLinDefs, GuIntMap, gu_ptr_type(PgfFunIds),
@@ -691,11 +714,6 @@ pgf_ccat_set_cnccat(PgfCCat* ccat)
return ccat->cnccat;
}
-typedef struct {
- GuMapItor fn;
- PgfReader* rdr;
-} PgfIndexFn;
-
static void
pgf_read_ccat_cb(GuMapItor* fn, const void* key, void* value, GuExn* err)
{
@@ -771,12 +789,6 @@ pgf_read_new_PgfConcr(GuType* type, PgfReader* rdr, GuPool* pool,
concr->total_cats = pgf_read_int(rdr);
concr->max_fid = concr->total_cats;
- PgfIndexFn clo1 = { { pgf_read_ccat_cb }, rdr };
- gu_map_iter(concr->ccats, &clo1.fn, NULL);
-
- PgfIndexFn clo2 = { { pgf_index_prods }, rdr };
- gu_map_iter(concr->ccats, &clo2.fn, NULL);
-
// set the function ids
int n_funs = gu_list_length(concr->cncfuns);
for (int funid = 0; funid < n_funs; funid++) {
@@ -788,6 +800,12 @@ pgf_read_new_PgfConcr(GuType* type, PgfReader* rdr, GuPool* pool,
cncfun->ep = (absfun == NULL) ? NULL : &absfun->ep;
}
+ PgfIndexFn clo1 = { { pgf_read_ccat_cb }, rdr };
+ gu_map_iter(concr->ccats, &clo1.fn, NULL);
+
+ PgfIndexFn clo2 = { { pgf_index_prods }, rdr };
+ gu_map_iter(concr->ccats, &clo2.fn, NULL);
+
return concr;
}
@@ -821,6 +839,7 @@ pgf_read_new_PgfCncCat(GuType* type, PgfReader* rdr, GuPool* pool,
ccat->lindefs = gu_map_get(rdr->curr_lindefs, &fid, PgfFunIds*);
ccat->n_synprods = 0;
ccat->prods = gu_null_seq;
+ ccat->viterbi_prob = 0;
ccat->fid = fid;
gu_map_put(rdr->curr_concr->ccats, &fid, PgfCCat*, ccat);
diff --git a/src/runtime/c/utils/pgf-translate.c b/src/runtime/c/utils/pgf-translate.c
index 878e07992..e9f482b84 100644
--- a/src/runtime/c/utils/pgf-translate.c
+++ b/src/runtime/c/utils/pgf-translate.c
@@ -18,6 +18,33 @@
#include <locale.h>
#include <time.h>
+static void
+print_result(PgfExprProb* ep, PgfConcr* to_concr,
+ GuWriter* wtr, GuExn* err, GuPool* ppool)
+{
+ // Write out the abstract syntax tree
+ gu_printf(wtr, err, " [%f] ", ep->prob);
+ pgf_print_expr(ep->expr, 0, wtr, err);
+ gu_putc('\n', wtr, err);
+
+ // Enumerate the concrete syntax trees corresponding
+ // to the abstract tree.
+ GuEnum* cts = pgf_lzr_concretize(to_concr, ep->expr, ppool);
+ while (true) {
+ PgfCncTree ctree =
+ gu_next(cts, PgfCncTree, ppool);
+ if (gu_variant_is_null(ctree)) {
+ break;
+ }
+ gu_putc(' ', wtr, err);
+ // Linearize the concrete tree as a simple
+ // sequence of strings.
+ pgf_lzr_linearize_simple(to_concr , ctree, 0, wtr, err);
+ gu_putc('\n', wtr, err);
+ gu_writer_flush(wtr, err);
+ }
+}
+
int main(int argc, char* argv[]) {
// Set the character locale, so we can produce proper output.
setlocale(LC_CTYPE, "");
@@ -32,15 +59,7 @@ int main(int argc, char* argv[]) {
}
char* filename = argv[1];
- GuString cat;
- bool robust_mode;
- if (argv[2][0] == '.') {
- cat = gu_str_string(argv[2]+1, pool);
- robust_mode = true;
- } else {
- cat = gu_str_string(argv[2], pool);
- robust_mode = false;
- }
+ GuString cat = gu_str_string(argv[2], pool);
GuString from_lang = gu_str_string(argv[3], pool);
GuString to_lang = gu_str_string(argv[4], pool);
@@ -83,10 +102,6 @@ int main(int argc, char* argv[]) {
pgf_parser_add_literal(from_concr, gu_str_string("Symb", pool),
&pgf_nerc_literal_callback);
- // Arbitrarily choose linearization index 0. Usually the initial
- // categories we are interested in only have one field.
- int lin_idx = 0;
-
// Create an output stream for stdout
GuOut* out = gu_file_out(stdout, pool);
@@ -95,6 +110,11 @@ int main(int argc, char* argv[]) {
// Use a writer with hard-coded utf-8 encoding for now.
GuWriter* wtr = gu_new_utf8_writer(out, pool);
+ // We will keep the latest results in the 'ppool' and
+ // we will iterate over them by using 'result'.
+ GuPool* ppool = NULL;
+ GuEnum* result = NULL;
+
// The interactive translation loop.
// XXX: This currently reads stdin directly, so it doesn't support
// encodings properly. TODO: use a locale reader for input
@@ -109,20 +129,49 @@ int main(int argc, char* argv[]) {
status = EXIT_FAILURE;
}
break;
- } else if (line[0] == '\0') {
+ } else if (strcmp(line, "") == 0) {
// End nicely on empty input
break;
+ } else if (strcmp(line, "\n") == 0) {
+ // Empty line -> show the next tree for the last sentence
+
+ if (result != NULL) {
+ clock_t start = clock();
+
+ PgfExprProb* ep = gu_next(result, PgfExprProb*, ppool);
+
+ clock_t end = clock();
+ double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
+ printf("%.2f sec\n", cpu_time_used);
+
+ // The enumerator will return a null variant at the
+ // end of the results.
+ if (ep == NULL) {
+ goto fail_parse;
+ }
+
+ print_result(ep, to_concr, wtr, err, ppool);
+ }
+ continue;
+ }
+
+ // We release the last results
+ if (ppool != NULL) {
+ gu_pool_free(ppool);
+ ppool = NULL;
+ result = NULL;
}
+
// We create a temporary pool for translating a single
// sentence, so our memory usage doesn't increase over time.
- GuPool* ppool = gu_new_pool();
+ ppool = gu_new_pool();
clock_t start = clock();
// Begin parsing a sentence of the specified category
- PgfParse* parse =
- pgf_parser_parse(from_concr, cat, lin_idx, pool);
- if (parse == NULL) {
+ PgfParseState* state =
+ pgf_parser_init_state(from_concr, cat, 0, pool);
+ if (state == NULL) {
fprintf(stderr, "Couldn't begin parsing\n");
status = EXIT_FAILURE;
break;
@@ -133,13 +182,13 @@ int main(int argc, char* argv[]) {
PgfLexer *lexer =
pgf_new_lexer(rdr, pool);
- // naive tokenization
+ // Tokenization
GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool);
PgfToken tok = pgf_lexer_next_token(lexer, lex_err, pool);
while (!gu_exn_is_raised(lex_err)) {
// feed the token to get a new parse state
- parse = pgf_parse_token(parse, tok, robust_mode, ppool);
- if (!parse) {
+ state = pgf_parser_next_state(state, tok, ppool);
+ if (!state) {
gu_puts("Unexpected token: \"", wtr, err);
gu_string_write(tok, wtr, err);
gu_puts("\"\n", wtr, err);
@@ -149,64 +198,29 @@ int main(int argc, char* argv[]) {
tok = pgf_lexer_next_token(lexer, lex_err, pool);
}
- if (robust_mode) {
- PgfExpr expr = pgf_parse_best_result(parse, ppool);
-
- clock_t end = clock();
-
- double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
- printf("%.2f sec\n", cpu_time_used);
-
- if (!gu_variant_is_null(expr)) {
- gu_putc(' ', wtr, err);
- // Write out the abstract syntax tree
- pgf_print_expr(expr, 0, wtr, err);
- gu_putc('\n', wtr, err);
- }
- } else {
- // Now begin enumerating the resulting syntax trees
- GuEnum* result = pgf_parse_result(parse, ppool);
-
- clock_t end = clock();
+ // Now begin enumerating the resulting syntax trees
+ result = pgf_parse_result(state, ppool);
- double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
- printf("%.2f sec\n", cpu_time_used);
+ PgfExprProb* ep = gu_next(result, PgfExprProb*, ppool);
- while (true) {
- PgfExpr expr = gu_next(result, PgfExpr, ppool);
+ clock_t end = clock();
+ double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
+ printf("%.2f sec\n", cpu_time_used);
- // The enumerator will return a null variant at the
- // end of the results.
- if (gu_variant_is_null(expr)) {
- break;
- }
- gu_putc(' ', wtr, err);
- // Write out the abstract syntax tree
- pgf_print_expr(expr, 0, wtr, err);
- gu_putc('\n', wtr, err);
-
- // Enumerate the concrete syntax trees corresponding
- // to the abstract tree.
- GuEnum* cts = pgf_lzr_concretize(to_concr, expr, ppool);
- while (true) {
- PgfCncTree ctree =
- gu_next(cts, PgfCncTree, ppool);
- if (gu_variant_is_null(ctree)) {
- break;
- }
- gu_puts(" ", wtr, err);
- // Linearize the concrete tree as a simple
- // sequence of strings.
- pgf_lzr_linearize_simple(to_concr , ctree, lin_idx,
- wtr, err);
- gu_putc('\n', wtr, err);
- gu_writer_flush(wtr, err);
- }
- }
+ // The enumerator will return a null variant at the
+ // end of the results.
+ if (ep == NULL) {
+ goto fail_parse;
}
+
+ print_result(ep, to_concr, wtr, err, ppool);
+
+ continue;
fail_parse:
// Free all resources allocated during parsing and linearization
gu_pool_free(ppool);
+ ppool = NULL;
+ result = NULL;
}
fail_concr:
fail_read: