summaryrefslogtreecommitdiff
path: root/src/runtime/c/pgf/parser.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/runtime/c/pgf/parser.c')
-rw-r--r--src/runtime/c/pgf/parser.c1901
1 files changed, 914 insertions, 987 deletions
diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c
index 0ab757a62..476dda827 100644
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -4,6 +4,7 @@
#include <gu/assert.h>
#include <gu/file.h>
+#include <gu/utf8.h>
#include <math.h>
#include <stdlib.h>
@@ -14,6 +15,8 @@
typedef GuBuf PgfItemBuf;
static GU_DEFINE_TYPE(PgfItemBuf, abstract, _);
+typedef struct PgfParseState PgfParseState;
+
struct PgfItemConts {
PgfCCat* ccat;
size_t lin_idx;
@@ -44,10 +47,14 @@ typedef struct {
PgfConcr* concr;
GuPool* pool; // this pool is used for structures internal to the parser
GuPool* out_pool; // this pool is used for the allocating the final abstract trees
- GuBuf* expr_queue;
+ GuString sentence; // the sentence to be parsed
+ GuBuf* expr_queue; // during the extraction of abstract trees we push them in this queue
PgfExpr meta_var;
PgfProduction meta_prod;
int max_fid;
+ PgfParseState *before;
+ PgfParseState *after;
+ PgfExprEnum en; // enumeration for the generated trees
#ifdef PGF_COUNTS_DEBUG
int item_full_count;
int item_real_count;
@@ -59,33 +66,15 @@ typedef struct {
prob_t beam_size;
} PgfParsing;
-typedef struct {
- PgfCCat* ccat;
- size_t lin_idx;
-} PgfCFCat;
-
-static GU_DEFINE_TYPE(PgfCFCat, struct,
- GU_MEMBER(PgfCFCat, ccat, PgfCCat),
- GU_MEMBER(PgfCFCat, lin_idx, size_t));
-
-extern GuHasher pgf_cfcat_hasher;
-
-GU_DEFINE_TYPE(PgfProductionIdx, GuMap,
- gu_type(PgfCFCat), &pgf_cfcat_hasher,
- gu_ptr_type(PgfProductionBuf), &gu_null_struct);
-
-typedef struct PgfTokenState PgfTokenState;
+typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE;
typedef struct {
- bool (*match_token)(PgfTokenState* ts, PgfToken tok, PgfItem* item);
- PgfToken (*get_token)(PgfTokenState* ts);
- PgfProductionIdx* (*get_lexicon_idx)(PgfTokenState* ts);
-} PgfTokenFn;
-
-struct PgfTokenState {
- PgfTokenFn* fn;
- prob_t lexical_prob;
-};
+ PgfProductionIdx* idx;
+ BIND_TYPE bind_type;
+ size_t offset;
+} PgfLexiconIdxEntry;
+
+typedef GuBuf PgfLexiconIdx;
struct PgfParseState {
PgfParseState* next;
@@ -94,12 +83,14 @@ struct PgfParseState {
PgfItem* meta_item;
PgfContsMap* conts_map;
PgfGenCatMap* generated_cats;
- unsigned short offset;
+
+ bool needs_bind;
+ size_t start_offset;
+ size_t end_offset;
prob_t viterbi_prob;
- PgfParsing* ps;
- PgfTokenState* ts;
+ PgfLexiconIdx* lexicon_idx;
};
typedef struct PgfAnswers {
@@ -115,13 +106,6 @@ typedef struct {
size_t arg_idx;
} PgfExprState;
-typedef struct PgfParseResult PgfParseResult;
-
-struct PgfParseResult {
- PgfParseState* state;
- PgfExprEnum en;
-};
-
typedef struct PgfItemBase PgfItemBase;
struct PgfItem {
@@ -133,15 +117,12 @@ struct PgfItem {
PgfProduction prod;
PgfPArgs* args;
PgfSymbol curr_sym;
- uint16_t seq_idx;
+ uint16_t sym_idx;
uint8_t alt_idx; // position in the pre alternative
uint8_t alt; // the number of the alternative
prob_t inside_prob;
};
-GU_DEFINE_TYPE(PgfLeftcornerTokIdx, GuStringMap,
- gu_ptr_type(PgfProductionIdx), &gu_null_struct);
-
static PgfSymbol
pgf_prev_extern_sym(PgfSymbol sym)
{
@@ -151,10 +132,11 @@ pgf_prev_extern_sym(PgfSymbol sym)
return *((PgfSymbol*) (((PgfSymbolCat*) i.data)+1));
case PGF_SYMBOL_KP:
return *((PgfSymbol*) (((PgfSymbolKP*) i.data)+1));
- case PGF_SYMBOL_KS:
+ case PGF_SYMBOL_KS: {
PgfSymbolKS* sks = (PgfSymbolKS*) i.data;
size_t tok_len = strlen(sks->token);
return *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1));
+ }
case PGF_SYMBOL_LIT:
return *((PgfSymbol*) (((PgfSymbolLit*) i.data)+1));
case PGF_SYMBOL_VAR:
@@ -169,30 +151,53 @@ pgf_prev_extern_sym(PgfSymbol sym)
}
}
-size_t
-pgf_item_lin_idx(PgfItem* item) {
- return item->conts->lin_idx;
+void
+pgf_add_extern_tok(PgfSymbol* psym, PgfToken tok, GuPool* pool) {
+ PgfSymbol new_sym;
+ size_t tok_len = strlen(tok);
+ PgfSymbolKS* sks = (PgfSymbolKS*)
+ gu_alloc_variant(PGF_SYMBOL_KS,
+ sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1,
+ gu_alignof(PgfSymbolKS),
+ &new_sym, pool);
+ strcpy(sks->token, tok);
+ *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = *psym;
+ *psym = new_sym;
}
-int
-pgf_item_sequence_length(PgfItem* item)
+void
+pgf_add_extern_cat(PgfSymbol* psym, int d, int r, GuPool* pool) {
+ PgfSymbol new_sym;
+ PgfSymbolCat* scat = (PgfSymbolCat*)
+ gu_alloc_variant(PGF_SYMBOL_CAT,
+ sizeof(PgfSymbolCat)+sizeof(PgfSymbol),
+ gu_alignof(PgfSymbolCat),
+ &new_sym, pool);
+ *((PgfSymbol*) (scat+1)) = *psym;
+ scat->d = d;
+ scat->r = r;
+ *psym = new_sym;
+}
+
+static size_t
+pgf_item_symbols_length(PgfItem* item)
{
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
- return gu_seq_length(papp->fun->lins[item->conts->lin_idx]);
+ return gu_seq_length(papp->fun->lins[item->conts->lin_idx]->syms);
}
case PGF_PRODUCTION_COERCE: {
return 1;
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = i.data;
- PgfSequence* seq;
-
+ PgfSymbols* syms;
+
if (pext->lins != NULL &&
- (seq = gu_seq_get(pext->lins,PgfSequence*,item->conts->lin_idx)) != NULL) {
- return gu_seq_length(seq);
+ (syms = gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx)) != NULL) {
+ return gu_seq_length(syms);
} else {
int seq_len = 0;
PgfSymbol sym = item->curr_sym;
@@ -220,33 +225,33 @@ pgf_item_sequence_length(PgfItem* item)
}
}
-static PgfSequence*
-pgf_extern_seq_get(PgfItem* item, GuPool* pool)
+static PgfSymbols*
+pgf_extern_syms_get(PgfItem* item, GuPool* pool)
{
- int seq_len = pgf_item_sequence_length(item);
+ int syms_len = pgf_item_symbols_length(item);
- PgfSequence* seq =
- gu_new_seq(PgfSymbol, seq_len, pool);
+ PgfSymbols* syms =
+ gu_new_seq(PgfSymbol, syms_len, pool);
PgfSymbol sym = item->curr_sym;
while (!gu_variant_is_null(sym)) {
- gu_seq_set(seq, PgfSymbol, --seq_len, sym);
+ gu_seq_set(syms, PgfSymbol, --syms_len, sym);
sym = pgf_prev_extern_sym(sym);
}
- return seq;
+ return syms;
}
-void
-pgf_item_sequence(PgfItem* item,
- size_t* lin_idx, PgfSequence** seq,
- GuPool* pool) {
+static void
+pgf_item_symbols(PgfItem* item,
+ size_t* lin_idx, PgfSymbols** syms,
+ GuPool* pool) {
*lin_idx = item->conts->lin_idx;
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
- *seq = papp->fun->lins[item->conts->lin_idx];
+ *syms = papp->fun->lins[item->conts->lin_idx]->syms;
break;
}
case PGF_PRODUCTION_COERCE: {
@@ -254,21 +259,21 @@ pgf_item_sequence(PgfItem* item,
gu_new_variant_i(pool, PGF_SYMBOL_CAT,
PgfSymbolCat,
.d = 0, .r = item->conts->lin_idx);
- *seq = gu_new_seq(PgfSequence*, 1, pool);
- gu_seq_set(*seq, PgfSymbol, 0, sym);
+ *syms = gu_new_seq(PgfSymbol, 1, pool);
+ gu_seq_set(*syms, PgfSymbol, 0, sym);
break;
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = i.data;
if (pext->lins == NULL ||
- (*seq = gu_seq_get(pext->lins, PgfSequence*, item->conts->lin_idx)) == NULL) {
- *seq = pgf_extern_seq_get(item, pool);
+ (*syms = gu_seq_get(pext->lins, PgfSymbols*, item->conts->lin_idx)) == NULL) {
+ *syms = pgf_extern_syms_get(item, pool);
}
break;
}
case PGF_PRODUCTION_META: {
- *seq = pgf_extern_seq_get(item, pool);
+ *syms = pgf_extern_syms_get(item, pool);
break;
}
default:
@@ -285,7 +290,7 @@ pgf_print_production_args(PgfPArgs* args,
for (size_t j = 0; j < n_args; j++) {
if (j > 0)
gu_putc(',',out,err);
-
+
PgfPArg arg = gu_seq_get(args, PgfPArg, j);
if (arg.hypos != NULL &&
@@ -351,31 +356,38 @@ pgf_print_item_seq(PgfItem *item,
GuOut *out, GuExn* err, GuPool* pool)
{
size_t lin_idx;
- PgfSequence seq;
- pgf_item_sequence(item, &lin_idx, &seq, pool);
+ PgfSymbols* syms = NULL;
+ pgf_item_symbols(item, &lin_idx, &syms, pool);
gu_printf(out, err, "%d : ",lin_idx);
size_t index;
- for (index = 0; index < gu_seq_length(seq); index++) {
- if (item->seq_idx == index)
+ for (index = 0; index < gu_seq_length(syms); index++) {
+ if (item->sym_idx == index)
gu_printf(out, err, " . ");
- PgfSymbol *sym = gu_seq_index(seq, PgfSymbol, index);
- pgf_print_symbol(*sym, out, err);
+ PgfSymbol sym = gu_seq_get(syms, PgfSymbol, index);
+ pgf_print_symbol(sym, out, err);
}
- if (item->seq_idx == index)
+ if (item->sym_idx == index)
gu_printf(out, err, " .");
}
static void
+pgf_print_range(PgfParseState* start, PgfParseState* end, GuOut* out, GuExn* err)
+{
+ gu_printf(out, err, "%d-%d",
+ (start != NULL) ? start->end_offset : 0,
+ (start == end) ? end->end_offset : end->start_offset);
+}
+
+static void
pgf_print_item(PgfItem* item, PgfParseState* state, GuOut* out, GuExn* err, GuPool* pool)
{
- gu_printf(out, err, "[%d-%d; C%d -> ",
- item->conts->state ? item->conts->state->offset : 0,
- state ? state->offset : 0,
- item->conts->ccat->fid);
+ gu_printf(out, err, "[");
+ pgf_print_range(item->conts->state, state, out, err);
+ gu_printf(out, err, "; C%d -> ", item->conts->ccat->fid);
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
@@ -483,6 +495,45 @@ pgf_print_expr_state0(PgfExprState* st,
#endif
static int
+cmp_string(GuString* psent, size_t* plen, GuString tok)
+{
+ GuString sent = *psent;
+ size_t len = *plen;
+
+ while (*tok != 0) {
+ if (len == 0)
+ return -1;
+
+ if (((uint8_t) *sent) > ((uint8_t) *tok))
+ return 1;
+ else if (((uint8_t) *sent) < ((uint8_t) *tok))
+ return -2;
+
+ tok++;
+ sent++;
+ len--;
+ }
+
+ *psent = sent;
+ *plen = len;
+ return 0;
+}
+
+static bool
+skip_space(GuString* psent, size_t* plen)
+{
+ if (*plen == 0)
+ return false;
+
+ char c = **psent;
+ if (!gu_is_space(c))
+ return false;
+
+ (*psent)++;
+ return true;
+}
+
+static int
cmp_item_prob(GuOrder* self, const void* a, const void* b)
{
PgfItem *item1 = *((PgfItem **) a);
@@ -500,32 +551,31 @@ cmp_item_prob(GuOrder* self, const void* a, const void* b)
}
static GuOrder
-pgf_item_prob_order = { cmp_item_prob };
+pgf_item_prob_order[1] = { { cmp_item_prob } };
static PgfItemContss*
-pgf_parsing_get_contss(PgfContsMap* conts_map, PgfCCat* cat, GuPool *pool)
+pgf_parsing_get_contss(PgfParseState* state, PgfCCat* cat, GuPool *pool)
{
- PgfItemContss* contss = gu_map_get(conts_map, cat, PgfItemContss*);
+ PgfItemContss* contss = gu_map_get(state->conts_map, cat, PgfItemContss*);
if (contss == NULL) {
size_t n_lins = cat->cnccat->n_lins;
contss = gu_new_seq(PgfItemConts*, n_lins, pool);
for (size_t i = 0; i < n_lins; i++) {
gu_seq_set(contss, PgfItemConts*, i, NULL);
}
- gu_map_put(conts_map, cat, PgfItemContss*, contss);
+ gu_map_put(state->conts_map, cat, PgfItemContss*, contss);
}
return contss;
}
static PgfItemConts*
-pgf_parsing_get_conts(PgfContsMap* conts_map,
- PgfCCat* ccat, size_t lin_idx,
- PgfParseState* state,
+pgf_parsing_get_conts(PgfParseState* state,
+ PgfCCat* ccat, size_t lin_idx,
GuPool *pool)
{
gu_require(lin_idx < ccat->cnccat->n_lins);
PgfItemContss* contss =
- pgf_parsing_get_contss(conts_map, ccat, pool);
+ pgf_parsing_get_contss(state, ccat, pool);
PgfItemConts* conts = gu_seq_get(contss, PgfItemConts*, lin_idx);
if (!conts) {
conts = gu_new(PgfItemConts, pool);
@@ -555,13 +605,14 @@ gu_ccat_fini(GuFinalizer* fin)
}
static PgfCCat*
-pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts,
+pgf_parsing_create_completed(PgfParsing* ps, PgfParseState* state,
+ PgfItemConts* conts,
prob_t viterbi_prob)
{
- PgfCCat* cat = gu_new_flex(state->ps->pool, PgfCCat, fin, 1);
+ PgfCCat* cat = gu_new_flex(ps->pool, PgfCCat, fin, 1);
cat->cnccat = conts->ccat->cnccat;
cat->viterbi_prob = viterbi_prob;
- cat->fid = state->ps->max_fid++;
+ cat->fid = ps->max_fid++;
cat->conts = conts;
cat->answers = NULL;
cat->prods = NULL;
@@ -569,7 +620,7 @@ pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts,
gu_map_put(state->generated_cats, conts, PgfCCat*, cat);
cat->fin[0].fn = gu_ccat_fini;
- gu_pool_finally(state->ps->pool, cat->fin);
+ gu_pool_finally(ps->pool, cat->fin);
#ifdef PGF_COUNTS_DEBUG
state->ps->ccat_full_count++;
@@ -578,15 +629,6 @@ pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts,
return cat;
}
-static void
-pgf_parsing_add_production(PgfCCat* ccat, PgfProduction prod)
-{
- if (ccat->prods == NULL || ccat->n_synprods >= gu_seq_length(ccat->prods)) {
- ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1);
- }
- gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods++, prod);
-}
-
static PgfCCat*
pgf_parsing_get_completed(PgfParseState* state, PgfItemConts* conts)
{
@@ -602,18 +644,18 @@ pgf_item_set_curr_symbol(PgfItem* item, GuPool* pool)
PgfProductionApply* papp = i.data;
PgfCncFun* fun = papp->fun;
gu_assert(item->conts->lin_idx < fun->n_lins);
- PgfSequence* seq = fun->lins[item->conts->lin_idx];
- gu_assert(item->seq_idx <= gu_seq_length(seq));
- if (item->seq_idx == gu_seq_length(seq)) {
+ PgfSymbols* syms = fun->lins[item->conts->lin_idx]->syms;
+ gu_assert(item->sym_idx <= gu_seq_length(syms));
+ if (item->sym_idx == gu_seq_length(syms)) {
item->curr_sym = gu_null_variant;
} else {
- item->curr_sym = gu_seq_get(seq, PgfSymbol, item->seq_idx);
+ item->curr_sym = gu_seq_get(syms, PgfSymbol, item->sym_idx);
}
break;
}
case PGF_PRODUCTION_COERCE: {
- gu_assert(item->seq_idx <= 1);
- if (item->seq_idx == 1) {
+ gu_assert(item->sym_idx <= 1);
+ if (item->sym_idx == 1) {
item->curr_sym = gu_null_variant;
} else {
item->curr_sym = gu_new_variant_i(pool, PGF_SYMBOL_CAT,
@@ -634,12 +676,11 @@ pgf_item_set_curr_symbol(PgfItem* item, GuPool* pool)
}
static PgfItem*
-pgf_new_item(PgfItemConts* conts, PgfProduction prod,
- GuPool* pool, PgfParsing* ps)
+pgf_new_item(PgfParsing* ps, PgfItemConts* conts, PgfProduction prod)
{
PgfItem* item;
if (ps == NULL || ps->free_item == NULL)
- item = gu_new(PgfItem, pool);
+ item = gu_new(PgfItem, ps->pool);
else {
item = ps->free_item;
ps->free_item = ps->free_item->next;
@@ -661,7 +702,7 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod,
}
case PGF_PRODUCTION_COERCE: {
PgfProductionCoerce* pcoerce = pi.data;
- item->args = gu_new_seq(PgfPArg, 1, pool);
+ item->args = gu_new_seq(PgfPArg, 1, ps->pool);
PgfPArg* parg = gu_seq_index(item->args, PgfPArg, 0);
parg->hypos = NULL;
parg->ccat = pcoerce->coerce;
@@ -671,13 +712,7 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod,
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = pi.data;
item->args = gu_empty_seq();
- item->inside_prob = pext->ep ? pext->ep->prob : 0;
-
- int n_args = gu_seq_length(item->args);
- for (int i = 0; i < n_args; i++) {
- PgfPArg *arg = gu_seq_index(item->args, PgfPArg, i);
- item->inside_prob += arg->ccat->viterbi_prob;
- }
+ item->inside_prob = pext->ep->prob;
break;
}
case PGF_PRODUCTION_META: {
@@ -698,13 +733,13 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod,
item->conts = conts;
item->prod = prod;
item->curr_sym = gu_null_variant;
- item->seq_idx = 0;
+ item->sym_idx = 0;
item->alt_idx = 0;
item->alt = 0;
conts->ref_count++;
- pgf_item_set_curr_symbol(item, pool);
+ pgf_item_set_curr_symbol(item, ps->pool);
#ifdef PGF_COUNTS_DEBUG
if (ps != NULL) {
@@ -717,11 +752,11 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod,
}
static PgfItem*
-pgf_item_copy(PgfItem* item, GuPool* pool, PgfParsing* ps)
+pgf_item_copy(PgfItem* item, PgfParsing* ps)
{
PgfItem* copy;
if (ps == NULL || ps->free_item == NULL)
- copy = gu_new(PgfItem, pool);
+ copy = gu_new(PgfItem, ps->pool);
else {
copy = ps->free_item;
ps->free_item = ps->free_item->next;
@@ -742,14 +777,14 @@ pgf_item_copy(PgfItem* item, GuPool* pool, PgfParsing* ps)
static PgfItem*
pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *new_ccat,
- GuPool* pool, PgfParsing *ps)
+ PgfParsing *ps)
{
PgfCCat *old_ccat =
gu_seq_index(item->args, PgfPArg, d)->ccat;
- PgfItem* new_item = pgf_item_copy(item, pool, ps);
+ PgfItem* new_item = pgf_item_copy(item, ps);
size_t nargs = gu_seq_length(item->args);
- new_item->args = gu_new_seq(PgfPArg, nargs, pool);
+ new_item->args = gu_new_seq(PgfPArg, nargs, ps->pool);
memcpy(gu_seq_data(new_item->args), gu_seq_data(item->args),
nargs * sizeof(PgfPArg));
gu_seq_set(new_item->args, PgfPArg, d,
@@ -763,8 +798,9 @@ pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *new_ccat,
static void
pgf_item_advance(PgfItem* item, GuPool* pool)
{
+
if (GU_LIKELY(item->alt == 0)) {
- item->seq_idx++;
+ item->sym_idx++;
pgf_item_set_curr_symbol(item, pool);
}
else
@@ -772,8 +808,7 @@ pgf_item_advance(PgfItem* item, GuPool* pool)
}
static void
-pgf_item_free(PgfParseState* before, PgfParseState* after,
- PgfItem* item)
+pgf_item_free(PgfParsing* ps, PgfItem* item)
{
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
@@ -797,37 +832,21 @@ pgf_item_free(PgfParseState* before, PgfParseState* after,
if (cont == NULL)
continue;
- pgf_item_free(before, after, cont);
+ pgf_item_free(ps, cont);
}
}
#ifdef PGF_PARSER_DEBUG
memset(item, 0, sizeof(*item));
#endif
- item->next = before->ps->free_item;
- before->ps->free_item = item;
+ item->next = ps->free_item;
+ ps->free_item = item;
#ifdef PGF_COUNTS_DEBUG
- before->ps->item_real_count--;
+ ps->item_real_count--;
#endif
}
static void
-pgf_parsing_add_transition(PgfParseState* before, PgfParseState* after,
- PgfToken tok, PgfItem* item)
-{
- if (after->ts->fn->match_token(after->ts, tok, item)) {
- if (after->next == NULL) {
- after->viterbi_prob =
- item->inside_prob+item->conts->outside_prob;
- }
-
- gu_buf_heap_push(after->agenda, &pgf_item_prob_order, &item);
- } else {
- pgf_item_free(before, after, item);
- }
-}
-
-static void
pgf_result_predict(PgfParsing* ps,
PgfExprState* cont, PgfCCat* ccat);
@@ -836,12 +855,13 @@ pgf_result_production(PgfParsing* ps,
PgfAnswers* answers, PgfProduction prod);
static void
-pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
+pgf_parsing_combine(PgfParsing* ps,
+ PgfParseState* before, PgfParseState* after,
PgfItem* cont, PgfCCat* cat, int lin_idx)
{
if (cont == NULL) {
- if (after == NULL) {
- pgf_result_predict(before->ps, NULL, cat);
+ if (before->end_offset == strlen(ps->sentence)) {
+ pgf_result_predict(ps, NULL, cat);
}
return;
}
@@ -850,29 +870,29 @@ pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
switch (gu_variant_tag(cont->curr_sym)) {
case PGF_SYMBOL_CAT: {
PgfSymbolCat* scat = gu_variant_data(cont->curr_sym);
- item = pgf_item_update_arg(cont, scat->d, cat, before->ps->pool, before->ps);
+ item = pgf_item_update_arg(cont, scat->d, cat, ps);
break;
}
case PGF_SYMBOL_LIT: {
PgfSymbolLit* slit = gu_variant_data(cont->curr_sym);
- item = pgf_item_update_arg(cont, slit->d, cat, before->ps->pool, before->ps);
+ item = pgf_item_update_arg(cont, slit->d, cat, ps);
break;
}
default:
gu_impossible();
}
- pgf_item_advance(item, before->ps->pool);
- gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
+ pgf_item_advance(item, ps->pool);
+ gu_buf_heap_push(before->agenda, pgf_item_prob_order, &item);
}
static void
-pgf_parsing_production(PgfParseState* state,
+pgf_parsing_production(PgfParsing* ps, PgfParseState* state,
PgfItemConts* conts, PgfProduction prod)
{
PgfItem* item =
- pgf_new_item(conts, prod, state->ps->pool, state->ps);
- gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
+ pgf_new_item(ps, conts, prod);
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
}
static PgfProduction
@@ -904,9 +924,9 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
PgfProductionExtern* pext = i.data;
if (pext->lins == NULL ||
- gu_seq_get(pext->lins,PgfSequence*,item->conts->lin_idx) == NULL) {
- PgfSequence* seq =
- pgf_extern_seq_get(item, pool);
+ gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx) == NULL) {
+ PgfSymbols* syms =
+ pgf_extern_syms_get(item, pool);
size_t n_lins = item->conts->ccat->cnccat->n_lins;
@@ -914,22 +934,20 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
gu_new_variant(PGF_PRODUCTION_EXTERN,
PgfProductionExtern,
&prod, pool);
- new_pext->callback = pext->callback;
new_pext->ep = ep;
- new_pext->lins = gu_new_seq(PgfSequence*, n_lins, pool);
+ new_pext->lins = gu_new_seq(PgfSymbols*, n_lins, pool);
if (pext->lins == NULL) {
for (size_t i = 0; i < n_lins; i++) {
- gu_seq_set(new_pext->lins,PgfSequence*,i,
- NULL);
+ gu_seq_set(new_pext->lins,PgfSymbols*,i,NULL);
}
} else {
for (size_t i = 0; i < n_lins; i++) {
- gu_seq_set(new_pext->lins,PgfSequence*,i,
- gu_seq_get(pext->lins,PgfSequence*,i));
+ gu_seq_set(new_pext->lins,PgfSymbols*,i,
+ gu_seq_get(pext->lins,PgfSymbols*,i));
}
}
- gu_seq_set(new_pext->lins,PgfSequence*,item->conts->lin_idx,seq);
+ gu_seq_set(new_pext->lins,PgfSymbols*,item->conts->lin_idx,syms);
} else {
prod = item->prod;
}
@@ -952,42 +970,41 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
}
static void
-pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
- PgfItem* item, PgfExprProb *ep)
-{
- PgfProduction prod =
- pgf_parsing_new_production(item, ep, before->ps->pool);
-#ifdef PGF_COUNTS_DEBUG
- before->ps->prod_full_count++;
-#endif
-
- PgfCCat* tmp_cat = pgf_parsing_get_completed(before, item->conts);
- PgfCCat* cat = tmp_cat;
- if (cat == NULL) {
- cat = pgf_parsing_create_completed(before, item->conts,
- item->inside_prob);
+pgf_parsing_add_production(PgfParsing* ps,
+ PgfParseState* before, PgfParseState* after,
+ PgfItemConts* conts, PgfProduction prod,
+ prob_t inside_prob)
+{
+ PgfCCat* tmp_ccat = pgf_parsing_get_completed(before, conts);
+ PgfCCat* ccat = tmp_ccat;
+ if (ccat == NULL) {
+ ccat = pgf_parsing_create_completed(ps, before, conts, inside_prob);
}
- pgf_parsing_add_production(cat, prod);
+ if (ccat->prods == NULL || ccat->n_synprods >= gu_seq_length(ccat->prods)) {
+ ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1);
+ }
+ gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods++, prod);
#ifdef PGF_PARSER_DEBUG
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
- if (tmp_cat == NULL)
- gu_printf(out, err, "[%d-%d; C%d; %d; C%d]\n",
- item->conts->state ? item->conts->state->offset : 0,
- before->offset,
- item->conts->ccat->fid,
- item->conts->lin_idx,
- cat->fid);
- pgf_print_production(cat->fid, prod, out, err, tmp_pool);
+ if (tmp_ccat == NULL) {
+ gu_printf(out, err, "[");
+ pgf_print_range(conts->state, before, out, err);
+ gu_printf(out, err, "; C%d; %d; C%d]\n",
+ conts->ccat->fid,
+ conts->lin_idx,
+ ccat->fid);
+ }
+ pgf_print_production(ccat->fid, prod, out, err, tmp_pool);
gu_pool_free(tmp_pool);
#endif
- if (tmp_cat != NULL) {
+ if (tmp_ccat != NULL) {
PgfItemContss* contss =
- pgf_parsing_get_contss(before->conts_map, cat, before->ps->pool);
+ pgf_parsing_get_contss(before, ccat, ps->pool);
size_t n_contss = gu_seq_length(contss);
for (size_t i = 0; i < n_contss; i++) {
PgfItemConts* conts2 = gu_seq_get(contss, PgfItemConts*, i);
@@ -997,7 +1014,7 @@ pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
* production immediately to the agenda,
* i.e. process it. */
if (conts2) {
- pgf_parsing_production(before, conts2, prod);
+ pgf_parsing_production(ps, before, conts2, prod);
}
}
@@ -1006,7 +1023,7 @@ pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
PgfParseState* state = after;
while (state != NULL) {
PgfItemContss* contss =
- pgf_parsing_get_contss(state->conts_map, cat, state->ps->pool);
+ pgf_parsing_get_contss(state, ccat, ps->pool);
size_t n_contss = gu_seq_length(contss);
for (size_t i = 0; i < n_contss; i++) {
PgfItemConts* conts2 = gu_seq_get(contss, PgfItemConts*, i);
@@ -1016,33 +1033,249 @@ pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
* production immediately to the agenda,
* i.e. process it. */
if (conts2) {
- pgf_parsing_production(state, conts2, prod);
+ pgf_parsing_production(ps, state, conts2, prod);
}
}
state = state->next;
}
- if (cat->answers != NULL) {
- pgf_result_production(before->ps, cat->answers, prod);
+ if (ccat->answers != NULL) {
+ pgf_result_production(ps, ccat->answers, prod);
}
} else {
- size_t n_conts = gu_buf_length(item->conts->items);
+ size_t n_conts = gu_buf_length(conts->items);
for (size_t i = 0; i < n_conts; i++) {
- PgfItem* cont = gu_buf_get(item->conts->items, PgfItem*, i);
- pgf_parsing_combine(before, after, cont, cat, item->conts->lin_idx);
+ PgfItem* cont = gu_buf_get(conts->items, PgfItem*, i);
+ pgf_parsing_combine(ps, before, after, cont, ccat, conts->lin_idx);
}
}
}
static void
-pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
+pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
+{
+ PgfProduction prod =
+ pgf_parsing_new_production(item, ep, ps->pool);
+#ifdef PGF_COUNTS_DEBUG
+ before->ps->prod_full_count++;
+#endif
+
+ pgf_parsing_add_production(ps, ps->before, ps->after, item->conts, prod, item->inside_prob);
+}
+
+static int
+pgf_symbols_cmp(GuString* psent, size_t sent_len, BIND_TYPE* pbind, PgfSymbols* syms)
+{
+ GuString sent = *psent;
+
+ size_t n_syms = gu_seq_length(syms);
+ for (size_t i = 0; i < n_syms; i++) {
+ PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i);
+
+ GuVariantInfo inf = gu_variant_open(sym);
+ switch (inf.tag) {
+ case PGF_SYMBOL_CAT:
+ case PGF_SYMBOL_LIT:
+ case PGF_SYMBOL_VAR: {
+ if (sent_len == 0)
+ return -1;
+ return 1;
+ }
+ case PGF_SYMBOL_KS: {
+ PgfSymbolKS* pks = inf.data;
+ if (sent_len == 0)
+ return -1;
+
+ if (*pbind == BIND_HARD)
+ *pbind = BIND_NONE;
+ else {
+ if (*pbind != BIND_SOFT && !skip_space(&sent, &sent_len))
+ return 1;
+
+ while (*sent != 0) {
+ if (!skip_space(&sent, &sent_len))
+ break;
+ }
+ }
+
+ int cmp = cmp_string(&sent, &sent_len, pks->token);
+ if (cmp != 0)
+ return cmp;
+ break;
+ }
+ case PGF_SYMBOL_KP: {
+ return -2;
+ }
+ case PGF_SYMBOL_BIND: {
+ *pbind = BIND_HARD;
+ break;
+ }
+ case PGF_SYMBOL_NE: {
+ return -2;
+ }
+ default:
+ gu_impossible();
+ }
+ }
+
+ *psent = sent;
+ return 0;
+}
+
+static void
+pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state)
+{
+ PgfSequence* epsilon_seq =
+ gu_seq_index(ps->concr->sequences, PgfSequence, 0);
+ if (gu_seq_length(epsilon_seq->syms) == 0) {
+ // Since the sequences are sorted, the epsilon sequence will
+ // always be the first if there is any at all. We should
+ // always add the epsilon in the index, because we do
+ // bottom up prediction for epsilons.
+ PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
+ entry->idx = epsilon_seq->idx;
+ entry->bind_type = BIND_NONE;
+ entry->offset = state->start_offset;
+ }
+
+ size_t i = 0;
+ size_t j = gu_seq_length(ps->concr->sequences)-1;
+ size_t s = j;
+ size_t n = 1;
+ size_t sent_len = strlen(ps->sentence);
+
+ while (state->end_offset + n <= sent_len) {
+ while (i <= j) {
+ size_t k = (i+j) / 2;
+ PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
+
+ GuString current = ps->sentence + state->end_offset;
+ BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
+ switch (pgf_symbols_cmp(&current, n, &bind_type, seq->syms)) {
+ case -2:
+ j = k-1;
+ s = j;
+ break;
+ case -1:
+ j = k-1;
+ break;
+ case 0: {
+ if (seq->idx != NULL) {
+ PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
+ entry->idx = seq->idx;
+ entry->bind_type = bind_type;
+ entry->offset = (current - ps->sentence);
+ }
+ i = k+1;
+ goto next;
+ }
+ case 1:
+ i = k+1;
+ break;
+ }
+ }
+
+next:
+ j = s;
+ n++;
+ }
+}
+
+static PgfParseState*
+pgf_new_parse_state(PgfParsing* ps, size_t start_offset, BIND_TYPE bind_type)
+{
+ PgfParseState** pstate;
+ if (ps->before == NULL && start_offset == 0)
+ pstate = &ps->before;
+ else {
+ if (bind_type != BIND_NONE) {
+ if (ps->before->start_offset == start_offset &&
+ ps->before->end_offset == start_offset &&
+ !ps->before->needs_bind)
+ return ps->before;
+ } else {
+ if (ps->before->start_offset == start_offset)
+ return ps->before;
+ }
+
+ pstate = &ps->after;
+ while (*pstate != NULL) {
+ if (bind_type != BIND_NONE) {
+ if (ps->before->start_offset == start_offset &&
+ ps->before->end_offset == start_offset &&
+ !ps->before->needs_bind)
+ return ps->before;
+ } else {
+ if ((*pstate)->start_offset == start_offset)
+ return *pstate;
+ }
+ if ((*pstate)->start_offset > start_offset)
+ break;
+ pstate = &(*pstate)->next;
+ }
+ }
+
+ size_t end_offset = start_offset;
+ GuString current = ps->sentence + start_offset;
+ size_t len = strlen(current);
+ while (skip_space(&current, &len)) {
+ end_offset++;
+ }
+
+ if (bind_type == BIND_HARD && start_offset != end_offset)
+ return NULL;
+
+ PgfParseState* state = gu_new(PgfParseState, ps->pool);
+ state->next = *pstate;
+ state->agenda = gu_new_buf(PgfItem*, ps->pool);
+ state->meta_item = NULL;
+ state->generated_cats = gu_map_type_new(PgfGenCatMap, ps->pool);
+ state->conts_map = gu_map_type_new(PgfContsMap, ps->pool);
+ state->needs_bind = (bind_type == BIND_NONE) &&
+ (start_offset == end_offset);
+ state->start_offset = start_offset;
+ state->end_offset = end_offset;
+ state->viterbi_prob = 0;
+ state->lexicon_idx =
+ gu_new_buf(PgfLexiconIdxEntry, ps->pool);
+
+ if (ps->before == NULL && start_offset == 0)
+ state->needs_bind = false;
+
+ pgf_parsing_lookahead(ps, state);
+
+ *pstate = state;
+
+ return state;
+}
+
+static void
+pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
+{
+ GuString current = ps->sentence + ps->before->end_offset;
+ size_t len = strlen(current);
+
+ if (!ps->before->needs_bind && cmp_string(&current, &len, tok) == 0) {
+ PgfParseState* state =
+ pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE);
+ if (state->next == NULL) {
+ state->viterbi_prob =
+ item->inside_prob+item->conts->outside_prob;
+ }
+
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
+ } else {
+ pgf_item_free(ps, item);
+ }
+}
+
+static void
+pgf_parsing_td_predict(PgfParsing* ps,
PgfItem* item, PgfCCat* ccat, size_t lin_idx)
{
PgfItemConts* conts =
- pgf_parsing_get_conts(before->conts_map,
- ccat, lin_idx, before,
- before->ps->pool);
+ pgf_parsing_get_conts(ps->before, ccat, lin_idx, ps->pool);
gu_buf_push(conts->items, PgfItem*, item);
if (gu_buf_length(conts->items) == 1) {
/* First time we encounter this linearization
@@ -1053,71 +1286,52 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
item->inside_prob-conts->ccat->viterbi_prob+
item->conts->outside_prob;
- size_t n_prods = ccat->n_synprods;
- PgfProductionIdx* lexicon_idx = NULL;
- if (after != NULL) {
- lexicon_idx = after->ts->fn->get_lexicon_idx(after->ts);
-
- // we don't know the current token.
- // probably we just compute the list of completions
- if (lexicon_idx == NULL && ccat->fid < after->ps->concr->total_cats)
- n_prods = gu_seq_length(ccat->prods);
- }
-
// Top-down prediction for syntactic rules
- for (size_t i = 0; i < n_prods; i++) {
+ for (size_t i = 0; i < ccat->n_synprods; i++) {
PgfProduction prod =
gu_seq_get(ccat->prods, PgfProduction, i);
- pgf_parsing_production(before, conts, prod);
+ pgf_parsing_production(ps, ps->before, conts, prod);
}
- // Bottom-up prediction for lexical rules
-
- if (lexicon_idx != NULL) {
- PgfCFCat cfc = {ccat, lin_idx};
- PgfProductionBuf* tok_prods =
- gu_map_get(lexicon_idx, &cfc, PgfProductionBuf*);
-
- if (tok_prods != NULL) {
- size_t n_prods = gu_buf_length(tok_prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_buf_get(tok_prods, PgfProduction, i);
-
- pgf_parsing_production(before, conts, prod);
+ // Bottom-up prediction for lexical and epsilon rules
+ size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
+ for (size_t i = 0; i < n_idcs; i++) {
+ PgfLexiconIdxEntry* lentry =
+ gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
+ PgfParseState* state =
+ pgf_new_parse_state(ps, lentry->offset, lentry->bind_type);
+
+ if (state != NULL) {
+ size_t n_entries = gu_buf_length(lentry->idx);
+ for (size_t j = 0; j < n_entries; j++) {
+ PgfProductionIdxEntry* pentry =
+ gu_buf_index(lentry->idx, PgfProductionIdxEntry, j);
+
+ if (pentry->ccat == ccat && pentry->lin_idx == lin_idx) {
+ GuVariantInfo i = { PGF_PRODUCTION_APPLY, pentry->papp };
+ PgfProduction prod = gu_variant_close(i);
+ pgf_parsing_add_production(ps, state, state->next,
+ conts, prod,
+ pentry->papp->fun->absfun->ep.prob);
+ }
}
}
}
-
- // Bottom-up prediction for epsilon rules
- PgfCFCat cfc = {ccat, lin_idx};
- PgfProductionBuf* eps_prods =
- gu_map_get(before->ps->concr->epsilon_idx, &cfc, PgfProductionBuf*);
-
- if (eps_prods != NULL) {
- size_t n_prods = gu_buf_length(eps_prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_buf_get(eps_prods, PgfProduction, i);
-
- pgf_parsing_production(before, conts, prod);
- }
- }
} else {
/* If it has already been completed, combine. */
PgfCCat* completed =
- pgf_parsing_get_completed(before, conts);
+ pgf_parsing_get_completed(ps->before, conts);
if (completed) {
- pgf_parsing_combine(before, after, item, completed, lin_idx);
+ pgf_parsing_combine(ps, ps->before, ps->after, item, completed, lin_idx);
}
- PgfParseState* state = after;
+ PgfParseState* state = ps->after;
while (state != NULL) {
PgfCCat* completed =
pgf_parsing_get_completed(state, conts);
if (completed) {
- pgf_parsing_combine(state, state->next, item, completed, lin_idx);
+ pgf_parsing_combine(ps, state, state->next, item, completed, lin_idx);
}
state = state->next;
@@ -1126,32 +1340,24 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
}
static void
-pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after,
+pgf_parsing_meta_scan(PgfParsing* ps,
PgfItem* meta_item, prob_t meta_prob)
{
- PgfToken tok = after->ts->fn->get_token(after->ts);
+/* PgfToken tok = after->ts->fn->get_token(after->ts);
if (*tok == 0) {
- PgfItem* item = pgf_item_copy(meta_item, before->ps->pool, before->ps);
+ PgfItem* item = pgf_item_copy(meta_item, before->ps);
item->inside_prob += meta_prob;
- PgfSymbol prev = item->curr_sym;
- size_t tok_len = strlen(tok);
- PgfSymbolKS* sks = (PgfSymbolKS*)
- gu_alloc_variant(PGF_SYMBOL_KS,
- sizeof(PgfSymbolKS)+tok_len+1+sizeof(PgfSymbol),
- gu_alignof(PgfSymbolKS),
- &item->curr_sym, after->ps->pool);
- strcpy((char*) sks->token, (char*) tok);
- *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = prev;
+ pgf_add_extern_tok(item, tok, ps->pool);
- gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
- }
+ gu_buf_heap_push(before->agenda, pgf_item_prob_order, &item);
+ }*/
}
typedef struct {
GuMapItor fn;
- PgfParseState* state;
+ PgfParsing* ps;
PgfItem* meta_item;
} PgfMetaPredictFn;
@@ -1163,11 +1369,11 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err
PgfAbsCat* abscat = (PgfAbsCat*) key;
prob_t meta_prob = *((prob_t*) value);
PgfMetaPredictFn* clo = (PgfMetaPredictFn*) fn;
- PgfParseState* state = clo->state;
+ PgfParsing* ps = clo->ps;
PgfItem* meta_item = clo->meta_item;
PgfCncCat* cnccat =
- gu_map_get(state->ps->concr->cnccats, abscat->name, PgfCncCat*);
+ gu_map_get(ps->concr->cnccats, abscat->name, PgfCncCat*);
if (cnccat == NULL)
return;
@@ -1181,35 +1387,27 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err
for (size_t lin_idx = 0; lin_idx < cnccat->n_lins; lin_idx++) {
PgfItem* item =
- pgf_item_copy(meta_item, state->ps->pool, state->ps);
+ pgf_item_copy(meta_item, ps);
item->inside_prob +=
ccat->viterbi_prob+meta_prob;
size_t nargs = gu_seq_length(meta_item->args);
- item->args = gu_new_seq(PgfPArg, nargs+1, state->ps->pool);
+ item->args = gu_new_seq(PgfPArg, nargs+1, ps->pool);
memcpy(gu_seq_data(item->args), gu_seq_data(meta_item->args),
nargs * sizeof(PgfPArg));
gu_seq_set(item->args, PgfPArg, nargs,
((PgfPArg) { .hypos = NULL, .ccat = ccat }));
- PgfSymbol prev = item->curr_sym;
- PgfSymbolCat* scat = (PgfSymbolCat*)
- gu_alloc_variant(PGF_SYMBOL_CAT,
- sizeof(PgfSymbolCat)+sizeof(PgfSymbol),
- gu_alignof(PgfSymbolCat),
- &item->curr_sym, state->ps->pool);
- *((PgfSymbol*) (scat+1)) = prev;
- scat->d = nargs;
- scat->r = lin_idx;
-
- gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
+ pgf_add_extern_cat(&item->curr_sym, nargs, lin_idx, ps->pool);
+
+ gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
}
}
}
static void
-pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
- PgfItem* item, PgfSymbol sym) {
+pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
+{
switch (gu_variant_tag(sym)) {
case PGF_SYMBOL_CAT: {
PgfSymbolCat* scat = gu_variant_data(sym);
@@ -1217,124 +1415,136 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
if (parg->ccat->prods == NULL) {
// empty category
- pgf_item_free(before, after, item);
+ pgf_item_free(ps, item);
return;
}
- pgf_parsing_td_predict(before, after, item, parg->ccat, scat->r);
+ pgf_parsing_td_predict(ps, item, parg->ccat, scat->r);
break;
}
case PGF_SYMBOL_KS: {
- if (after != NULL) {
- PgfSymbolKS* sks = gu_variant_data(sym);
- pgf_item_advance(item, after->ps->pool);
- pgf_parsing_add_transition(before, after, sks->token, item);
- }
+ PgfSymbolKS* sks = gu_variant_data(sym);
+ pgf_item_advance(item, ps->pool);
+ pgf_parsing_add_transition(ps, sks->token, item);
break;
}
case PGF_SYMBOL_KP: {
- if (after != NULL) {
+ if (ps->after != NULL) {
PgfSymbolKP* skp = gu_variant_data(sym);
PgfSymbol sym;
if (item->alt == 0) {
PgfItem* new_item;
- new_item = pgf_item_copy(item, after->ps->pool, after->ps);
+ new_item = pgf_item_copy(item, ps);
new_item->alt = 1;
new_item->alt_idx = 0;
sym = gu_seq_get(skp->default_form, PgfSymbol, new_item->alt_idx);
- pgf_parsing_symbol(before, after, new_item, sym);
+ pgf_parsing_symbol(ps, new_item, sym);
for (size_t i = 0; i < skp->n_forms; i++) {
- PgfSequence* syms = skp->forms[i].form;
- PgfSequence* syms2 = skp->default_form;
+ PgfSymbols* syms = skp->forms[i].form;
+ PgfSymbols* syms2 = skp->default_form;
bool skip = false; /*pgf_tokens_equal(toks, toks2);
for (size_t j = 0; j < i; j++) {
PgfTokens* toks2 = skp->forms[j].form;
skip |= pgf_tokens_equal(toks, toks2);
}*/
if (!skip) {
- new_item = pgf_item_copy(item, after->ps->pool, after->ps);
+ new_item = pgf_item_copy(item, ps);
new_item->alt = i+2;
new_item->alt_idx = 0;
sym = gu_seq_get(syms, PgfSymbol, new_item->alt_idx);
- pgf_parsing_symbol(before, after, new_item, sym);
+ pgf_parsing_symbol(ps, new_item, sym);
}
}
} else {
- PgfSequence* syms =
+ PgfSymbols* syms =
(item->alt == 1) ? skp->default_form :
skp->forms[item->alt-2].form;
if (item->alt_idx < gu_seq_length(syms)) {
sym = gu_seq_get(syms, PgfSymbol, item->alt_idx);
- pgf_parsing_symbol(before, after, item, sym);
+ pgf_parsing_symbol(ps, item, sym);
} else {
item->alt = 0;
- pgf_item_advance(item, after->ps->pool);
- gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
+ pgf_item_advance(item, ps->pool);
+ gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
}
}
}
break;
}
case PGF_SYMBOL_LIT: {
- if (after != NULL) {
- PgfSymbolLit* slit = gu_variant_data(sym);
- PgfPArg* parg = gu_seq_index(item->args, PgfPArg, slit->d);
+ PgfSymbolLit* slit = gu_variant_data(sym);
+ PgfPArg* parg = gu_seq_index(item->args, PgfPArg, slit->d);
- if (parg->ccat->fid > 0 &&
- parg->ccat->fid >= before->ps->concr->total_cats) {
- pgf_parsing_td_predict(before, after, item, parg->ccat, slit->r);
- }
- else {
- PgfItemConts* conts =
- pgf_parsing_get_conts(before->conts_map,
- parg->ccat, slit->r, before,
- before->ps->pool);
- gu_buf_push(conts->items, PgfItem*, item);
-
- if (gu_buf_length(conts->items) == 1) {
- /* This is the first time when we encounter this
- * literal category so we must call the callback */
-
- PgfLiteralCallback* callback =
- gu_map_get(before->ps->concr->callbacks,
- parg->ccat->cnccat,
- PgfLiteralCallback*);
-
- if (callback != NULL) {
+ if (parg->ccat->fid >= ps->concr->total_cats) {
+ pgf_parsing_td_predict(ps, item, parg->ccat, slit->r);
+ }
+ else {
+ PgfItemConts* conts =
+ pgf_parsing_get_conts(ps->before,
+ parg->ccat, slit->r,
+ ps->pool);
+ gu_buf_push(conts->items, PgfItem*, item);
+
+ if (gu_buf_length(conts->items) == 1) {
+ /* This is the first time when we encounter this
+ * literal category so we must call the callback */
+
+ PgfLiteralCallback* callback =
+ gu_map_get(ps->concr->callbacks,
+ parg->ccat->cnccat,
+ PgfLiteralCallback*);
+
+ if (callback != NULL) {
+ size_t offset = ps->before->end_offset;
+ PgfSymbol curr_sym = gu_null_variant;
+ PgfExprProb *ep =
+ callback->match(ps->concr, &curr_sym,
+ item->conts->lin_idx,
+ ps->sentence, &offset,
+ ps->pool, ps->out_pool);
+
+ if (ep != NULL) {
PgfProduction prod;
PgfProductionExtern* pext =
gu_new_variant(PGF_PRODUCTION_EXTERN,
- PgfProductionExtern,
- &prod, before->ps->pool);
- pext->callback = callback;
- pext->ep = NULL;
+ PgfProductionExtern,
+ &prod, ps->pool);
+ pext->ep = ep;
pext->lins = NULL;
- pgf_parsing_production(before, conts, prod);
+ PgfParseState* state =
+ pgf_new_parse_state(ps, offset, BIND_NONE);
+ PgfItem* item =
+ pgf_new_item(ps, conts, prod);
+ item->curr_sym = curr_sym;
+ item->sym_idx = pgf_item_symbols_length(item);
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
}
} else {
- /* If it has already been completed, combine. */
+ pgf_item_free(ps, item);
+ }
+ } else {
+ /* If it has already been completed, combine. */
+ PgfCCat* completed =
+ pgf_parsing_get_completed(ps->before, conts);
+ if (completed) {
+ pgf_parsing_combine(ps, ps->before, ps->after, item, completed, slit->r);
+ }
+
+ PgfParseState* state = ps->after;
+ while (state != NULL) {
PgfCCat* completed =
- pgf_parsing_get_completed(before, conts);
+ pgf_parsing_get_completed(state, conts);
if (completed) {
- pgf_parsing_combine(before, after, item, completed, slit->r);
- }
-
- PgfParseState* state = after;
- while (state != NULL) {
- PgfCCat* completed =
- pgf_parsing_get_completed(state, conts);
- if (completed) {
- pgf_parsing_combine(state, state->next, item, completed, slit->r);
- }
-
- state = state->next;
+ pgf_parsing_combine(ps, state, state->next, item, completed, slit->r);
}
+
+ state = state->next;
}
}
}
@@ -1344,11 +1554,29 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
// XXX TODO proper support
break;
case PGF_SYMBOL_NE: {
- pgf_item_free(before, after, item);
+ // Nothing can match with a non-existant token
+ pgf_item_free(ps, item);
break;
}
case PGF_SYMBOL_BIND: {
- pgf_item_free(before, after, item);
+ if (ps->before->start_offset == ps->before->end_offset &&
+ ps->before->needs_bind) {
+ PgfParseState* state =
+ pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD);
+ if (state != NULL) {
+ if (state->next == NULL) {
+ state->viterbi_prob =
+ item->inside_prob+item->conts->outside_prob;
+ }
+
+ pgf_item_advance(item, ps->pool);
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
+ } else {
+ pgf_item_free(ps, item);
+ }
+ } else {
+ pgf_item_free(ps, item);
+ }
break;
}
default:
@@ -1357,47 +1585,47 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
}
static void
-pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
+pgf_parsing_item(PgfParsing* ps, PgfItem* item)
{
#ifdef PGF_PARSER_DEBUG
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
- pgf_print_item(item, before, out, err, tmp_pool);
+ pgf_print_item(item, ps->before, out, err, tmp_pool);
gu_pool_free(tmp_pool);
#endif
-
+
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
PgfCncFun* fun = papp->fun;
- PgfSequence* seq = fun->lins[item->conts->lin_idx];
- if (item->seq_idx == gu_seq_length(seq)) {
- pgf_parsing_complete(before, after, item, NULL);
- pgf_item_free(before, after, item);
+ PgfSymbols* syms = fun->lins[item->conts->lin_idx]->syms;
+ if (item->sym_idx == gu_seq_length(syms)) {
+ pgf_parsing_complete(ps, item, NULL);
+ pgf_item_free(ps, item);
} else {
- pgf_parsing_symbol(before, after, item, item->curr_sym);
+ pgf_parsing_symbol(ps, item, item->curr_sym);
}
break;
}
case PGF_PRODUCTION_COERCE: {
PgfProductionCoerce* pcoerce = i.data;
- switch (item->seq_idx) {
+ switch (item->sym_idx) {
case 0:
if (pcoerce->coerce->prods == NULL) {
// empty category
- pgf_item_free(before, after, item);
+ pgf_item_free(ps, item);
return;
}
- pgf_parsing_td_predict(before, after, item,
+ pgf_parsing_td_predict(ps, item,
pcoerce->coerce,
item->conts->lin_idx);
break;
case 1:
- pgf_parsing_complete(before, after, item, NULL);
- pgf_item_free(before, after, item);
+ pgf_parsing_complete(ps, item, NULL);
+ pgf_item_free(ps, item);
break;
default:
gu_impossible();
@@ -1406,85 +1634,56 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = i.data;
-
- PgfSequence* seq;
+
+ PgfSymbols* syms;
if (pext->lins != NULL &&
- (seq = gu_seq_get(pext->lins,PgfSequence*,item->conts->lin_idx)) != NULL) {
- if (item->seq_idx == gu_seq_length(seq)) {
- pgf_parsing_complete(before, after, item, NULL);
- pgf_item_free(before, after, item);
- } else {
+ (syms = gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx)) != NULL) {
+ if (item->sym_idx == gu_seq_length(syms)) {
+ pgf_parsing_complete(ps, item, NULL);
+ pgf_item_free(ps, item);
+ } else {
PgfSymbol sym =
- gu_seq_get(seq, PgfSymbol, item->seq_idx);
- pgf_parsing_symbol(before, after, item, sym);
+ gu_seq_get(syms, PgfSymbol, item->sym_idx);
+ pgf_parsing_symbol(ps, item, sym);
}
} else {
- PgfToken tok = (after != NULL)
- ? after->ts->fn->get_token(after->ts)
- : "";
-
- PgfExprProb *ep = NULL;
- bool accepted =
- pext->callback->match(before->ps->concr, item,
- tok,
- &ep, before->ps->out_pool);
-
- if (ep != NULL)
- pgf_parsing_complete(before, after, item, ep);
-
- if (accepted) {
- if (after != NULL) {
- PgfSymbol prev = item->curr_sym;
- size_t tok_len = strlen(tok);
- PgfSymbolKS* sks = (PgfSymbolKS*)
- gu_alloc_variant(PGF_SYMBOL_KS,
- sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1,
- gu_alignof(PgfSymbolKS),
- &item->curr_sym, after->ps->pool);
- strcpy((char*) sks->token, (char*) tok);
- *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = prev;
-
- item->seq_idx++;
- pgf_parsing_add_transition(before, after, tok, item);
- }
- } else {
- pgf_item_free(before, after, item);
- }
+ pgf_parsing_complete(ps, item, pext->ep);
+ pgf_item_free(ps, item);
}
break;
}
case PGF_PRODUCTION_META: {
- if (item->seq_idx == pgf_item_sequence_length(item)) {
- if (before->meta_item != NULL)
+ if (item->sym_idx == pgf_item_symbols_length(item)) {
+ if (ps->before->meta_item != NULL)
break;
- before->meta_item = item;
+ ps->before->meta_item = item;
- if (after == NULL) {
- PgfExprProb *ep = gu_new(PgfExprProb, before->ps->pool);
- ep->expr = before->ps->meta_var;
+ if (ps->after == NULL) {
+ PgfExprProb *ep = gu_new(PgfExprProb, ps->pool);
+ ep->expr = ps->meta_var;
ep->prob = item->inside_prob;
size_t n_args = gu_seq_length(item->args);
for (size_t i = 0; i < n_args; i++) {
PgfPArg* arg = gu_seq_index(item->args, PgfPArg, i);
ep->prob -= arg->ccat->viterbi_prob;
}
- pgf_parsing_complete(before, after, item, ep);
+ pgf_parsing_complete(ps, item, ep);
} else {
prob_t meta_token_prob =
item->conts->ccat->cnccat->abscat->meta_token_prob;
if (meta_token_prob != INFINITY) {
- pgf_parsing_meta_scan(before, after, item, meta_token_prob);
+ pgf_parsing_meta_scan(ps, item, meta_token_prob);
}
PgfCIdMap* meta_child_probs =
item->conts->ccat->cnccat->abscat->meta_child_probs;
if (meta_child_probs != NULL) {
- PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, before, item };
+ PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, ps, item };
gu_map_iter(meta_child_probs, &clo.fn, NULL);
}
}
} else {
- pgf_parsing_symbol(before, after, item, item->curr_sym);
+ pgf_parsing_symbol(ps, item, item->curr_sym);
}
break;
}
@@ -1493,66 +1692,6 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
}
}
-static bool
-pgf_parsing_proceed(PgfParseState* state)
-{
- prob_t best_prob = INFINITY;
- if (gu_buf_length(state->ps->expr_queue) > 0) {
- best_prob = gu_buf_get(state->ps->expr_queue, PgfExprState*, 0)->ep.prob;
- }
-
- PgfParseState* before = NULL;
-
- prob_t delta_prob = 0;
- PgfParseState* st = state;
- while (st != NULL) {
- if (gu_buf_length(st->agenda) > 0) {
- PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0);
- prob_t item_prob =
- item->inside_prob+item->conts->outside_prob+delta_prob;
- if (item_prob < best_prob) {
- best_prob = item_prob;
- before = st;
- }
- }
-
- prob_t state_delta =
- (st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))*
- state->ps->beam_size;
- //prob_t lexical_prob =
- // st->ts ? st->ts->lexical_prob : 0;
- delta_prob += state_delta; /*fmax(state_delta, lexical_prob)*/; // the calculation of lexical_prob doesn't work properly.
- st = st->next;
- }
-
- if (before == NULL)
- return false;
-
- PgfParseState* after = NULL;
-
- st = state;
- while (st != before) {
- PgfParseState* tmp = st->next;
- st->next = after;
- after = st;
- st = tmp;
- }
-
- PgfItem* item;
- gu_buf_heap_pop(before->agenda, &pgf_item_prob_order, &item);
- pgf_parsing_item(before, after, item);
-
- while (after != NULL) {
- PgfParseState* tmp = after->next;
- after->next = before;
- before = after;
- after = tmp;
- }
- state = before;
-
- return true;
-}
-
static prob_t
pgf_parsing_default_beam_size(PgfConcr* concr)
{
@@ -1567,15 +1706,19 @@ pgf_parsing_default_beam_size(PgfConcr* concr)
}
static PgfParsing*
-pgf_new_parsing(PgfConcr* concr, double heuristics,
+pgf_new_parsing(PgfConcr* concr,
+ GuString sentence, double heuristics,
GuPool* pool, GuPool* out_pool)
{
PgfParsing* ps = gu_new(PgfParsing, pool);
ps->concr = concr;
ps->pool = pool;
ps->out_pool = out_pool;
+ ps->sentence = sentence;
ps->expr_queue = gu_new_buf(PgfExprState*, pool);
ps->max_fid = concr->total_cats;
+ ps->before = NULL;
+ ps->after = NULL;
#ifdef PGF_COUNTS_DEBUG
ps->item_full_count = 0;
ps->item_real_count = 0;
@@ -1602,68 +1745,6 @@ pgf_new_parsing(PgfConcr* concr, double heuristics,
return ps;
}
-static PgfParseState*
-pgf_new_parse_state(PgfParsing* ps,
- PgfParseState* next,
- PgfTokenState* ts,
- GuPool* pool)
-{
- PgfParseState* state = gu_new(PgfParseState, pool);
- state->next = next;
- state->agenda = gu_new_buf(PgfItem*, pool);
- state->meta_item = NULL;
- state->generated_cats = gu_map_type_new(PgfGenCatMap, pool);
- state->conts_map = gu_map_type_new(PgfContsMap, pool);
- state->offset = next ? next->offset+1 : 0;
- state->viterbi_prob = 0;
- state->ps = ps;
- state->ts = ts;
- return state;
-}
-
-typedef struct {
- GuMapItor fn;
- PgfTokenState* ts;
-} PgfLexiconFn;
-
-static void
-pgf_parser_compute_lexicon_prob(GuMapItor* fn, const void* key, void* value, GuExn* err)
-{
- PgfTokenState* ts = ((PgfLexiconFn*) fn)->ts;
- PgfProductionBuf* prods = *((PgfProductionBuf**) value);
-
- if (prods == NULL)
- return;
-
- size_t n_prods = gu_buf_length(prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_buf_get(prods, PgfProduction, i);
-
- GuVariantInfo pi = gu_variant_open(prod);
- switch (pi.tag) {
- case PGF_PRODUCTION_APPLY: {
- PgfProductionApply* papp = pi.data;
- if (ts->lexical_prob > papp->fun->ep->prob) {
- ts->lexical_prob = papp->fun->ep->prob;
- }
- break;
- }
- }
- }
-}
-
-#define pgf_new_token_state(ty, pool) \
- (ty*) pgf_new_token_state_(&pgf_tsfn_##ty, (PgfTokenState*) gu_new(ty, pool))
-
-static PgfTokenState*
-pgf_new_token_state_(PgfTokenFn* fn, PgfTokenState* ts)
-{
- ts->fn = fn;
- ts->lexical_prob = INFINITY;
- return ts;
-}
-
#ifdef PGF_COUNTS_DEBUG
void pgf_parsing_print_counts(PgfParsing* ps)
{
@@ -1676,6 +1757,8 @@ void pgf_parsing_print_counts(PgfParsing* ps)
}
#endif
+/*static bool
+*************
typedef struct {
PgfTokenState ts;
PgfToken tok;
@@ -1744,6 +1827,7 @@ typedef struct {
} PgfPrefixTokenState;
static bool
+^ ^ ^ ^ ^ ^ ^
pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item)
{
PgfPrefixTokenState* ts =
@@ -1754,9 +1838,18 @@ pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item)
PgfSequence* seq;
pgf_item_sequence(item, &lin_idx, &seq, ts->pool);
+ uint16_t seq_idx = item->seq_idx;
+ uint8_t tok_idx = item->tok_idx;
+
+ // go one token back
+ if (tok_idx > 0)
+ tok_idx--;
+ else
+ seq_idx--;
+
ts->tp = gu_new(PgfTokenProb, ts->pool);
ts->tp->tok =
- pgf_get_tokens(seq, item->seq_idx-1, ts->pool);
+ pgf_get_tokens(seq, seq_idx, tok_idx, ts->pool);
ts->tp->cat = item->conts->ccat->cnccat->abscat->name;
ts->tp->prob = item->inside_prob+item->conts->outside_prob;
}
@@ -1794,24 +1887,25 @@ pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool)
}
*((PgfTokenProb**)to) = ts->tp;
-}
+}*/
GuEnum*
-pgf_parser_completions(PgfParseState* prev, GuString prefix)
+pgf_parsing_completions(PgfParsing* ps, GuString prefix)
{
#ifdef PGF_COUNTS_DEBUG
- pgf_parsing_print_counts(prev->ps);
+ pgf_parsing_print_counts(ps);
#endif
- PgfPrefixTokenState* ts =
+/* PgfPrefixTokenState* ts =
pgf_new_token_state(PgfPrefixTokenState, prev->ps->pool);
ts->en.next = pgf_parser_completions_next;
ts->prefix = prefix;
ts->tp = NULL;
ts->state =
- pgf_new_parse_state(prev->ps, prev, &ts->ts, prev->ps->pool);
+ pgf_new_parse_state(prev->ps, prev, &ts->ts);
- return &ts->en;
+ return &ts->en;*/
+ return NULL;
}
static int
@@ -1981,17 +2075,134 @@ pgf_parse_result_is_new(PgfExprState* st)
return true;
}
+// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat
+static PgfParsing*
+pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx,
+ GuString sentence, double heuristics,
+ GuExn* err,
+ GuPool* pool, GuPool* out_pool)
+{
+ PgfCncCat* cnccat =
+ gu_map_get(concr->cnccats, cat, PgfCncCat*);
+ if (!cnccat) {
+ GuExnData* exn = gu_raise(err, PgfExn);
+ exn->data = "Unknown start category";
+ return NULL;
+ }
+
+ gu_assert(lin_idx < cnccat->n_lins);
+
+ if (heuristics < 0) {
+ heuristics = pgf_parsing_default_beam_size(concr);
+ }
+
+ PgfParsing* ps =
+ pgf_new_parsing(concr, sentence, heuristics, pool, out_pool);
+ PgfParseState* state =
+ pgf_new_parse_state(ps, 0, BIND_SOFT);
+
+ size_t n_ccats = gu_seq_length(cnccat->cats);
+ for (size_t i = 0; i < n_ccats; i++) {
+ PgfCCat* ccat = gu_seq_get(cnccat->cats, PgfCCat*, i);
+ if (ccat != NULL) {
+ if (ccat->prods == NULL) {
+ // Empty category
+ continue;
+ }
+
+ PgfItemConts* conts =
+ pgf_parsing_get_conts(state, ccat, lin_idx, ps->pool);
+ gu_buf_push(conts->items, PgfItem*, NULL);
+
+#ifdef PGF_COUNTS_DEBUG
+ ps->cont_full_count++;
+#endif
+
+ size_t n_prods = gu_seq_length(ccat->prods);
+ for (size_t i = 0; i < n_prods; i++) {
+ PgfProduction prod =
+ gu_seq_get(ccat->prods, PgfProduction, i);
+ PgfItem* item =
+ pgf_new_item(ps, conts, prod);
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
+ }
+
+ PgfItem *item =
+ pgf_new_item(ps, conts, ps->meta_prod);
+ item->inside_prob =
+ ccat->cnccat->abscat->meta_prob;
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
+ }
+ }
+
+ return ps;
+}
+
+static bool
+pgf_parsing_proceed(PgfParsing* ps)
+{
+ bool has_progress = false;
+
+ prob_t best_prob = INFINITY;
+ if (gu_buf_length(ps->expr_queue) > 0) {
+ best_prob = gu_buf_get(ps->expr_queue, PgfExprState*, 0)->ep.prob;
+ }
+
+ prob_t delta_prob = 0;
+ PgfParseState* st = ps->before;
+ while (st != NULL) {
+ if (gu_buf_length(st->agenda) > 0) {
+ PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0);
+ prob_t item_prob =
+ item->inside_prob+item->conts->outside_prob+delta_prob;
+ if (item_prob < best_prob) {
+ best_prob = item_prob;
+
+ while (st != ps->before) {
+ PgfParseState* tmp = ps->before->next;
+ ps->before->next = ps->after;
+ ps->after = ps->before;
+ ps->before = tmp;
+ }
+
+ has_progress = true;
+ }
+ }
+
+ prob_t state_delta =
+ (st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))*
+ ps->beam_size;
+ delta_prob += state_delta;
+ st = st->next;
+ }
+
+ if (has_progress) {
+ PgfItem* item;
+ gu_buf_heap_pop(ps->before->agenda, pgf_item_prob_order, &item);
+ pgf_parsing_item(ps, item);
+ }
+
+ while (ps->after != NULL) {
+ PgfParseState* tmp = ps->after->next;
+ ps->after->next = ps->before;
+ ps->before = ps->after;
+ ps->after = tmp;
+ }
+
+ return has_progress;
+}
+
static PgfExprProb*
-pgf_parse_result_next(PgfParseResult* pr)
+pgf_parse_result_next(PgfParsing* ps)
{
for (;;) {
- while (pgf_parsing_proceed(pr->state));
+ while (pgf_parsing_proceed(ps));
- if (gu_buf_length(pr->state->ps->expr_queue) == 0)
+ if (gu_buf_length(ps->expr_queue) == 0)
break;
PgfExprState* st;
- gu_buf_heap_pop(pr->state->ps->expr_queue, &pgf_expr_state_order, &st);
+ gu_buf_heap_pop(ps->expr_queue, &pgf_expr_state_order, &st);
#ifdef PGF_PARSER_DEBUG
#ifdef PGF_RESULT_DEBUG
@@ -2008,16 +2219,16 @@ pgf_parse_result_next(PgfParseResult* pr)
PgfCCat* ccat =
gu_seq_index(st->args, PgfPArg, st->arg_idx)->ccat;
- if (ccat->fid < pr->state->ps->concr->total_cats) {
+ if (ccat->fid < ps->concr->total_cats) {
st->ep.expr =
- gu_new_variant_i(pr->state->ps->out_pool,
+ gu_new_variant_i(ps->out_pool,
PGF_EXPR_APP, PgfExprApp,
.fun = st->ep.expr,
- .arg = pr->state->ps->meta_var);
+ .arg = ps->meta_var);
st->arg_idx++;
- gu_buf_heap_push(pr->state->ps->expr_queue, &pgf_expr_state_order, &st);
+ gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &st);
} else {
- pgf_result_predict(pr->state->ps, st, ccat);
+ pgf_result_predict(ps, st, ccat);
}
} else if (pgf_parse_result_is_new(st)) {
gu_buf_push(st->answers->exprs, PgfExprProb*, &st->ep);
@@ -2030,10 +2241,10 @@ pgf_parse_result_next(PgfParseResult* pr)
return &st->ep;
}
- PgfExprState* st3 = gu_new(PgfExprState, pr->state->ps->pool);
+ PgfExprState* st3 = gu_new(PgfExprState, ps->pool);
st3->answers = st2->answers;
st3->ep.expr =
- gu_new_variant_i(pr->state->ps->out_pool,
+ gu_new_variant_i(ps->out_pool,
PGF_EXPR_APP, PgfExprApp,
.fun = st2->ep.expr,
.arg = st->ep.expr);
@@ -2041,7 +2252,7 @@ pgf_parse_result_next(PgfParseResult* pr)
st3->args = st2->args;
st3->arg_idx = st2->arg_idx+1;
- gu_buf_heap_push(pr->state->ps->expr_queue, &pgf_expr_state_order, &st3);
+ gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &st3);
}
}
}
@@ -2052,181 +2263,111 @@ pgf_parse_result_next(PgfParseResult* pr)
static void
pgf_parse_result_enum_next(GuEnum* self, void* to, GuPool* pool)
{
- PgfParseResult* pr = gu_container(self, PgfParseResult, en);
- *(PgfExprProb**)to = pgf_parse_result_next(pr);
+ PgfParsing* ps = gu_container(self, PgfParsing, en);
+ *(PgfExprProb**)to = pgf_parse_result_next(ps);
}
-PgfExprEnum*
-pgf_parse_result(PgfParseState* state)
-{
-#ifdef PGF_COUNTS_DEBUG
- pgf_parsing_print_counts(state->ps);
-#endif
+static GuString
+pgf_parsing_last_token(PgfParsing* ps, GuPool* pool)
+{
+ if (ps->before == NULL)
+ return "";
- PgfParseResult* res = gu_new(PgfParseResult, state->ps->pool);
- res->state = state;
- res->en.next = pgf_parse_result_enum_next;
- return &res->en;
-}
+ size_t start = ps->before->end_offset;
+ while (start > 0) {
+ char c = ps->sentence[start-1];
+ if (gu_is_space(c))
+ break;
+ start--;
+ }
-void
-pgf_parse_print_chunks(PgfParseState* state)
-{
-/* if (state->ps->completed == NULL) {
- while (state->ps->completed == NULL) {
- if (!pgf_parsing_proceed(state))
- break;
- }
- if (state->ps->completed == NULL)
- return;
+ size_t end = ps->before->end_offset;
+ while (ps->sentence[end] != 0) {
+ char c = ps->sentence[end];
+ if (gu_is_space(c))
+ break;
+ end++;
}
-
- GuPool* tmp_pool = gu_new_pool();
- GuOut* out = gu_file_out(stdout, tmp_pool);
- GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
- GuExn* err = gu_exn(NULL, type, tmp_pool);
- PgfCCat* completed = state->ps->completed;
- if (gu_seq_length(completed->prods) == 0)
- return;
+ char* tok = gu_malloc(pool, end-start+1);
+ memcpy(tok, ps->sentence+start, (end-start));
+ tok[end-start] = 0;
+ return tok;
+}
- size_t n_args = 0;
- size_t arg_idx = 0;
- PgfCCat* ccat = NULL;
- PgfProductionMeta* pmeta = NULL;
+GU_DEFINE_TYPE(PgfParseError, abstract, _);
- PgfProduction prod = gu_seq_get(completed->prods, PgfProduction, 0);
- GuVariantInfo pi = gu_variant_open(prod);
- switch (pi.tag) {
- case PGF_PRODUCTION_APPLY:
- n_args = 1;
- arg_idx = 0;
- ccat = completed;
- break;
- case PGF_PRODUCTION_META:
- pmeta = pi.data;
- n_args = gu_seq_length(pmeta->args);
- arg_idx = 0;
- ccat = gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat;
- break;
- }
+GuEnum*
+pgf_parse(PgfConcr* concr, PgfCId cat, GuString sentence,
+ GuExn* err,
+ GuPool* pool, GuPool* out_pool)
+{
+ return pgf_parse_with_heuristics(concr, cat, sentence, -1.0, err, pool, out_pool);
+}
- PgfParseState* next = NULL;
- while (state != NULL) {
- PgfParseState* tmp = state->next;
- state->next = next;
- next = state;
- state = tmp;
+GuEnum*
+pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, GuString sentence,
+ double heuristics,
+ GuExn* err,
+ GuPool* pool, GuPool* out_pool)
+{
+ // Begin parsing a sentence with the specified category
+ PgfParsing* ps =
+ pgf_parsing_init(concr, cat, 0, sentence, heuristics, err, pool, out_pool);
+ if (ps == NULL) {
+ return NULL;
}
- int offset = 0;
-
- state = next;
- next = NULL;
- while (state != NULL) {
- if (state->ts != NULL)
- {
- if (ccat != NULL &&
- offset == ((ccat->conts->state != NULL) ? ccat->conts->state->offset : 0)) {
- PgfCCat *ccat2 = ccat;
- while (ccat2->conts != NULL) {
- ccat2 = ccat2->conts->ccat;
- }
-
- gu_putc('(', wtr, err);
- gu_string_write(ccat2->cnccat->abscat->name, wtr, err);
- gu_putc(' ', wtr, err);
- }
+#ifdef PGF_COUNTS_DEBUG
+ pgf_parsing_print_counts(ps);
+#endif
- gu_string_write(state->ts->tok, wtr, err);
- offset++;
-
- if (ccat != NULL &&
- ccat ==
- gu_map_get(state->generated_cats, ccat->conts, PgfCCat*)) {
- gu_putc(')', wtr, err);
-
- arg_idx++;
- ccat =
- (arg_idx >= n_args) ?
- NULL :
- gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat;
- }
-
- gu_putc(' ', wtr, err);
+ while (gu_buf_length(ps->expr_queue) == 0) {
+ if (!pgf_parsing_proceed(ps)) {
+ GuExnData* exn = gu_raise(err, PgfParseError);
+ exn->data = (void*) pgf_parsing_last_token(ps, exn->pool);
+ return NULL;
}
- PgfParseState* tmp = state->next;
- state->next = next;
- next = state;
- state = tmp;
+#ifdef PGF_COUNTS_DEBUG
+ pgf_parsing_print_counts(ps);
+#endif
}
- gu_putc('\n', wtr, err);
- gu_pool_free(tmp_pool);*/
+ // Now begin enumerating the resulting syntax trees
+ ps->en.next = pgf_parse_result_enum_next;
+ return &ps->en;
}
-// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat
-PgfParseState*
-pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx,
- double heuristics,
- GuPool* pool, GuPool* out_pool)
+GuEnum*
+pgf_complete(PgfConcr* concr, PgfCId cat, GuString sentence,
+ GuString prefix, GuExn *err, GuPool* pool)
{
- PgfCncCat* cnccat =
- gu_map_get(concr->cnccats, cat, PgfCncCat*);
- if (!cnccat)
+ // Begin parsing a sentence of the specified category
+ PgfParsing* ps =
+ pgf_parsing_init(concr, cat, 0, sentence, -1, err, pool, pool);
+ if (ps == NULL) {
return NULL;
-
- gu_assert(lin_idx < cnccat->n_lins);
-
- if (heuristics < 0) {
- heuristics = pgf_parsing_default_beam_size(concr);
}
- PgfParsing* ps =
- pgf_new_parsing(concr, heuristics, pool, out_pool);
- PgfParseState* state =
- pgf_new_parse_state(ps, NULL, NULL, pool);
-
- size_t n_ccats = gu_seq_length(cnccat->cats);
- for (size_t i = 0; i < n_ccats; i++) {
- PgfCCat* ccat = gu_seq_get(cnccat->cats, PgfCCat*, i);
- if (ccat != NULL) {
- if (ccat->prods == NULL) {
- // Empty category
- continue;
- }
-
- PgfItemConts* conts = gu_new(PgfItemConts, pool);
- conts->ccat = ccat;
- conts->lin_idx = lin_idx;
- conts->state = NULL;
- conts->items = gu_new_buf(PgfItem*, pool);
- conts->outside_prob = 0;
- conts->ref_count = 0;
- gu_buf_push(conts->items, PgfItem*, NULL);
+ // Tokenization
+ GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool);
+/* PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
+ while (!gu_exn_is_raised(lex_err)) {
+ // feed the token to get a new parse state
+ state = pgf_parser_next_state(state, tok);
+ if (state == NULL) {
+ return NULL;
+ }
-#ifdef PGF_COUNTS_DEBUG
- ps->cont_full_count++;
-#endif
+ tok = pgf_lexer_read_token(lexer, lex_err);
+ }*/
- size_t n_prods = gu_seq_length(ccat->prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_seq_get(ccat->prods, PgfProduction, i);
- PgfItem* item =
- pgf_new_item(conts, prod, pool, ps);
- gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
- }
+ if (gu_exn_caught(lex_err) != gu_type(GuEOF))
+ return NULL;
- PgfItem *item =
- pgf_new_item(conts, ps->meta_prod, pool, ps);
- item->inside_prob =
- ccat->cnccat->abscat->meta_prob;
- gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
- }
- }
- return state;
+ // Now begin enumerating the resulting syntax trees
+ return pgf_parsing_completions(ps, prefix);
}
void
@@ -2242,306 +2383,114 @@ pgf_parser_add_literal(PgfConcr *concr, PgfCId cat,
PgfLiteralCallback*, callback);
}
-typedef struct {
- GuMapItor fn;
- PgfTokens* tokens;
- PgfMorphoCallback* callback;
-} PgfMorphoFn;
-
static void
-pgf_morpho_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
-{
- PgfMorphoFn* clo = (PgfMorphoFn*) fn;
- PgfCFCat cfc = *((PgfCFCat*) key);
- PgfProductionBuf* prods = *((PgfProductionBuf**) value);
-
- if (prods == NULL)
- return;
-
- GuString analysis = cfc.ccat->cnccat->labels[cfc.lin_idx];
-
- size_t n_prods = gu_buf_length(prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_buf_get(prods, PgfProduction, i);
-
- GuVariantInfo i = gu_variant_open(prod);
- switch (i.tag) {
- case PGF_PRODUCTION_APPLY: {
- PgfProductionApply* papp = i.data;
-
- if (clo->tokens != NULL) {
- // match the tokens with the production
- size_t pos = 0;
- PgfSequence* seq = papp->fun->lins[cfc.lin_idx];
- size_t len = gu_seq_length(seq);
- for (size_t i = 0; i < len; i++) {
- PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
-
- GuVariantInfo i = gu_variant_open(sym);
- switch (i.tag) {
- case PGF_SYMBOL_KS: {
- PgfSymbolKS* symks = i.data;
-
- if (pos >= gu_seq_length(clo->tokens))
- goto cont;
-
- PgfToken tok1 = symks->token;
- PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++);
-
- if (strcmp(tok1, tok2) != 0)
- goto cont;
- }
- default:
- continue;
- }
- }
-
- if (pos != gu_seq_length(clo->tokens))
- goto cont;
- }
-
- PgfCId lemma = papp->fun->absfun->name;
- prob_t prob = papp->fun->absfun->ep.prob;
- clo->callback->callback(clo->callback,
- lemma, analysis, prob, err);
- }
- }
- cont:;
+pgf_morpho_iter(PgfProductionIdx* idx,
+ PgfMorphoCallback* callback,
+ GuExn* err)
+{
+ size_t n_entries = gu_buf_length(idx);
+ for (size_t i = 0; i < n_entries; i++) {
+ PgfProductionIdxEntry* entry =
+ gu_buf_index(idx, PgfProductionIdxEntry, i);
+
+ PgfCId lemma = entry->papp->fun->absfun->name;
+ GuString analysis = entry->ccat->cnccat->labels[entry->lin_idx];
+ prob_t prob = entry->papp->fun->absfun->ep.prob;
+ callback->callback(callback,
+ lemma, analysis, prob, err);
+ if (!gu_ok(err))
+ return;
}
}
-void
-pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer,
- PgfMorphoCallback* callback, GuExn* err)
+static int
+pgf_sequence_cmp_fn(GuOrder* self, const void* p1, const void* p2)
{
- GuPool* tmp_pool = gu_local_pool();
-
- GuBuf* tokens = gu_new_buf(PgfToken, tmp_pool);
- GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
+ (void) self;
+ GuString sent = (GuString) p1;
+ const PgfSequence* sp2 = p2;
- PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
- if (gu_exn_is_raised(lex_err)) {
- gu_raise(err, PgfExn);
- gu_pool_free(tmp_pool);
- return;
+ BIND_TYPE bind = BIND_HARD;
+ int res = pgf_symbols_cmp(&sent, strlen(sent), &bind, sp2->syms);
+ if (res == 0 && *sent != 0) {
+ res = 1;
}
- PgfProductionIdx* lexicon_idx =
- gu_map_get(concr->leftcorner_tok_idx, tok, PgfProductionIdx*);
- if (lexicon_idx == NULL) {
- gu_pool_free(tmp_pool);
- return;
- }
+ return res;
+}
- do {
- gu_buf_push(tokens, PgfToken, tok);
- tok = pgf_lexer_read_token(lexer, lex_err);
- } while (!gu_exn_is_raised(lex_err));
+static GuOrder pgf_sequence_order[1] = { { pgf_sequence_cmp_fn } };
- PgfMorphoFn clo = { { pgf_morpho_iter }, gu_buf_data_seq(tokens), callback };
- gu_map_iter(lexicon_idx, &clo.fn, err);
+void
+pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
+ PgfMorphoCallback* callback, GuExn* err)
+{
+ PgfSequence* seq = (PgfSequence*)
+ gu_seq_binsearch_(concr->sequences, pgf_sequence_order,
+ sizeof(PgfSequence), 0, (void*) sentence);
- gu_pool_free(tmp_pool);
+ if (seq != NULL && seq->idx != NULL)
+ pgf_morpho_iter(seq->idx, callback, err);
}
typedef struct {
GuEnum en;
- GuEnum* map_en1;
- GuEnum* map_en2;
-
- GuMapItor fn;
- PgfLeftcornerTokIdx* new_idx;
-
- GuPool* pool;
+ PgfSequences* sequences;
+ size_t seq_idx;
} PgfFullFormState;
-static void
-pgf_fullform_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
-{
- PgfFullFormState* st = gu_container(fn, PgfFullFormState, fn);
- PgfCFCat cfc = *((PgfCFCat*) key);
- PgfProductionBuf* prods = *((PgfProductionBuf**) value);
-
- if (prods == NULL)
- return;
-
- size_t n_prods = gu_buf_length(prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_buf_get(prods, PgfProduction, i);
-
- GuVariantInfo i = gu_variant_open(prod);
- switch (i.tag) {
- case PGF_PRODUCTION_APPLY: {
- PgfProductionApply* papp = i.data;
-
- PgfSequence* seq = papp->fun->lins[cfc.lin_idx];
- GuString tokens = pgf_get_tokens(seq, 0, st->pool);
-
- // create a new production index with keys that
- // are multiword units
- PgfProductionIdx* lexicon_idx =
- gu_map_get(st->new_idx, tokens, PgfProductionIdx*);
- if (lexicon_idx == NULL) {
- lexicon_idx = gu_map_type_new(PgfProductionIdx, st->pool);
- gu_map_put(st->new_idx, tokens, PgfProductionIdx*, lexicon_idx);
- }
-
- PgfProductionBuf* prods =
- gu_map_get(lexicon_idx, &cfc, PgfProductionBuf*);
- if (prods == NULL) {
- prods = gu_new_buf(PgfProduction, st->pool);
- gu_map_put(lexicon_idx, &cfc, PgfProductionBuf*, prods);
- }
-
- gu_buf_push(prods, PgfProduction, prod);
- }
- }
- }
-}
+struct PgfFullFormEntry {
+ GuString tokens;
+ PgfProductionIdx* idx;
+};
static void
gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
{
PgfFullFormState* st = gu_container(self, PgfFullFormState, en);
-
- for (;;) {
- if (st->new_idx == NULL) {
- GuMapKeyValue* kv = gu_next(st->map_en1, GuMapKeyValue*, pool);
- if (kv == NULL) {
- *((PgfFullFormEntry**)to) = NULL;
- return;
- }
-
- PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) kv->value);
-
- // we have an index by the first token but we must re-index
- // by taking into account the multiword units
- st->pool = pool;
- st->new_idx = gu_map_type_new(PgfLeftcornerTokIdx, pool);
- st->fn.fn = pgf_fullform_iter;
- gu_map_iter(lexicon_idx, &st->fn, NULL);
-
- st->map_en2 = gu_map_enum(st->new_idx, pool);
- }
- PgfFullFormEntry* entry =
- gu_next(st->map_en2, PgfFullFormEntry*, pool);
- if (entry != NULL) {
- *((PgfFullFormEntry**)to) = entry;
+ PgfFullFormEntry* entry = NULL;
+
+ size_t n_seqs = gu_seq_length(st->sequences);
+ while (st->seq_idx < n_seqs) {
+ PgfSymbols* syms = gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->syms;
+ GuString tokens = pgf_get_tokens(syms, 0, pool);
+ if (strlen(tokens) > 0 &&
+ gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->idx != NULL) {
+ entry = gu_new(PgfFullFormEntry, pool);
+ entry->tokens = tokens;
+ entry->idx = gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->idx;
+
+ st->seq_idx++;
break;
}
-
- st->new_idx = NULL;
+
+ st->seq_idx++;
}
+
+ *((PgfFullFormEntry**) to) = entry;
}
GuEnum*
pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool)
{
PgfFullFormState* st = gu_new(PgfFullFormState, pool);
- st->en.next = gu_fullform_enum_next;
- st->map_en1 = gu_map_enum(concr->leftcorner_tok_idx, pool);
- st->map_en2 = NULL;
- st->new_idx = NULL;
- st->pool = NULL;
+ st->en.next = gu_fullform_enum_next;
+ st->sequences = concr->sequences;
+ st->seq_idx = 0;
return &st->en;
}
GuString
pgf_fullform_get_string(PgfFullFormEntry* entry)
{
- return (GuString) entry->key;
+ return entry->tokens;
}
void
pgf_fullform_get_analyses(PgfFullFormEntry* entry,
PgfMorphoCallback* callback, GuExn* err)
{
- PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) entry->value);
- PgfMorphoFn clo = { { pgf_morpho_iter }, NULL, callback };
- gu_map_iter(lexicon_idx, &clo.fn, err);
-}
-
-static void
-pgf_parser_index_token(PgfConcr* concr,
- PgfToken tok,
- PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
- GuPool *pool)
-{
- PgfProductionIdx* set =
- gu_map_get(concr->leftcorner_tok_idx, tok, PgfProductionIdx*);
- if (set == NULL) {
- set = gu_map_type_new(PgfProductionIdx, pool);
- gu_map_put(concr->leftcorner_tok_idx, tok, PgfProductionIdx*, set);
- }
-
- PgfCFCat cfc = {ccat, lin_idx};
- PgfProductionBuf* prods = gu_map_get(set, &cfc, PgfProductionBuf*);
- if (prods == NULL) {
- prods = gu_new_buf(PgfProduction, pool);
- gu_map_put(set, &cfc, PgfProductionBuf*, prods);
- }
-
- gu_buf_push(prods, PgfProduction, prod);
-}
-
-static void
-pgf_parser_index_epsilon(PgfConcr* concr,
- PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
- GuPool *pool)
-{
- PgfCFCat cfc = {ccat, lin_idx};
- PgfProductionBuf* prods =
- gu_map_get(concr->epsilon_idx, &cfc, PgfProductionBuf*);
- if (prods == NULL) {
- prods = gu_new_buf(PgfProduction, pool);
- gu_map_put(concr->epsilon_idx, &cfc, PgfProductionBuf*, prods);
- }
-
- gu_buf_push(prods, PgfProduction, prod);
-}
-
-static void
-pgf_parser_index_symbol(PgfConcr* concr, PgfSymbol sym,
- PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
- GuPool *pool)
-{
- GuVariantInfo i = gu_variant_open(sym);
- switch (i.tag) {
- case PGF_SYMBOL_KS: {
- PgfSymbolKS* sks = i.data;
- pgf_parser_index_token(concr,
- sks->token,
- ccat, lin_idx, prod,
- pool);
- break;
- }
- case PGF_SYMBOL_KP: {
- PgfSymbolKP* skp = i.data;
- PgfSymbol sym =
- gu_seq_get(skp->default_form, PgfSymbol, 0);
- pgf_parser_index_symbol(concr, sym,
- ccat, lin_idx, prod,
- pool);
- for (size_t i = 0; i < skp->n_forms; i++) {
- sym = gu_seq_get(skp->forms[i].form, PgfSymbol, 0);
- pgf_parser_index_symbol(concr, sym,
- ccat, lin_idx, prod,
- pool);
- }
- break;
- }
- case PGF_SYMBOL_CAT:
- case PGF_SYMBOL_LIT:
- case PGF_SYMBOL_NE:
- case PGF_SYMBOL_BIND:
- case PGF_SYMBOL_VAR:
- // Nothing to be done here
- break;
- default:
- gu_impossible();
- }
+ pgf_morpho_iter(entry->idx, callback, err);
}
void
@@ -2559,15 +2508,14 @@ pgf_parser_index(PgfConcr* concr,
break;
PgfSequence* seq = papp->fun->lins[lin_idx];
- if (gu_seq_length(seq) > 0) {
- pgf_parser_index_symbol(concr, gu_seq_get(seq, PgfSymbol, 0),
- ccat, lin_idx, prod,
- pool);
- } else {
- pgf_parser_index_epsilon(concr,
- ccat, lin_idx, prod,
- pool);
+ if (seq->idx == NULL) {
+ seq->idx = gu_new_buf(PgfProductionIdxEntry, pool);
}
+
+ PgfProductionIdxEntry* entry = gu_buf_extend(seq->idx);
+ entry->ccat = ccat;
+ entry->lin_idx = lin_idx;
+ entry->papp = papp;
}
break;
case PGF_PRODUCTION_COERCE:
@@ -2631,24 +2579,3 @@ pgf_ccat_set_viterbi_prob(PgfCCat* ccat) {
return ccat->viterbi_prob;
}
-
-static bool
-pgf_cfcat_eq_fn(GuEquality* self, const void* a, const void* b)
-{
- PgfCFCat *x = (PgfCFCat *) a;
- PgfCFCat *y = (PgfCFCat *) b;
-
- return (x->ccat->fid == y->ccat->fid && x->lin_idx == y->lin_idx);
-}
-
-static GuHash
-pgf_cfcat_hash_fn(GuHasher* self, const void* a)
-{
- PgfCFCat *x = (PgfCFCat *) a;
- return ((x->ccat->fid << 16) ^ x->lin_idx);
-}
-
-GuHasher pgf_cfcat_hasher = {
- { pgf_cfcat_eq_fn },
- pgf_cfcat_hash_fn
-};