summary | refs | log | tree | commit | diff
path: root/src/runtime
diff options
context:
space:
mode:
author krasimir <krasimir@chalmers.se> 2017-05-30 11:42:32 +0000
committer krasimir <krasimir@chalmers.se> 2017-05-30 11:42:32 +0000
commit bf3557cf9019af6788fe07ce1af7182113b41c53 (patch)
tree 93e79d9087c9a0696f55932af8d30cd2655eeabb /src/runtime
parent 4c59db0dac70ac6815d9002baaa91dac1c9559db (diff)
finally a smoothed and more precise ranking for lookups
Diffstat (limited to 'src/runtime')
-rw-r--r-- src/runtime/c/pgf/lookup.c 138
1 files changed, 88 insertions, 50 deletions
diff --git a/src/runtime/c/pgf/lookup.c b/src/runtime/c/pgf/lookup.c
index 8a101e931..c86a990ec 100644
--- a/src/runtime/c/pgf/lookup.c
+++ b/src/runtime/c/pgf/lookup.c
@@ -20,7 +20,6 @@ typedef struct {
typedef struct {
PgfAbsFun* fun;
- size_t count;
PgfMetaId args[0];
} PgfAbsProduction;
@@ -35,7 +34,7 @@ pgf_print_abs_production(PgfMetaId id,
for (size_t i = 0; i < n_hypos; i++) {
gu_printf(out,err," ?%d", prod->args[i]);
}
- gu_printf(out,err," (%d)\n",prod->count);
+ gu_putc('\n',out,err);
}
static void
@@ -112,12 +111,18 @@ typedef struct {
GuPool* pool;
} PgfSpineBuilder;
+typedef struct {
+ PgfToken token;
+ size_t n_funs;
+ PgfAbsFun** funs;
+} PgfInputToken;
+
static PgfAbsProduction*
-pgf_lookup_new_production(PgfAbsFun* fun, GuPool *pool) {
+pgf_lookup_new_production(PgfAbsFun* fun, GuPool *pool)
+{
size_t n_hypos = gu_seq_length(fun->type->hypos);
PgfAbsProduction* prod = gu_new_flex(pool, PgfAbsProduction, args, n_hypos);
- prod->fun = fun;
- prod->count = 0;
+ prod->fun = fun;
for (size_t i = 0; i < n_hypos; i++) {
prod->args[i] = 0;
}
@@ -166,14 +171,13 @@ pgf_lookup_add_spine_leaf(PgfSpineBuilder* builder, PgfAbsFun *fun)
{
PgfMetaId id = pgf_lookup_add_spine_nodes(builder, fun->type->cid);
PgfAbsProduction* prod = pgf_lookup_new_production(fun, builder->pool);
- prod->count = 1;
pgf_lookup_add_production(builder, id, prod);
}
static GuBuf*
-pgf_lookup_build_spine(GuMap* lexicon_idx, GuMap* function_idx,
- GuString tok, PgfType* typ, PgfMetaId* meta_id,
+pgf_lookup_build_spine(GuMap* function_idx,
+ PgfInputToken* tok, PgfType* typ, PgfMetaId* meta_id,
GuPool* pool)
{
PgfSpineBuilder builder;
@@ -181,17 +185,11 @@ pgf_lookup_build_spine(GuMap* lexicon_idx, GuMap* function_idx,
builder.meta_ids = gu_new_string_map(PgfMetaId, &gu_null_struct, pool);
builder.spine = gu_new_buf(GuBuf*, pool);
builder.pool = pool;
-
+
gu_buf_push(builder.spine, GuBuf*, NULL);
- GuBuf* funs = gu_map_get(lexicon_idx, tok, GuBuf*);
- if (funs != NULL) {
- size_t n_funs = gu_buf_length(funs);
- for (size_t i = 0; i < n_funs; i++) {
- PgfAbsFun* absfun =
- gu_buf_get(funs, PgfAbsFun*, i);
- pgf_lookup_add_spine_leaf(&builder, absfun);
- }
+ for (size_t i = 0; i < tok->n_funs; i++) {
+ pgf_lookup_add_spine_leaf(&builder, tok->funs[i]);
}
*meta_id = gu_map_get(builder.meta_ids, typ->cid, PgfMetaId);
@@ -264,7 +262,6 @@ pgf_lookup_merge_cats(GuBuf* spine, GuMap* pairs,
if (prod1->fun == prod2->fun) {
PgfAbsProduction* prod =
pgf_lookup_new_production(prod1->fun, pool);
- prod->count = prod1->count+prod2->count;
size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
for (size_t l = 0; l < n_hypos; l++) {
prod->args[l] =
@@ -274,7 +271,7 @@ pgf_lookup_merge_cats(GuBuf* spine, GuMap* pairs,
pool);
}
gu_buf_push(id_prods, PgfAbsProduction*, prod);
-
+
count++;
}
}
@@ -282,7 +279,6 @@ pgf_lookup_merge_cats(GuBuf* spine, GuMap* pairs,
if (count == 0) {
PgfAbsProduction* prod =
pgf_lookup_new_production(prod1->fun, pool);
- prod->count = prod1->count;
size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
for (size_t l = 0; l < n_hypos; l++) {
prod->args[l] =
@@ -309,11 +305,10 @@ pgf_lookup_merge_cats(GuBuf* spine, GuMap* pairs,
break;
}
}
-
+
if (!found) {
PgfAbsProduction* prod =
pgf_lookup_new_production(prod2->fun, pool);
- prod->count = prod2->count;
size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
for (size_t l = 0; l < n_hypos; l++) {
prod->args[l] =
@@ -359,7 +354,7 @@ typedef struct {
GuBuf* stack;
GuBuf* expr_tokens;
GuBuf* ctrees;
- int fid;
+ PgfAbsFun** curr_absfun;
GuPool* pool;
} PgfLookupState;
@@ -380,7 +375,6 @@ pgf_lookup_extract_app(PgfLookupState* st,
size_t n_args, PgfMetaId* args)
{
GuChoiceMark mark = gu_choice_mark(st->choice);
- int save_fid = st->fid;
PgfCncTree ret = gu_null_variant;
PgfCncTreeApp* capp =
@@ -417,7 +411,6 @@ redo:;
} else {
int index = gu_choice_next(st->choice, gu_buf_length(coercions));
if (index < 0) {
- st->fid = save_fid;
gu_choice_reset(st->choice, mark);
if (!gu_choice_advance(st->choice))
return gu_null_variant;
@@ -499,7 +492,7 @@ pgf_lookup_extract(PgfLookupState* st, PgfMetaId meta_id, PgfCCat *ccat)
args, 1, &ret, st->pool);
capp->ccat = ccat;
capp->fun = gu_seq_get(ccat->lindefs, PgfCncFun*, index);
- capp->fid = st->fid++;
+ capp->fid = 0;
capp->n_vars = 0;
capp->context = NULL;
capp->n_args = 1;
@@ -567,12 +560,12 @@ done:
}
static GuBuf*
-pgf_lookup_tokenize(GuString buf, size_t len, GuPool* pool)
+pgf_lookup_tokenize(GuMap* lexicon_idx, GuString sentence, GuPool* pool)
{
- GuBuf* tokens = gu_new_buf(GuString, pool);
+ GuBuf* tokens = gu_new_buf(PgfInputToken, pool);
GuUCS c = ' ';
- const uint8_t* p = (const uint8_t*) buf;
+ const uint8_t* p = (const uint8_t*) sentence;
for (;;) {
while (gu_ucs_is_space(c)) {
c = gu_utf8_decode(&p);
@@ -586,14 +579,23 @@ pgf_lookup_tokenize(GuString buf, size_t len, GuPool* pool)
}
const uint8_t* end = p-1;
- size_t len = end-start;
- GuString tok = gu_malloc(pool, len+1);
- memcpy((uint8_t*) tok, start, len);
- ((uint8_t*) tok)[len] = 0;
+ PgfInputToken* tok = gu_buf_extend(tokens);
- gu_buf_push(tokens, GuString, tok);
+ size_t len = end-start;
+ tok->token = gu_malloc(pool, len+1);
+ memcpy((uint8_t*) tok->token, start, len);
+ ((uint8_t*) tok->token)[len] = 0;
+
+ GuBuf* funs = gu_map_get(lexicon_idx, tok->token, GuBuf*);
+ if (funs != NULL) {
+ tok->n_funs = gu_buf_length(funs);
+ tok->funs = gu_buf_data(funs);
+ } else {
+ tok->n_funs = 0;
+ tok->funs = NULL;
+ }
}
-
+
return tokens;
}
@@ -610,11 +612,25 @@ pgf_lookup_compute_kernel_helper(GuBuf* sentence_tokens, GuBuf* expr_tokens,
for (size_t l = 0; l < i; l++) {
matrix[l + dim*j] = score;
for (size_t k = j; k > 0; k--) {
- GuString sentence_token = gu_buf_get(sentence_tokens, GuString, l);
- GuString expr_token = gu_buf_get(expr_tokens, GuString, k-1);
+ PgfInputToken* sentence_token = gu_buf_index(sentence_tokens, PgfInputToken, l);
+ PgfInputToken* expr_token = gu_buf_index(expr_tokens, PgfInputToken, k-1);
- if (strcmp(sentence_token, expr_token) == 0) {
+ if (strcmp(sentence_token->token, expr_token->token) == 0) {
score += 1 + pgf_lookup_compute_kernel_helper(sentence_tokens, expr_tokens, matrix, l, k-1);
+ } else {
+ bool match = false;
+ for (size_t i = 0; i < sentence_token->n_funs; i++) {
+ for (size_t j = 0; j < expr_token->n_funs; j++) {
+ if (sentence_token->funs[i] == expr_token->funs[j]) {
+ match = true;
+ goto done;
+ }
+ }
+ }
+ done:
+ if (match) {
+ score += 0.5 + pgf_lookup_compute_kernel_helper(sentence_tokens, expr_tokens, matrix, l, k-1);
+ }
}
}
}
@@ -720,16 +736,40 @@ pgf_lookup_enum_next(GuEnum* self, void* to, GuPool* pool)
}
static void
-pgf_lookup_symbol_token(PgfLinFuncs** funcs, PgfToken tok)
+pgf_lookup_symbol_token(PgfLinFuncs** self, PgfToken token)
+{
+ PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
+ PgfInputToken* tok = gu_buf_extend(st->expr_tokens);
+ tok->token = token;
+ tok->n_funs = st->curr_absfun ? 1 : 0;
+ tok->funs = st->curr_absfun;
+}
+
+static void
+pgf_lookup_begin_phrase(PgfLinFuncs** self, PgfCId cat, int fid, int lindex, PgfCId funname)
+{
+ PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
+
+ PgfAbsFun* absfun = gu_seq_binsearch(st->concr->abstr->funs, pgf_absfun_order, PgfAbsFun, funname);
+ if (absfun != NULL) {
+ st->curr_absfun = gu_new(PgfAbsFun*, st->pool);
+ *st->curr_absfun = absfun;
+ } else {
+ st->curr_absfun = NULL;
+ }
+}
+
+static void
+pgf_lookup_end_phrase(PgfLinFuncs** self, PgfCId cat, int fid, int lindex, PgfCId fun)
{
- PgfLookupState* st = gu_container(funcs, PgfLookupState, funcs);
- gu_buf_push(st->expr_tokens, PgfToken, tok);
+ PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
+ st->curr_absfun = NULL;
}
static PgfLinFuncs pgf_lookup_lin_funcs = {
.symbol_token = pgf_lookup_symbol_token,
- .begin_phrase = NULL,
- .end_phrase = NULL,
+ .begin_phrase = pgf_lookup_begin_phrase,
+ .end_phrase = pgf_lookup_end_phrase,
.symbol_ne = NULL,
.symbol_bind = NULL,
.symbol_capit = NULL
@@ -773,9 +813,7 @@ pgf_lookup_sentence(PgfConcr* concr, PgfType* typ, GuString sentence, GuPool* po
GuPool *work_pool = gu_new_pool();
GuBuf* sentence_tokens =
- pgf_lookup_tokenize(sentence,
- strlen(sentence),
- work_pool);
+ pgf_lookup_tokenize(lexicon_idx, sentence, work_pool);
PgfMetaId meta_id1 = 0;
GuBuf* join = gu_new_buf(GuBuf*, pool);
@@ -783,11 +821,11 @@ pgf_lookup_sentence(PgfConcr* concr, PgfType* typ, GuString sentence, GuPool* po
size_t n_tokens = gu_buf_length(sentence_tokens);
for (size_t i = 0; i < n_tokens; i++) {
- GuString tok = gu_buf_get(sentence_tokens, GuString, i);
+ PgfInputToken* tok = gu_buf_index(sentence_tokens, PgfInputToken, i);
PgfMetaId meta_id2 = 0;
GuBuf* spine =
- pgf_lookup_build_spine(lexicon_idx, function_idx,
+ pgf_lookup_build_spine(function_idx,
tok, typ, &meta_id2,
work_pool);
@@ -810,9 +848,9 @@ pgf_lookup_sentence(PgfConcr* concr, PgfType* typ, GuString sentence, GuPool* po
st.start_id= meta_id1;
st.choice = gu_new_choice(work_pool);
st.stack = gu_new_buf(PgfMetaId, work_pool);
- st.expr_tokens=gu_new_buf(GuString, work_pool);
+ st.expr_tokens=gu_new_buf(PgfInputToken, work_pool);
st.ctrees = gu_new_buf(PgfCncTreeScore, pool);
- st.fid = 0;
+ st.curr_absfun= NULL;
st.pool = pool;
GuChoiceMark mark = gu_choice_mark(st.choice);