summary | refs | log | tree | commit | diff
path: root/src/runtime
diff options
context:
space:
mode:
author: kr.angelov <kr.angelov@gmail.com> 2014-04-17 11:00:27 +0000
committer: kr.angelov <kr.angelov@gmail.com> 2014-04-17 11:00:27 +0000
commit: b0b27a1b3212e668178fb61e4ca20200b7af547c (patch)
tree: 0406bd34ddf90a25b290a0b663b90f709054eff6 /src/runtime
parent: 5cb5cdd31eb2168ed823088c40585d48686df112 (diff)
now word completion from Java works. It could be made better
Diffstat (limited to 'src/runtime')
-rw-r--r-- src/runtime/c/pgf/parser.c 355
1 files changed, 132 insertions, 223 deletions
diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c
index f944bc88d..f805ca137 100644
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -54,7 +54,9 @@ typedef struct {
int max_fid;
PgfParseState *before;
PgfParseState *after;
- PgfExprEnum en; // enumeration for the generated trees
+ PgfToken prefix;
+ PgfTokenProb* tp;
+ PgfExprEnum en; // enumeration for the generated trees/tokens
#ifdef PGF_COUNTS_DEBUG
int item_full_count;
int item_real_count;
@@ -1278,7 +1280,7 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset, BIND_TYPE bind_type)
}
size_t end_offset = start_offset;
- GuString current = ps->sentence + start_offset;
+ GuString current = ps->sentence + end_offset;
size_t len = strlen(current);
while (skip_space(&current, &len)) {
end_offset++;
@@ -1317,17 +1319,25 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
GuString current = ps->sentence + ps->before->end_offset;
size_t len = strlen(current);
- if (!ps->before->needs_bind && cmp_string(&current, &len, tok) == 0) {
- PgfParseState* state =
- pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE);
- if (state->next == NULL) {
- state->viterbi_prob =
- item->inside_prob+item->conts->outside_prob;
+ if (ps->prefix != NULL && ps->sentence[ps->before->end_offset] == 0) {
+ if (gu_string_is_prefix(ps->prefix, tok)) {
+ ps->tp = gu_new(PgfTokenProb, ps->out_pool);
+ ps->tp->tok = tok;
+ ps->tp->prob = item->inside_prob + item->conts->outside_prob;
}
+ } else {
+ if (!ps->before->needs_bind && cmp_string(&current, &len, tok) == 0) {
+ PgfParseState* state =
+ pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE);
+ if (state->next == NULL) {
+ state->viterbi_prob =
+ item->inside_prob+item->conts->outside_prob;
+ }
- gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
- } else {
- pgf_item_free(ps, item);
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
+ } else {
+ pgf_item_free(ps, item);
+ }
}
}
@@ -1361,53 +1371,68 @@ pgf_parsing_td_predict(PgfParsing* ps,
item->inside_prob-conts->ccat->viterbi_prob+
item->conts->outside_prob;
- // Top-down prediction for syntactic rules
- for (size_t i = 0; i < ccat->n_synprods; i++) {
- PgfProduction prod =
- gu_seq_get(ccat->prods, PgfProduction, i);
- pgf_parsing_production(ps, ps->before, conts, prod);
- }
-
- // Bottom-up prediction for lexical and epsilon rules
- size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
- for (size_t i = 0; i < n_idcs; i++) {
- PgfLexiconIdxEntry* lentry =
- gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
- PgfParseState* state =
- pgf_new_parse_state(ps, lentry->offset, lentry->bind_type);
+ if (ps->prefix != NULL) {
+ // We do completion:
+ // - top-down prediction for both syntactic and lexical rules
+ size_t n_prods;
+ if (ccat->fid < ps->concr->total_cats) // in grammar
+ n_prods = gu_seq_length(ccat->prods);
+ else
+ n_prods = ccat->n_synprods;
+ for (size_t i = 0; i < n_prods; i++) {
+ PgfProduction prod =
+ gu_seq_get(ccat->prods, PgfProduction, i);
+ pgf_parsing_production(ps, ps->before, conts, prod);
+ }
+ } else {
+ // Top-down prediction for syntactic rules
+ for (size_t i = 0; i < ccat->n_synprods; i++) {
+ PgfProduction prod =
+ gu_seq_get(ccat->prods, PgfProduction, i);
+ pgf_parsing_production(ps, ps->before, conts, prod);
+ }
- if (state != NULL) {
- PgfProductionIdxEntry key;
- key.ccat = ccat;
- key.lin_idx = lin_idx;
- key.papp = NULL;
- PgfProductionIdxEntry* value =
- gu_seq_binsearch(gu_buf_data_seq(lentry->idx),
- pgf_production_idx_entry_order,
- PgfProductionIdxEntry, &key);
-
- if (value != NULL) {
- pgf_parsing_predict_lexeme(ps, state, conts, value);
-
- PgfProductionIdxEntry* start =
- gu_buf_data(lentry->idx);
- PgfProductionIdxEntry* end =
- start + gu_buf_length(lentry->idx)-1;
-
- PgfProductionIdxEntry* left = value-1;
- while (left >= start &&
- value->ccat->fid == left->ccat->fid &&
- value->lin_idx == left->lin_idx) {
- pgf_parsing_predict_lexeme(ps, state, conts, left);
- left--;
- }
+ // Bottom-up prediction for lexical and epsilon rules
+ size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
+ for (size_t i = 0; i < n_idcs; i++) {
+ PgfLexiconIdxEntry* lentry =
+ gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
+ PgfParseState* state =
+ pgf_new_parse_state(ps, lentry->offset, lentry->bind_type);
- PgfProductionIdxEntry* right = value+1;
- while (right <= end &&
- value->ccat->fid == right->ccat->fid &&
- value->lin_idx == right->lin_idx) {
- pgf_parsing_predict_lexeme(ps, state, conts, right);
- right++;
+ if (state != NULL) {
+ PgfProductionIdxEntry key;
+ key.ccat = ccat;
+ key.lin_idx = lin_idx;
+ key.papp = NULL;
+ PgfProductionIdxEntry* value =
+ gu_seq_binsearch(gu_buf_data_seq(lentry->idx),
+ pgf_production_idx_entry_order,
+ PgfProductionIdxEntry, &key);
+
+ if (value != NULL) {
+ pgf_parsing_predict_lexeme(ps, state, conts, value);
+
+ PgfProductionIdxEntry* start =
+ gu_buf_data(lentry->idx);
+ PgfProductionIdxEntry* end =
+ start + gu_buf_length(lentry->idx)-1;
+
+ PgfProductionIdxEntry* left = value-1;
+ while (left >= start &&
+ value->ccat->fid == left->ccat->fid &&
+ value->lin_idx == left->lin_idx) {
+ pgf_parsing_predict_lexeme(ps, state, conts, left);
+ left--;
+ }
+
+ PgfProductionIdxEntry* right = value+1;
+ while (right <= end &&
+ value->ccat->fid == right->ccat->fid &&
+ value->lin_idx == right->lin_idx) {
+ pgf_parsing_predict_lexeme(ps, state, conts, right);
+ right++;
+ }
}
}
}
@@ -1871,6 +1896,8 @@ pgf_new_parsing(PgfConcr* concr, GuString sentence,
ps->ccat_full_count = 0;
ps->prod_full_count = 0;
#endif
+ ps->prefix = NULL;
+ ps->tp = NULL;
ps->free_item = NULL;
ps->heuristic_factor = 0;
ps->meta_prob = INFINITY;
@@ -1906,157 +1933,6 @@ void pgf_parsing_print_counts(PgfParsing* ps)
}
#endif
-/*static bool
-*************
-typedef struct {
- PgfTokenState ts;
- PgfToken tok;
- PgfProductionIdx *lexicon_idx;
-} PgfRealTokenState;
-
-static bool
-pgf_real_match_token(PgfTokenState* ts, PgfToken tok, PgfItem* item)
-{
- return strcmp(gu_container(ts, PgfRealTokenState, ts)->tok, tok) == 0;
-}
-
-static PgfToken
-pgf_real_get_token(PgfTokenState* ts) {
- return gu_container(ts, PgfRealTokenState, ts)->tok;
-}
-
-static PgfProductionIdx*
-pgf_real_get_lexicon_idx(PgfTokenState* ts) {
- return gu_container(ts, PgfRealTokenState, ts)->lexicon_idx;
-}
-
-static PgfTokenFn pgf_tsfn_PgfRealTokenState = {
- pgf_real_match_token,
- pgf_real_get_token,
- pgf_real_get_lexicon_idx
-};
-
-PgfParseState*
-pgf_parser_next_state(PgfParseState* prev, PgfToken tok)
-{
-#ifdef PGF_COUNTS_DEBUG
- pgf_parsing_print_counts(prev->ps);
-#endif
-
- PgfRealTokenState* ts =
- pgf_new_token_state(PgfRealTokenState, prev->ps->pool);
- ts->tok = tok;
- ts->lexicon_idx = gu_map_get(prev->ps->concr->leftcorner_tok_idx,
- tok, PgfProductionIdx*);
- if (ts->lexicon_idx != NULL) {
- PgfLexiconFn clo = { { pgf_parser_compute_lexicon_prob }, &ts->ts };
- gu_map_iter(ts->lexicon_idx, &clo.fn, NULL);
- }
- if (ts->ts.lexical_prob == INFINITY)
- ts->ts.lexical_prob = 0;
-
- PgfParseState* state =
- pgf_new_parse_state(prev->ps, prev, &ts->ts, prev->ps->pool);
-
- while (gu_buf_length(state->agenda) == 0) {
- if (!pgf_parsing_proceed(state))
- return NULL;
- }
-
- return state;
-}
-
-typedef struct {
- PgfTokenState ts;
- GuEnum en;
- GuString prefix;
- PgfTokenProb* tp;
- GuPool* pool;
- PgfParseState* state;
-} PgfPrefixTokenState;
-
-static bool
-^ ^ ^ ^ ^ ^ ^
-pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item)
-{
- PgfPrefixTokenState* ts =
- gu_container(ts0, PgfPrefixTokenState, ts);
-
- if (gu_string_is_prefix(ts->prefix, tok)) {
- size_t lin_idx;
- PgfSequence* seq;
- pgf_item_sequence(item, &lin_idx, &seq, ts->pool);
-
- uint16_t seq_idx = item->seq_idx;
- uint8_t tok_idx = item->tok_idx;
-
- // go one token back
- if (tok_idx > 0)
- tok_idx--;
- else
- seq_idx--;
-
- ts->tp = gu_new(PgfTokenProb, ts->pool);
- ts->tp->tok =
- pgf_get_tokens(seq, seq_idx, tok_idx, ts->pool);
- ts->tp->cat = item->conts->ccat->cnccat->abscat->name;
- ts->tp->prob = item->inside_prob+item->conts->outside_prob;
- }
-
- return false;
-}
-
-static PgfToken
-pgf_prefix_get_token(PgfTokenState* ts) {
- return "";
-}
-
-static PgfProductionIdx*
-pgf_prefix_get_lexicon_idx(PgfTokenState* ts) {
- return NULL;
-}
-
-static PgfTokenFn pgf_tsfn_PgfPrefixTokenState = {
- pgf_prefix_match_token,
- pgf_prefix_get_token,
- pgf_prefix_get_lexicon_idx
-};
-
-static void
-pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool)
-{
- PgfPrefixTokenState* ts =
- gu_container(self, PgfPrefixTokenState, en);
-
- ts->tp = NULL;
- ts->pool = pool;
- while (ts->tp == NULL) {
- if (!pgf_parsing_proceed(ts->state))
- break;
- }
-
- *((PgfTokenProb**)to) = ts->tp;
-}*/
-
-GuEnum*
-pgf_parsing_completions(PgfParsing* ps, GuString prefix)
-{
-#ifdef PGF_COUNTS_DEBUG
- pgf_parsing_print_counts(ps);
-#endif
-
-/* PgfPrefixTokenState* ts =
- pgf_new_token_state(PgfPrefixTokenState, prev->ps->pool);
- ts->en.next = pgf_parser_completions_next;
- ts->prefix = prefix;
- ts->tp = NULL;
- ts->state =
- pgf_new_parse_state(prev->ps, prev, &ts->ts);
-
- return &ts->en;*/
- return NULL;
-}
-
static int
cmp_expr_state(GuOrder* self, const void* a, const void* b)
{
@@ -2501,35 +2377,68 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, GuString sentence,
return &ps->en;
}
+static void
+pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool)
+{
+ PgfParsing* ps =
+ gu_container(self, PgfParsing, en);
+
+ ps->tp = NULL;
+ while (ps->tp == NULL) {
+ if (!pgf_parsing_proceed(ps))
+ break;
+
+#ifdef PGF_COUNTS_DEBUG
+ pgf_parsing_print_counts(ps);
+#endif
+ }
+
+ *((PgfTokenProb**)to) = ps->tp;
+}
+
GuEnum*
pgf_complete(PgfConcr* concr, PgfCId cat, GuString sentence,
GuString prefix, GuExn *err, GuPool* pool)
{
- // Begin parsing a sentence of the specified category
+ if (concr->sequences == NULL ||
+ concr->pre_sequences == NULL ||
+ concr->cnccats == NULL) {
+ GuExnData* err_data = gu_raise(err, PgfExn);
+ if (err_data) {
+ err_data->data = "The concrete syntax is not loaded";
+ return NULL;
+ }
+ }
+
+ // Begin parsing a sentence with the specified category
PgfParsing* ps =
- pgf_parsing_init(concr, cat, 0, sentence, -1, err, pool, pool);
+ pgf_parsing_init(concr, cat, 0, sentence, -1.0, err, pool, pool);
if (ps == NULL) {
return NULL;
}
- // Tokenization
- GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool);
-/* PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
- while (!gu_exn_is_raised(lex_err)) {
- // feed the token to get a new parse state
- state = pgf_parser_next_state(state, tok);
- if (state == NULL) {
+#ifdef PGF_COUNTS_DEBUG
+ pgf_parsing_print_counts(ps);
+#endif
+
+ size_t len = strlen(ps->sentence);
+ while (ps->before->end_offset < len) {
+ if (!pgf_parsing_proceed(ps)) {
+ GuExnData* exn = gu_raise(err, PgfParseError);
+ exn->data = (void*) pgf_parsing_last_token(ps, exn->pool);
return NULL;
}
- tok = pgf_lexer_read_token(lexer, lex_err);
- }*/
-
- if (gu_exn_caught(lex_err) != gu_type(GuEOF))
- return NULL;
+#ifdef PGF_COUNTS_DEBUG
+ pgf_parsing_print_counts(ps);
+#endif
+ }
- // Now begin enumerating the resulting syntax trees
- return pgf_parsing_completions(ps, prefix);
+ // Now begin enumerating the completions
+ ps->en.next = pgf_parser_completions_next;
+ ps->prefix = prefix;
+ ps->tp = NULL;
+ return &ps->en;
}
static void