summaryrefslogtreecommitdiff
path: root/src/runtime/c
diff options
context:
space:
mode:
authorkr.angelov <kr.angelov@gmail.com>2013-05-01 06:09:55 +0000
committerkr.angelov <kr.angelov@gmail.com>2013-05-01 06:09:55 +0000
commit22f44ef61f99acdec5d19f336bb80f6bb3a4e8b7 (patch)
tree4ee1496edc1c5df1b11464ce8425071c3ef11994 /src/runtime/c
parent41bccf5737544a6981dc6a17bb4bb8116ace7937 (diff)
word completion in the C runtime. The runtime/python/test.py example is now using readline with word completion
Diffstat (limited to 'src/runtime/c')
-rw-r--r--src/runtime/c/gu/string.c52
-rw-r--r--src/runtime/c/gu/string.h4
-rw-r--r--src/runtime/c/pgf/lexer.h6
-rw-r--r--src/runtime/c/pgf/parser.c225
-rw-r--r--src/runtime/c/pgf/parser.h4
-rw-r--r--src/runtime/c/pgf/pgf.c31
-rw-r--r--src/runtime/c/pgf/pgf.h4
7 files changed, 275 insertions, 51 deletions
diff --git a/src/runtime/c/gu/string.c b/src/runtime/c/gu/string.c
index 526a6871c..7dc90e1e3 100644
--- a/src/runtime/c/gu/string.c
+++ b/src/runtime/c/gu/string.c
@@ -311,6 +311,58 @@ gu_string_to_double(GuString s, double *res)
return true;
}
+bool
+gu_string_is_prefix(GuString s1, GuString s2)
+{
+ GuWord w1 = s1.w_;
+ uint8_t buf1[sizeof(GuWord)];
+ size_t sz1;
+ char* str1;
+ if (w1 & 1) {
+ sz1 = (w1 & 0xff) >> 1;
+ gu_assert(sz1 <= sizeof(GuWord));
+ size_t i = sz1;
+ while (i > 0) {
+ w1 >>= 8;
+ buf1[--i] = w1 & 0xff;
+ }
+ str1 = (char*) buf1;
+ } else {
+ uint8_t* p = (void*) w1;
+ sz1 = (p[0] == 0) ? ((size_t*) p)[-1] : p[0];
+ str1 = (char*) &p[1];
+ }
+
+ GuWord w2 = s2.w_;
+ uint8_t buf2[sizeof(GuWord)];
+ size_t sz2;
+ char* str2;
+ if (w2 & 1) {
+ sz2 = (w2 & 0xff) >> 1;
+ gu_assert(sz2 <= sizeof(GuWord));
+ size_t i = sz2;
+ while (i > 0) {
+ w2 >>= 8;
+ buf2[--i] = w2 & 0xff;
+ }
+ str2 = (char*) buf2;
+ } else {
+ uint8_t* p = (void*) w2;
+ sz2 = (p[0] == 0) ? ((size_t*) p)[-1] : p[0];
+ str2 = (char*) &p[1];
+ }
+
+ while (sz1 > 0 && sz2 > 0) {
+ if (*str1 != *str2)
+ return false;
+
+ str1++; sz1--;
+ str2++; sz2--;
+ }
+
+ return true;
+}
+
GuWord
gu_string_hash(GuString s)
{
diff --git a/src/runtime/c/gu/string.h b/src/runtime/c/gu/string.h
index b041518b3..37cb56ac2 100644
--- a/src/runtime/c/gu/string.h
+++ b/src/runtime/c/gu/string.h
@@ -69,6 +69,10 @@ gu_string_to_int(GuString s, int *res);
bool
gu_string_to_double(GuString s, double *res);
+
+bool
+gu_string_is_prefix(GuString s1, GuString s2);
+
#endif // GU_STRING_H_
#if defined(GU_HASH_H_) && !defined(GU_STRING_H_HASH_)
diff --git a/src/runtime/c/pgf/lexer.h b/src/runtime/c/pgf/lexer.h
index 270a7949b..d2992ee06 100644
--- a/src/runtime/c/pgf/lexer.h
+++ b/src/runtime/c/pgf/lexer.h
@@ -2,12 +2,18 @@
#define PGF_LEXER_H_
#include <gu/read.h>
+#include <pgf/expr.h>
/// A single lexical token
typedef GuString PgfToken;
typedef GuSeq PgfTokens; // -> PgfToken
typedef struct {
+ prob_t prob;
+ PgfToken tok;
+} PgfTokenProb;
+
+typedef struct {
PgfToken (*read_token)();
PgfToken tok;
} PgfLexer;
diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c
index f5e8afcde..c1362b88d 100644
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -46,7 +46,6 @@ typedef struct {
PgfConcr* concr;
GuPool* pool;
GuBuf* expr_queue;
- PgfItem* target;
PgfExpr meta_var;
PgfProduction meta_prod;
int max_fid;
@@ -76,11 +75,18 @@ GU_DEFINE_TYPE(PgfProductionIdx, GuMap,
gu_type(PgfCFCat), &pgf_cfcat_hasher,
gu_type(PgfProductionSeq), &gu_null_seq);
+typedef struct PgfTokenState PgfTokenState;
+
typedef struct {
- PgfToken tok;
- PgfProductionIdx* lexicon_idx;
+ bool (*match_token)(PgfTokenState* ts, PgfToken tok, PgfItem* item);
+ PgfToken (*get_token)(PgfTokenState* ts);
+ PgfProductionIdx* (*get_lexicon_idx)(PgfTokenState* ts);
+} PgfTokenFn;
+
+struct PgfTokenState {
+ PgfTokenFn* fn;
prob_t lexical_prob;
-} PgfTokenState;
+};
struct PgfParseState {
PgfParseState* next;
@@ -785,9 +791,8 @@ static void
pgf_parsing_add_transition(PgfParseState* before, PgfParseState* after,
PgfToken tok, PgfItem* item)
{
- if (gu_string_eq(tok, after->ts->tok)) {
+ if (after->ts->fn->match_token(after->ts, tok, item)) {
if (after->next == NULL) {
- after->ps->target = item;
after->viterbi_prob =
item->inside_prob+item->conts->outside_prob;
}
@@ -1076,20 +1081,31 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
item->inside_prob-conts->ccat->viterbi_prob+
item->conts->outside_prob;
+ size_t n_prods = ccat->n_synprods;
+ PgfProductionIdx* lexicon_idx = NULL;
+ if (after != NULL) {
+ lexicon_idx = after->ts->fn->get_lexicon_idx(after->ts);
+
+ // we don't know the current token.
+ // probably we just compute the list of completions
+ if (lexicon_idx == NULL)
+ n_prods = gu_seq_length(ccat->prods);
+ }
+
// Top-down prediction for syntactic rules
- PgfProductionSeq prods = ccat->prods;
- for (size_t i = 0; i < ccat->n_synprods; i++) {
+ for (size_t i = 0; i < n_prods; i++) {
PgfProduction prod =
- gu_seq_get(prods, PgfProduction, i);
+ gu_seq_get(ccat->prods, PgfProduction, i);
pgf_parsing_production(before, conts, prod);
}
// Bottom-up prediction for lexical rules
- if (after != NULL && after->ts->lexicon_idx != NULL) {
+
+ if (lexicon_idx != NULL) {
PgfCFCat cfc = {ccat, lin_idx};
PgfProductionSeq tok_prods =
- gu_map_get(after->ts->lexicon_idx, &cfc, PgfProductionSeq);
-
+ gu_map_get(lexicon_idx, &cfc, PgfProductionSeq);
+
if (!gu_seq_is_null(tok_prods)) {
size_t n_prods = gu_seq_length(tok_prods);
for (size_t i = 0; i < n_prods; i++) {
@@ -1141,20 +1157,24 @@ static void
pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after,
PgfItem* meta_item, prob_t meta_prob)
{
- PgfItem* item = pgf_item_copy(meta_item, before->ps->pool, before->ps);
- item->inside_prob += meta_prob;
-
- PgfSymbol prev = item->curr_sym;
- PgfSymbolKS* sks = (PgfSymbolKS*)
- gu_alloc_variant(PGF_SYMBOL_KS,
- sizeof(PgfSymbolKS)+sizeof(PgfSymbol),
- gu_alignof(PgfSymbolKS),
- &item->curr_sym, after->ps->pool);
- *((PgfSymbol*)(sks+1)) = prev;
- sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool);
- gu_seq_set(sks->tokens, PgfToken, 0, after->ts->tok);
-
- gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
+ PgfToken tok = after->ts->fn->get_token(after->ts);
+
+ if (!gu_string_eq(tok, gu_empty_string)) {
+ PgfItem* item = pgf_item_copy(meta_item, before->ps->pool, before->ps);
+ item->inside_prob += meta_prob;
+
+ PgfSymbol prev = item->curr_sym;
+ PgfSymbolKS* sks = (PgfSymbolKS*)
+ gu_alloc_variant(PGF_SYMBOL_KS,
+ sizeof(PgfSymbolKS)+sizeof(PgfSymbol),
+ gu_alignof(PgfSymbolKS),
+ &item->curr_sym, after->ps->pool);
+ *((PgfSymbol*)(sks+1)) = prev;
+ sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool);
+ gu_seq_set(sks->tokens, PgfToken, 0, tok);
+
+ gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
+ }
}
typedef struct {
@@ -1468,8 +1488,9 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
pgf_parsing_symbol(before, after, item, sym);
}
} else {
- PgfToken tok = (after != NULL) ? after->ts->tok
- : gu_empty_string;
+ PgfToken tok = (after != NULL)
+ ? after->ts->fn->get_token(after->ts)
+ : gu_empty_string;
PgfExprProb *ep = NULL;
bool accepted =
@@ -1563,7 +1584,7 @@ pgf_parsing_proceed(PgfParseState* state)
before = st;
}
}
-
+
prob_t state_delta =
(st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))*
state->ps->beam_size;
@@ -1623,7 +1644,6 @@ pgf_new_parsing(PgfConcr* concr, GuPool* pool)
ps->concr = concr;
ps->pool = pool;
ps->expr_queue = gu_new_buf(PgfExprState*, pool);
- ps->target = NULL;
ps->max_fid = concr->total_cats;
#ifdef PGF_COUNTS_DEBUG
ps->item_full_count = 0;
@@ -1702,20 +1722,14 @@ pgf_parser_compute_lexicon_prob(GuMapItor* fn, const void* key, void* value, GuE
}
}
+#define pgf_new_token_state(ty, pool) \
+ (ty*) pgf_new_token_state_(&pgf_tsfn_##ty, (PgfTokenState*) gu_new(ty, pool))
+
static PgfTokenState*
-pgf_new_token_state(PgfConcr *concr, PgfToken tok, GuPool* pool)
+pgf_new_token_state_(PgfTokenFn* fn, PgfTokenState* ts)
{
- PgfTokenState* ts = gu_new(PgfTokenState, pool);
- ts->tok = tok;
- ts->lexicon_idx = gu_map_get(concr->leftcorner_tok_idx,
- &tok, PgfProductionIdx*);
- ts->lexical_prob = INFINITY;
- if (ts->lexicon_idx != NULL) {
- PgfLexiconFn clo = { { pgf_parser_compute_lexicon_prob }, ts };
- gu_map_iter(ts->lexicon_idx, &clo.fn, NULL);
- }
- if (ts->lexical_prob == INFINITY)
- ts->lexical_prob = 0;
+ ts->fn = fn;
+ ts->lexical_prob = INFINITY;
return ts;
}
@@ -1731,6 +1745,34 @@ void pgf_parsing_print_counts(PgfParsing* ps)
}
#endif
+typedef struct {
+ PgfTokenState ts;
+ PgfToken tok;
+ PgfProductionIdx *lexicon_idx;
+} PgfRealTokenState;
+
+static bool
+pgf_real_match_token(PgfTokenState* ts, PgfToken tok, PgfItem* item)
+{
+ return gu_string_eq(gu_container(ts, PgfRealTokenState, ts)->tok, tok);
+}
+
+static PgfToken
+pgf_real_get_token(PgfTokenState* ts) {
+ return gu_container(ts, PgfRealTokenState, ts)->tok;
+}
+
+static PgfProductionIdx*
+pgf_real_get_lexicon_idx(PgfTokenState* ts) {
+ return gu_container(ts, PgfRealTokenState, ts)->lexicon_idx;
+}
+
+static PgfTokenFn pgf_tsfn_PgfRealTokenState = {
+ pgf_real_match_token,
+ pgf_real_get_token,
+ pgf_real_get_lexicon_idx
+};
+
PgfParseState*
pgf_parser_next_state(PgfParseState* prev, PgfToken tok)
{
@@ -1738,23 +1780,104 @@ pgf_parser_next_state(PgfParseState* prev, PgfToken tok)
pgf_parsing_print_counts(prev->ps);
#endif
- PgfTokenState* ts =
- pgf_new_token_state(prev->ps->concr,tok,prev->ps->pool);
+ PgfRealTokenState* ts =
+ pgf_new_token_state(PgfRealTokenState, prev->ps->pool);
+ ts->tok = tok;
+ ts->lexicon_idx = gu_map_get(prev->ps->concr->leftcorner_tok_idx,
+ &tok, PgfProductionIdx*);
+ if (ts->lexicon_idx != NULL) {
+ PgfLexiconFn clo = { { pgf_parser_compute_lexicon_prob }, &ts->ts };
+ gu_map_iter(ts->lexicon_idx, &clo.fn, NULL);
+ }
+ if (ts->ts.lexical_prob == INFINITY)
+ ts->ts.lexical_prob = 0;
+
PgfParseState* state =
- pgf_new_parse_state(prev->ps, prev, ts, prev->ps->pool);
+ pgf_new_parse_state(prev->ps, prev, &ts->ts, prev->ps->pool);
- state->ps->target = NULL;
- while (state->ps->target == NULL) {
+ while (gu_buf_length(state->agenda) == 0) {
if (!pgf_parsing_proceed(state))
- break;
+ return NULL;
+ }
+
+ return state;
+}
+
+typedef struct {
+ PgfTokenState ts;
+ GuEnum en;
+ GuString prefix;
+ PgfTokenProb* tp;
+ GuPool* pool;
+ PgfParseState* state;
+} PgfPrefixTokenState;
+
+static bool
+pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item)
+{
+ PgfPrefixTokenState* ts =
+ gu_container(ts0, PgfPrefixTokenState, ts);
+
+ if (gu_string_is_prefix(ts->prefix, tok)) {
+ ts->tp = gu_new(PgfTokenProb, ts->pool);
+ ts->tp->tok = tok;
+ ts->tp->prob = item->inside_prob+item->conts->outside_prob;
}
- if (state->ps->target != NULL) {
- return state;
- }
+ return false;
+}
+
+static PgfToken
+pgf_prefix_get_token(PgfTokenState* ts) {
+ return gu_empty_string;
+}
+
+static PgfProductionIdx*
+pgf_prefix_get_lexicon_idx(PgfTokenState* ts) {
return NULL;
}
+static PgfTokenFn pgf_tsfn_PgfPrefixTokenState = {
+ pgf_prefix_match_token,
+ pgf_prefix_get_token,
+ pgf_prefix_get_lexicon_idx
+};
+
+static void
+pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool)
+{
+ PgfPrefixTokenState* ts =
+ gu_container(self, PgfPrefixTokenState, en);
+
+ ts->tp = NULL;
+ ts->pool = pool;
+ while (ts->tp == NULL) {
+ if (!pgf_parsing_proceed(ts->state))
+ break;
+ }
+
+ *((PgfTokenProb**)to) = ts->tp;
+}
+
+GuEnum*
+pgf_parser_completions(PgfParseState* prev, GuString prefix,
+ GuPool* pool)
+{
+#ifdef PGF_COUNTS_DEBUG
+ pgf_parsing_print_counts(prev->ps);
+#endif
+
+ PgfPrefixTokenState* ts =
+ pgf_new_token_state(PgfPrefixTokenState, pool);
+ ts->en.next = pgf_parser_completions_next;
+ ts->prefix = prefix;
+ ts->tp = NULL;
+ ts->state =
+ pgf_new_parse_state(prev->ps, prev, &ts->ts, pool);
+
+ return &ts->en;
+}
+
static int
cmp_expr_state(GuOrder* self, const void* a, const void* b)
{
diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h
index 8c9839298..9fae0a565 100644
--- a/src/runtime/c/pgf/parser.h
+++ b/src/runtime/c/pgf/parser.h
@@ -66,6 +66,10 @@ pgf_parser_next_state(PgfParseState* prev, PgfToken tok);
* the pool used to create \parse.
*/
+GuEnum*
+pgf_parser_completions(PgfParseState* prev, GuString prefix,
+ GuPool* pool);
+
void
pgf_parser_set_beam_size(PgfParseState* state, double beam_size);
diff --git a/src/runtime/c/pgf/pgf.c b/src/runtime/c/pgf/pgf.c
index 39d3fcfbf..24d330981 100644
--- a/src/runtime/c/pgf/pgf.c
+++ b/src/runtime/c/pgf/pgf.c
@@ -236,6 +236,37 @@ pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
return pgf_parse_result(state, pool);
}
+GuEnum*
+pgf_get_completions(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
+ GuString prefix, GuPool* pool)
+{
+ // Begin parsing a sentence of the specified category
+ PgfParseState* state =
+ pgf_parser_init_state(concr, cat, 0, pool);
+ if (state == NULL) {
+ return NULL;
+ }
+
+ // Tokenization
+ GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool);
+ PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
+ while (!gu_exn_is_raised(lex_err)) {
+ // feed the token to get a new parse state
+ state = pgf_parser_next_state(state, tok);
+ if (state == NULL) {
+ return NULL;
+ }
+
+ tok = pgf_lexer_read_token(lexer, lex_err);
+ }
+
+ if (gu_exn_caught(lex_err) != gu_type(GuEOF))
+ return NULL;
+
+ // Now begin enumerating the resulting syntax trees
+ return pgf_parser_completions(state, prefix, pool);
+}
+
void
pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
{
diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h
index 03f1d4d48..39dc0dd04 100644
--- a/src/runtime/c/pgf/pgf.h
+++ b/src/runtime/c/pgf/pgf.h
@@ -116,6 +116,10 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuWriter* wtr, GuExn* err);
PgfExprEnum*
pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool);
+GuEnum*
+pgf_get_completions(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
+ GuString prefix, GuPool* pool);
+
bool
pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat,
double *precision, double *recall, double *exact);