diff options
Diffstat (limited to 'src/runtime/c/pgf/scanner.c')
| -rw-r--r-- | src/runtime/c/pgf/scanner.c | 435 |
1 files changed, 435 insertions, 0 deletions
diff --git a/src/runtime/c/pgf/scanner.c b/src/runtime/c/pgf/scanner.c new file mode 100644 index 000000000..71444ca25 --- /dev/null +++ b/src/runtime/c/pgf/scanner.c @@ -0,0 +1,435 @@ +#include <pgf/data.h> +#include <pgf/linearizer.h> +#include <gu/utf8.h> + +PGF_INTERNAL int +cmp_string(GuString* psent, size_t* ppos, GuString tok, + bool case_sensitive) +{ + for (;;) { + GuUCS c2 = gu_utf8_decode((const uint8_t**) &tok); + if (c2 == 0) + return 0; + + const uint8_t* p = (uint8_t*) *psent; + GuUCS c1 = gu_utf8_decode(&p); + if (c1 == 0) + return -1; + + if (!case_sensitive) + c1 = gu_ucs_to_lower(c1); + + if (c1 != c2) + return (c1-c2); + + *psent = (GuString) p; + (*ppos)++; + } +} + +PGF_INTERNAL bool +skip_space(GuString* psent, size_t* ppos) +{ + const uint8_t* p = (uint8_t*) *psent; + if (!gu_ucs_is_space(gu_utf8_decode(&p))) + return false; + + *psent = (GuString) p; + (*ppos)++; + return true; +} + +PGF_INTERNAL int +pgf_symbols_cmp(GuString* psent, size_t* ppos, + PgfSymbols* syms, size_t* sym_idx, + bool case_sensitive) +{ + size_t n_syms = gu_seq_length(syms); + while (*sym_idx < n_syms) { + PgfSymbol sym = gu_seq_get(syms, PgfSymbol, *sym_idx); + + if (*sym_idx > 0) { + if (!skip_space(psent,ppos)) { + if (**psent == 0) + return -1; + return 1; + } + + while (**psent != 0) { + if (!skip_space(psent,ppos)) + break; + } + } + + GuVariantInfo inf = gu_variant_open(sym); + switch (inf.tag) { + case PGF_SYMBOL_CAT: + case PGF_SYMBOL_LIT: + case PGF_SYMBOL_VAR: { + if (**psent == 0) + return -1; + return 1; + } + case PGF_SYMBOL_KS: { + PgfSymbolKS* pks = inf.data; + if (**psent == 0) + return -1; + + int cmp = cmp_string(psent,ppos,pks->token, case_sensitive); + if (cmp != 0) + return cmp; + break; + } + case PGF_SYMBOL_KP: + case PGF_SYMBOL_BIND: + case PGF_SYMBOL_NE: + case PGF_SYMBOL_SOFT_BIND: + case PGF_SYMBOL_SOFT_SPACE: + case PGF_SYMBOL_CAPIT: + case PGF_SYMBOL_ALL_CAPIT: { + return -1; + } + default: + gu_impossible(); + } + + (*sym_idx)++; + } + + return 0; +} + +static void +pgf_morpho_iter(PgfProductionIdx* idx, + PgfMorphoCallback* callback, + GuExn* err) +{ + size_t n_entries = gu_buf_length(idx); + for (size_t i = 0; i < n_entries; i++) { + PgfProductionIdxEntry* entry = + gu_buf_index(idx, PgfProductionIdxEntry, i); + + PgfCId lemma = entry->papp->fun->absfun->name; + GuString analysis = entry->ccat->cnccat->labels[entry->lin_idx]; + + prob_t prob = entry->ccat->cnccat->abscat->prob + + entry->papp->fun->absfun->ep.prob; + callback->callback(callback, + lemma, analysis, prob, err); + if (!gu_ok(err)) + return; + } +} + +typedef struct { + GuOrder order; + bool case_sensitive; +} PgfSequenceOrder; + +static int +pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2) +{ + PgfSequenceOrder* self = gu_container(order, PgfSequenceOrder, order); + + size_t pos = 0; + GuString sent = (GuString) p1; + + const PgfSequence* sp2 = p2; + + size_t sym_idx = 0; + int res = pgf_symbols_cmp(&sent, &pos, sp2->syms, &sym_idx, self->case_sensitive); + if (res == 0 && (*sent != 0 || sym_idx != gu_seq_length(sp2->syms))) { + res = 1; + } + + return res; +} + +PGF_API void +pgf_lookup_morpho(PgfConcr *concr, GuString sentence, + PgfMorphoCallback* callback, GuExn* err) +{ + if (concr->sequences == NULL) { + GuExnData* err_data = gu_raise(err, PgfExn); + if (err_data) { + err_data->data = "The concrete syntax is not loaded"; + return; + } + } + + bool case_sensitive = + (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); + + PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive }; + PgfSequence* seq = (PgfSequence*) + gu_seq_binsearch(concr->sequences, &order.order, + PgfSequence, (void*) sentence); + + if (seq != NULL && seq->idx != NULL) + pgf_morpho_iter(seq->idx, callback, err); +} + +typedef struct { + GuEnum en; + PgfConcr* concr; + GuString sentence; + GuString current; + size_t len; + PgfMorphoCallback* callback; + GuExn* err; + bool case_sensitive; + GuBuf* spots; + GuBuf* found; +} PgfCohortsState; + +static int +cmp_cohort_spot(GuOrder* self, const void* a, const void* b) +{ + PgfCohortSpot *s1 = (PgfCohortSpot *) a; + PgfCohortSpot *s2 = (PgfCohortSpot *) b; + + return (s1->ptr-s2->ptr); +} + +static GuOrder +pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }}; + +static void +pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot, + int i, int j, ptrdiff_t min, ptrdiff_t max) +{ + // This is a variation of a binary search algorithm which + // can retrieve all prefixes of a string with minimal + // comparisons, i.e. there is no need to lookup every + // prefix separately. + + while (i <= j) { + int k = (i+j) / 2; + PgfSequence* seq = gu_seq_index(state->concr->sequences, PgfSequence, k); + + PgfCohortSpot current = *spot; + + size_t sym_idx = 0; + int cmp = pgf_symbols_cmp(¤t.ptr, ¤t.pos, seq->syms, &sym_idx, state->case_sensitive); + if (cmp < 0) { + j = k-1; + } else if (cmp > 0) { + ptrdiff_t len = current.ptr - spot->ptr; + + if (min <= len) + pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len); + + if (len+1 <= max) + pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max); + + break; + } else { + ptrdiff_t len = current.ptr - spot->ptr; + + if (min <= len-1) + pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len-1); + + if (seq->idx != NULL) { + PgfCohortRange* range = gu_buf_insert(state->found, 0); + range->start = *spot; + range->end = current; + range->buf = seq->idx; + + while (*current.ptr != 0) { + if (!skip_space(¤t.ptr, ¤t.pos)) + break; + } + + gu_buf_heap_push(state->spots, pgf_cohort_spot_order, ¤t); + } + + if (len+1 <= max) + pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max); + + break; + } + } +} + +static void +pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool) +{ + PgfCohortsState* state = gu_container(self, PgfCohortsState, en); + + while (gu_buf_length(state->found) == 0 && + gu_buf_length(state->spots) > 0) { + PgfCohortSpot spot; + gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot); + + if (spot.ptr == state->current) + continue; + + if (*spot.ptr == 0) + break; + + pgf_lookup_cohorts_helper + (state, &spot, + 0, gu_seq_length(state->concr->sequences)-1, + 1, (state->sentence+state->len)-spot.ptr); + + if (gu_buf_length(state->found) == 0) { + gu_utf8_decode((const uint8_t**) &spot.ptr); + spot.pos++; + gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot); + } + } + + PgfCohortRange* pRes = (PgfCohortRange*)to; + + if (gu_buf_length(state->found) == 0) { + pRes->start.pos = 0; + pRes->start.ptr = NULL; + pRes->end.pos = 0; + pRes->end.ptr = NULL; + pRes->buf = NULL; + state->current = NULL; + return; + } else { + *pRes = gu_buf_pop(state->found, PgfCohortRange); + state->current = pRes->start.ptr; + pgf_morpho_iter(pRes->buf, state->callback, state->err); + } +} + +PGF_API GuEnum* +pgf_lookup_cohorts(PgfConcr *concr, GuString sentence, + PgfMorphoCallback* callback, + GuPool* pool, GuExn* err) +{ + if (concr->sequences == NULL) { + GuExnData* err_data = gu_raise(err, PgfExn); + if (err_data) { + err_data->data = "The concrete syntax is not loaded"; + return NULL; + } + } + + bool case_sensitive = + (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); + + PgfCohortsState* state = gu_new(PgfCohortsState, pool); + state->en.next = pgf_lookup_cohorts_enum_next; + state->concr = concr; + state->sentence= sentence; + state->len = strlen(sentence); + state->callback= callback; + state->err = err; + state->case_sensitive = case_sensitive; + state->spots = gu_new_buf(PgfCohortSpot, pool); + state->found = gu_new_buf(PgfCohortRange, pool); + + PgfCohortSpot spot = {0,sentence}; + while (*spot.ptr != 0) { + if (!skip_space(&spot.ptr, &spot.pos)) + break; + } + + gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot); + + return &state->en; +} + +typedef struct { + GuEnum en; + PgfSequences* sequences; + GuString prefix; + size_t seq_idx; +} PgfFullFormState; + +struct PgfFullFormEntry { + GuString tokens; + PgfProductionIdx* idx; +}; + +static void +gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool) +{ + PgfFullFormState* st = gu_container(self, PgfFullFormState, en); + PgfFullFormEntry* entry = NULL; + + if (st->sequences != NULL) { + size_t n_seqs = gu_seq_length(st->sequences); + while (st->seq_idx < n_seqs) { + PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx); + GuString tokens = pgf_get_tokens(seq->syms, 0, pool); + + if (!gu_string_is_prefix(st->prefix, tokens)) { + st->seq_idx = n_seqs; + break; + } + + if (*tokens != 0 && seq->idx != NULL) { + entry = gu_new(PgfFullFormEntry, pool); + entry->tokens = tokens; + entry->idx = seq->idx; + + st->seq_idx++; + break; + } + + st->seq_idx++; + } + } + + *((PgfFullFormEntry**) to) = entry; +} + +PGF_API GuEnum* +pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool) +{ + PgfFullFormState* st = gu_new(PgfFullFormState, pool); + st->en.next = gu_fullform_enum_next; + st->sequences = concr->sequences; + st->prefix = ""; + st->seq_idx = 0; + return &st->en; +} + +PGF_API GuString +pgf_fullform_get_string(PgfFullFormEntry* entry) +{ + return entry->tokens; +} + +PGF_API void +pgf_fullform_get_analyses(PgfFullFormEntry* entry, + PgfMorphoCallback* callback, GuExn* err) +{ + pgf_morpho_iter(entry->idx, callback, err); +} + +PGF_API GuEnum* +pgf_lookup_word_prefix(PgfConcr *concr, GuString prefix, + GuPool* pool, GuExn* err) +{ + if (concr->sequences == NULL) { + GuExnData* err_data = gu_raise(err, PgfExn); + if (err_data) { + err_data->data = "The concrete syntax is not loaded"; + return NULL; + } + } + + PgfFullFormState* state = gu_new(PgfFullFormState, pool); + state->en.next = gu_fullform_enum_next; + state->sequences = concr->sequences; + state->prefix = prefix; + state->seq_idx = 0; + + bool case_sensitive = + (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); + + PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive }; + if (!gu_seq_binsearch_index(concr->sequences, &order.order, + PgfSequence, (void*) prefix, + &state->seq_idx)) { + state->seq_idx++; + } + + return &state->en; +} |
