#include #include #include PGF_INTERNAL int cmp_string(GuString* psent, size_t* ppos, GuString tok, bool case_sensitive) { for (;;) { GuUCS c2 = gu_utf8_decode((const uint8_t**) &tok); if (c2 == 0) return 0; const uint8_t* p = (uint8_t*) *psent; GuUCS c1 = gu_utf8_decode(&p); if (c1 == 0) return -1; if (!case_sensitive) c1 = gu_ucs_to_lower(c1); if (c1 != c2) return (c1-c2); *psent = (GuString) p; (*ppos)++; } } PGF_INTERNAL bool skip_space(GuString* psent, size_t* ppos) { const uint8_t* p = (uint8_t*) *psent; if (!gu_ucs_is_space(gu_utf8_decode(&p))) return false; *psent = (GuString) p; (*ppos)++; return true; } PGF_INTERNAL int pgf_symbols_cmp(GuString* psent, size_t* ppos, PgfSymbols* syms, size_t* sym_idx, bool case_sensitive) { size_t n_syms = gu_seq_length(syms); while (*sym_idx < n_syms) { PgfSymbol sym = gu_seq_get(syms, PgfSymbol, *sym_idx); if (*sym_idx > 0) { if (!skip_space(psent,ppos)) { if (**psent == 0) return -1; return 1; } while (**psent != 0) { if (!skip_space(psent,ppos)) break; } } GuVariantInfo inf = gu_variant_open(sym); switch (inf.tag) { case PGF_SYMBOL_CAT: case PGF_SYMBOL_LIT: case PGF_SYMBOL_VAR: { if (**psent == 0) return -1; return 1; } case PGF_SYMBOL_KS: { PgfSymbolKS* pks = inf.data; if (**psent == 0) return -1; int cmp = cmp_string(psent,ppos,pks->token, case_sensitive); if (cmp != 0) return cmp; break; } case PGF_SYMBOL_KP: case PGF_SYMBOL_BIND: case PGF_SYMBOL_NE: case PGF_SYMBOL_SOFT_BIND: case PGF_SYMBOL_SOFT_SPACE: case PGF_SYMBOL_CAPIT: case PGF_SYMBOL_ALL_CAPIT: { return -1; } default: gu_impossible(); } (*sym_idx)++; } return 0; } static void pgf_morpho_iter(PgfProductionIdx* idx, PgfMorphoCallback* callback, GuExn* err) { size_t n_entries = gu_buf_length(idx); for (size_t i = 0; i < n_entries; i++) { PgfProductionIdxEntry* entry = gu_buf_index(idx, PgfProductionIdxEntry, i); PgfCId lemma = entry->papp->fun->absfun->name; GuString analysis = entry->ccat->cnccat->labels[entry->lin_idx]; prob_t prob = entry->ccat->cnccat->abscat->prob + entry->papp->fun->absfun->ep.prob; callback->callback(callback, lemma, analysis, prob, err); if (!gu_ok(err)) return; } } typedef struct { GuOrder order; bool case_sensitive; } PgfSequenceOrder; static int pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2) { PgfSequenceOrder* self = gu_container(order, PgfSequenceOrder, order); size_t pos = 0; GuString sent = (GuString) p1; const PgfSequence* sp2 = p2; size_t sym_idx = 0; int res = pgf_symbols_cmp(&sent, &pos, sp2->syms, &sym_idx, self->case_sensitive); if (res == 0 && (*sent != 0 || sym_idx != gu_seq_length(sp2->syms))) { res = 1; } return res; } PGF_API void pgf_lookup_morpho(PgfConcr *concr, GuString sentence, PgfMorphoCallback* callback, GuExn* err) { if (concr->sequences == NULL) { GuExnData* err_data = gu_raise(err, PgfExn); if (err_data) { err_data->data = "The concrete syntax is not loaded"; return; } } bool case_sensitive = (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive }; PgfSequence* seq = (PgfSequence*) gu_seq_binsearch(concr->sequences, &order.order, PgfSequence, (void*) sentence); if (seq != NULL && seq->idx != NULL) pgf_morpho_iter(seq->idx, callback, err); } typedef struct { GuEnum en; PgfConcr* concr; GuString sentence; GuString current; size_t len; PgfMorphoCallback* callback; GuExn* err; bool case_sensitive; GuBuf* spots; GuBuf* found; } PgfCohortsState; static int cmp_cohort_spot(GuOrder* self, const void* a, const void* b) { PgfCohortSpot *s1 = (PgfCohortSpot *) a; PgfCohortSpot *s2 = (PgfCohortSpot *) b; return (s1->ptr-s2->ptr); } static GuOrder pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }}; static void pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot, int i, int j, ptrdiff_t min, ptrdiff_t max) { // This is a variation of a binary search algorithm which // can retrieve all prefixes of a string with minimal // comparisons, i.e. there is no need to lookup every // prefix separately. while (i <= j) { int k = (i+j) / 2; PgfSequence* seq = gu_seq_index(state->concr->sequences, PgfSequence, k); PgfCohortSpot current = *spot; size_t sym_idx = 0; int cmp = pgf_symbols_cmp(¤t.ptr, ¤t.pos, seq->syms, &sym_idx, state->case_sensitive); if (cmp < 0) { j = k-1; } else if (cmp > 0) { ptrdiff_t len = current.ptr - spot->ptr; if (min <= len) pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len); if (len+1 <= max) pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max); break; } else { ptrdiff_t len = current.ptr - spot->ptr; if (min <= len-1) pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len-1); if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) { PgfCohortRange* range = gu_buf_insert(state->found, 0); range->start = *spot; range->end = current; range->buf = seq->idx; } while (*current.ptr != 0) { if (!skip_space(¤t.ptr, ¤t.pos)) break; } gu_buf_heap_push(state->spots, pgf_cohort_spot_order, ¤t); if (len+1 <= max) pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max); break; } } } static void pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool) { PgfCohortsState* state = gu_container(self, PgfCohortsState, en); while (gu_buf_length(state->found) == 0 && gu_buf_length(state->spots) > 0) { PgfCohortSpot spot; gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot); if (spot.ptr == state->current) continue; if (*spot.ptr == 0) break; pgf_lookup_cohorts_helper (state, &spot, 0, gu_seq_length(state->concr->sequences)-1, 1, (state->sentence+state->len)-spot.ptr); if (gu_buf_length(state->found) == 0) { gu_utf8_decode((const uint8_t**) &spot.ptr); spot.pos++; gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot); } } PgfCohortRange* pRes = (PgfCohortRange*)to; if (gu_buf_length(state->found) == 0) { pRes->start.pos = 0; pRes->start.ptr = NULL; pRes->end.pos = 0; pRes->end.ptr = NULL; pRes->buf = NULL; state->current = NULL; return; } else { *pRes = gu_buf_pop(state->found, PgfCohortRange); state->current = pRes->start.ptr; pgf_morpho_iter(pRes->buf, state->callback, state->err); } } PGF_API GuEnum* pgf_lookup_cohorts(PgfConcr *concr, GuString sentence, PgfMorphoCallback* callback, GuPool* pool, GuExn* err) { if (concr->sequences == NULL) { GuExnData* err_data = gu_raise(err, PgfExn); if (err_data) { err_data->data = "The concrete syntax is not loaded"; return NULL; } } bool case_sensitive = (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); PgfCohortsState* state = gu_new(PgfCohortsState, pool); state->en.next = pgf_lookup_cohorts_enum_next; state->concr = concr; state->sentence= sentence; state->len = strlen(sentence); state->callback= callback; state->err = err; state->case_sensitive = case_sensitive; state->spots = gu_new_buf(PgfCohortSpot, pool); state->found = gu_new_buf(PgfCohortRange, pool); PgfCohortSpot spot = {0,sentence}; while (*spot.ptr != 0) { if (!skip_space(&spot.ptr, &spot.pos)) break; } gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot); return &state->en; } typedef struct { GuEnum en; PgfSequences* sequences; GuString prefix; size_t seq_idx; } PgfFullFormState; struct PgfFullFormEntry { GuString tokens; PgfProductionIdx* idx; }; static void gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool) { PgfFullFormState* st = gu_container(self, PgfFullFormState, en); PgfFullFormEntry* entry = NULL; if (st->sequences != NULL) { size_t n_seqs = gu_seq_length(st->sequences); while (st->seq_idx < n_seqs) { PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx); GuString tokens = pgf_get_tokens(seq->syms, 0, pool); if (!gu_string_is_prefix(st->prefix, tokens)) { st->seq_idx = n_seqs; break; } if (*tokens != 0 && seq->idx != NULL) { entry = gu_new(PgfFullFormEntry, pool); entry->tokens = tokens; entry->idx = seq->idx; st->seq_idx++; break; } st->seq_idx++; } } *((PgfFullFormEntry**) to) = entry; } PGF_API GuEnum* pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool) { PgfFullFormState* st = gu_new(PgfFullFormState, pool); st->en.next = gu_fullform_enum_next; st->sequences = concr->sequences; st->prefix = ""; st->seq_idx = 0; return &st->en; } PGF_API GuString pgf_fullform_get_string(PgfFullFormEntry* entry) { return entry->tokens; } PGF_API void pgf_fullform_get_analyses(PgfFullFormEntry* entry, PgfMorphoCallback* callback, GuExn* err) { pgf_morpho_iter(entry->idx, callback, err); } PGF_API GuEnum* pgf_lookup_word_prefix(PgfConcr *concr, GuString prefix, GuPool* pool, GuExn* err) { if (concr->sequences == NULL) { GuExnData* err_data = gu_raise(err, PgfExn); if (err_data) { err_data->data = "The concrete syntax is not loaded"; return NULL; } } PgfFullFormState* state = gu_new(PgfFullFormState, pool); state->en.next = gu_fullform_enum_next; state->sequences = concr->sequences; state->prefix = prefix; state->seq_idx = 0; bool case_sensitive = (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive }; if (!gu_seq_binsearch_index(concr->sequences, &order.order, PgfSequence, (void*) prefix, &state->seq_idx)) { state->seq_idx++; } return &state->en; }