diff options
Diffstat (limited to 'src/runtime/c')
| -rw-r--r-- | src/runtime/c/pgf/pgf.h | 4 | ||||
| -rw-r--r-- | src/runtime/c/pgf/scanner.c | 107 |
2 files changed, 77 insertions, 34 deletions
diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index 5dbe2e2e1..6ff269e00 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -171,8 +171,8 @@ pgf_lookup_morpho(PgfConcr *concr, GuString sentence, PgfMorphoCallback* callback, GuExn* err); typedef struct { - size_t pos; - GuString ptr; + size_t pos; // position in Unicode characters + GuString ptr; // pointer into the string } PgfCohortSpot; typedef struct { diff --git a/src/runtime/c/pgf/scanner.c b/src/runtime/c/pgf/scanner.c index ad3605edc..0b2f9680f 100644 --- a/src/runtime/c/pgf/scanner.c +++ b/src/runtime/c/pgf/scanner.c @@ -233,12 +233,13 @@ typedef struct { GuEnum en; PgfConcr* concr; GuString sentence; - GuString current; size_t len; PgfMorphoCallback* callback; GuExn* err; bool case_sensitive; GuBuf* spots; + GuBuf* skip_spots; + GuBuf* empty_buf; GuBuf* found; } PgfCohortsState; @@ -255,6 +256,29 @@ static GuOrder pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }}; static void +pgf_lookup_cohorts_report_skip(PgfCohortsState *state, + PgfCohortSpot* spot, GuString msg) +{ + PgfCohortSpot end_spot = *spot; + while (gu_ucs_is_space(*(end_spot.ptr-1))) { + end_spot.pos--; + end_spot.ptr--; + } + + size_t n_spots = gu_buf_length(state->skip_spots); + for (size_t i = 0; i < n_spots; i++) { + PgfCohortSpot* skip_spot = + gu_buf_index(state->skip_spots, PgfCohortSpot, i); + + PgfCohortRange* range = gu_buf_insert(state->found, 0); + range->start = *skip_spot; + range->end = end_spot; + range->buf = state->empty_buf; + } + gu_buf_flush(state->skip_spots); +} + +static void pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot, int i, int j, ptrdiff_t min, ptrdiff_t max) { @@ -290,18 +314,23 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot, pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len); if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) { + // Report unknown words + pgf_lookup_cohorts_report_skip(state, spot, "a"); + + // Report 
the actual hit PgfCohortRange* range = gu_buf_insert(state->found, 0); range->start = *spot; range->end = current; range->buf = seq->idx; - } - while (*current.ptr != 0) { - if (!skip_space(&current.ptr, &current.pos)) - break; - } + // Schedule the next search spot + while (*current.ptr != 0) { + if (!skip_space(&current.ptr, &current.pos)) + break; + } - gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current); + gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current); + } if (len <= max) pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max); @@ -317,29 +346,45 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool) PgfCohortsState* state = gu_container(self, PgfCohortsState, en); while (gu_buf_length(state->found) == 0 && - gu_buf_length(state->spots) > 0) { + gu_buf_length(state->spots) > 0) { PgfCohortSpot spot; gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot); - if (spot.ptr == state->current) - continue; + GuString next_ptr = state->sentence+state->len; + while (gu_buf_length(state->spots) > 0) { + GuString ptr = + gu_buf_index(state->spots, PgfCohortSpot, 0)->ptr; + if (ptr > spot.ptr) { + next_ptr = ptr; + break; + } + gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot); + } - if (*spot.ptr == 0) - break; + bool needs_report = true; + while (next_ptr > spot.ptr) { + pgf_lookup_cohorts_helper + (state, &spot, + 0, gu_seq_length(state->concr->sequences)-1, + 1, (state->sentence+state->len)-spot.ptr); + + if (gu_buf_length(state->found) > 0) + break; + + if (needs_report) { + gu_buf_push(state->skip_spots, PgfCohortSpot, spot); + needs_report = false; + } - pgf_lookup_cohorts_helper - (state, &spot, - 0, gu_seq_length(state->concr->sequences)-1, - 1, (state->sentence+state->len)-spot.ptr); - - if (gu_buf_length(state->found) == 0) { // skip one character and try again gu_utf8_decode((const uint8_t**) &spot.ptr); spot.pos++; - gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot); } } + PgfCohortSpot end_spot = {state->len, 
state->sentence+state->len}; + pgf_lookup_cohorts_report_skip(state, &end_spot, "b"); + PgfCohortRange* pRes = (PgfCohortRange*)to; if (gu_buf_length(state->found) == 0) { @@ -348,15 +393,11 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool) pRes->end.pos = 0; pRes->end.ptr = NULL; pRes->buf = NULL; - state->current = NULL; - return; } else do { *pRes = gu_buf_pop(state->found, PgfCohortRange); - state->current = pRes->start.ptr; pgf_morpho_iter(pRes->buf, state->callback, state->err); } while (gu_buf_length(state->found) > 0 && gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr); - } PGF_API GuEnum* @@ -373,15 +414,17 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence, } PgfCohortsState* state = gu_new(PgfCohortsState, pool); - state->en.next = pgf_lookup_cohorts_enum_next; - state->concr = concr; - state->sentence= sentence; - state->len = strlen(sentence); - state->callback= callback; - state->err = err; - state->case_sensitive = pgf_is_case_sensitive(concr); - state->spots = gu_new_buf(PgfCohortSpot, pool); - state->found = gu_new_buf(PgfCohortRange, pool); + state->en.next = pgf_lookup_cohorts_enum_next; + state->concr = concr; + state->sentence = sentence; + state->len = strlen(sentence); + state->callback = callback; + state->err = err; + state->case_sensitive= pgf_is_case_sensitive(concr); + state->spots = gu_new_buf(PgfCohortSpot, pool); + state->skip_spots = gu_new_buf(PgfCohortSpot, pool); + state->empty_buf = gu_new_buf(PgfProductionIdxEntry, pool); + state->found = gu_new_buf(PgfCohortRange, pool); PgfCohortSpot spot = {0,sentence}; while (*spot.ptr != 0) { |
