summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorkr.angelov <kr.angelov@gmail.com>2013-08-19 12:16:30 +0000
committerkr.angelov <kr.angelov@gmail.com>2013-08-19 12:16:30 +0000
commite9eeefa6f05babcf9b9e2ca471797368672f806c (patch)
treefd70669dd4fb4310ec619e39b8f82e7d90f32e13 /src
parent308e1445ca5396327d9e78d68cc45e8e008a4876 (diff)
fullFormLexicon in C and Python
Diffstat (limited to 'src')
-rw-r--r--src/runtime/c/pgf/parser.c206
-rw-r--r--src/runtime/c/pgf/pgf.h14
-rw-r--r--src/runtime/python/pypgf.c72
3 files changed, 261 insertions, 31 deletions
diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c
index ac863db0a..f792fdaa8 100644
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -2339,40 +2339,42 @@ pgf_morpho_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
- // match the tokens with the production
- size_t pos = 0;
- PgfSequence seq = papp->fun->lins[cfc.lin_idx];
- size_t len = gu_seq_length(seq);
- for (size_t i = 0; i < len; i++) {
- PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
-
- GuVariantInfo i = gu_variant_open(sym);
- switch (i.tag) {
- case PGF_SYMBOL_KS: {
- PgfSymbolKS* symks = i.data;
- size_t len = gu_seq_length(symks->tokens);
- for (size_t i = 0; i < len; i++) {
- if (pos >= gu_seq_length(clo->tokens))
- goto cont;
-
- PgfToken tok1 = gu_seq_get(symks->tokens, PgfToken, i);
- PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++);
-
- if (!gu_string_eq(tok1, tok2))
- goto cont;
+ if (!gu_seq_is_null(clo->tokens)) {
+ // match the tokens with the production
+ size_t pos = 0;
+ PgfSequence seq = papp->fun->lins[cfc.lin_idx];
+ size_t len = gu_seq_length(seq);
+ for (size_t i = 0; i < len; i++) {
+ PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
+
+ GuVariantInfo i = gu_variant_open(sym);
+ switch (i.tag) {
+ case PGF_SYMBOL_KS: {
+ PgfSymbolKS* symks = i.data;
+ size_t len = gu_seq_length(symks->tokens);
+ for (size_t i = 0; i < len; i++) {
+ if (pos >= gu_seq_length(clo->tokens))
+ goto cont;
+
+ PgfToken tok1 = gu_seq_get(symks->tokens, PgfToken, i);
+ PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++);
+
+ if (!gu_string_eq(tok1, tok2))
+ goto cont;
+ }
+ }
+ default:
+ continue;
}
}
- default:
- continue;
- }
+
+ if (pos != gu_seq_length(clo->tokens))
+ goto cont;
}
-
- if (pos != gu_seq_length(clo->tokens))
- goto cont;
PgfCId lemma = papp->fun->absfun->name;
prob_t prob = papp->fun->absfun->ep.prob;
- clo->callback->callback(clo->callback, clo->tokens,
+ clo->callback->callback(clo->callback,
lemma, analysis, prob, err);
}
}
@@ -2410,10 +2412,156 @@ pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer,
PgfMorphoFn clo = { { pgf_morpho_iter }, gu_buf_seq(tokens), callback };
gu_map_iter(lexicon_idx, &clo.fn, err);
-
+
gu_pool_free(tmp_pool);
}
+// State of the full-form lexicon enumeration created by
+// pgf_fullform_lexicon() and advanced by gu_fullform_enum_next().
+typedef struct {
+ // Enumeration handle returned to the caller; the enclosing state
+ // is recovered from it with gu_container().
+ GuEnum en;
+ // Enumeration over concr->leftcorner_tok_idx.
+ GuEnum* map_en1;
+ // Enumeration over new_idx; NULL until an index has been built.
+ GuEnum* map_en2;
+
+ // Map iterator whose callback is set to pgf_fullform_iter.
+ GuMapItor fn;
+ // Index keyed by the joined token string, filled by
+ // pgf_fullform_iter; NULL means a new one must be built.
+ PgfLeftcornerTokIdx* new_idx;
+
+ // Pool in which new_idx and the data stored in it are allocated.
+ GuPool* pool;
+} PgfFullFormState;
+
+// Map-iterator callback used while re-indexing one entry of the
+// left-corner token index.
+//
+// `key` points to a PgfCFCat and `value` to a PgfProductionSeq. For
+// every PGF_PRODUCTION_APPLY production in the sequence, the tokens of
+// all PGF_SYMBOL_KS symbols in the linearization selected by
+// cfc.lin_idx are joined with single spaces, and the production is
+// filed in st->new_idx under that string and under the same category.
+// Productions with other tags and symbols with other tags are skipped.
+//
+// The `err` argument is not used (gu_fullform_enum_next passes NULL);
+// errors raised while writing the tokens go to a scratch exception
+// object and are ignored.
+static void
+pgf_fullform_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
+{
+    (void) err;
+
+    PgfFullFormState* st = gu_container(fn, PgfFullFormState, fn);
+    PgfCFCat cfc = *((PgfCFCat*) key);
+    PgfProductionSeq prods = *((PgfProductionSeq*) value);
+
+    if (gu_seq_is_null(prods))
+        return;
+
+    size_t n_prods = gu_seq_length(prods);
+    for (size_t i = 0; i < n_prods; i++) {
+        PgfProduction prod =
+            gu_seq_get(prods, PgfProduction, i);
+
+        GuVariantInfo prod_info = gu_variant_open(prod);
+        if (prod_info.tag != PGF_PRODUCTION_APPLY)
+            continue;
+
+        PgfProductionApply* papp = prod_info.data;
+
+        GuPool* tmp_pool = gu_new_pool();
+        GuExn* tmp_err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
+        GuStringBuf* sbuf = gu_string_buf(tmp_pool);
+        GuWriter* wtr = gu_string_buf_writer(sbuf);
+
+        // collect the tokens in the production
+        PgfSequence seq = papp->fun->lins[cfc.lin_idx];
+        size_t n_syms = gu_seq_length(seq);
+        size_t n_written = 0;
+        for (size_t j = 0; j < n_syms; j++) {
+            PgfSymbol sym = gu_seq_get(seq, PgfSymbol, j);
+
+            GuVariantInfo sym_info = gu_variant_open(sym);
+            switch (sym_info.tag) {
+            case PGF_SYMBOL_KS: {
+                PgfSymbolKS* symks = sym_info.data;
+                size_t n_toks = gu_seq_length(symks->tokens);
+                for (size_t k = 0; k < n_toks; k++) {
+                    // separate every token from the previous one,
+                    // also across symbol boundaries
+                    if (n_written > 0) {
+                        gu_putc(' ', wtr, tmp_err);
+                    }
+
+                    PgfToken tok = gu_seq_get(symks->tokens, PgfToken, k);
+                    gu_string_write(tok, wtr, tmp_err);
+                    n_written++;
+                }
+                break;
+            }
+            default:
+                break;
+            }
+        }
+
+        // the frozen string is allocated in st->pool, so the scratch
+        // pool can be released right away
+        GuString tokens = gu_string_buf_freeze(sbuf, st->pool);
+        gu_pool_free(tmp_pool);
+
+        // create a new production index with keys that
+        // are multiword units
+        PgfProductionIdx* lexicon_idx =
+            gu_map_get(st->new_idx, &tokens, PgfProductionIdx*);
+        if (lexicon_idx == NULL) {
+            lexicon_idx = gu_map_type_new(PgfProductionIdx, st->pool);
+            gu_map_put(st->new_idx, &tokens, PgfProductionIdx*, lexicon_idx);
+        }
+
+        PgfProductionSeq bucket =
+            gu_map_get(lexicon_idx, &cfc, PgfProductionSeq);
+        if (gu_seq_is_null(bucket)) {
+            bucket = gu_buf_seq(gu_new_buf(PgfProduction, st->pool));
+            gu_map_put(lexicon_idx, &cfc, PgfProductionSeq, bucket);
+        }
+
+        gu_buf_push(gu_seq_buf(bucket), PgfProduction, prod);
+    }
+}
+
+// "next" function of the full-form lexicon enumeration.
+//
+// Stores the next PgfFullFormEntry* in `*to`, or NULL once the
+// enumeration over the left-corner token index is exhausted. The
+// entries come from a temporary index (st->new_idx) that is rebuilt,
+// in `pool`, for each entry of the left-corner token index.
+static void
+gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
+{
+    PgfFullFormState* st = gu_container(self, PgfFullFormState, en);
+    PgfFullFormEntry** out = to;
+
+    for (;;) {
+        if (st->new_idx != NULL) {
+            // drain the index built for the current first token
+            PgfFullFormEntry* entry =
+                gu_next(st->map_en2, PgfFullFormEntry*, pool);
+            if (entry != NULL) {
+                *out = entry;
+                return;
+            }
+
+            st->new_idx = NULL;
+        }
+
+        // advance to the next entry of the left-corner token index
+        GuMapKeyValue* kv = gu_next(st->map_en1, GuMapKeyValue*, pool);
+        if (kv == NULL) {
+            *out = NULL;
+            return;
+        }
+
+        PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) kv->value);
+
+        // we have an index by the first token but we must re-index
+        // by taking into account the multiword units
+        st->pool = pool;
+        st->new_idx = gu_map_type_new(PgfLeftcornerTokIdx, pool);
+        st->fn.fn = pgf_fullform_iter;
+        gu_map_iter(lexicon_idx, &st->fn, NULL);
+
+        st->map_en2 = gu_map_enum(st->new_idx, pool);
+    }
+}
+
+// Creates, in `pool`, an enumeration over the full-form lexicon of
+// `concr`. The enumeration walks concr->leftcorner_tok_idx and yields
+// PgfFullFormEntry* values through gu_fullform_enum_next.
+GuEnum*
+pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool)
+{
+    PgfFullFormState* state = gu_new(PgfFullFormState, pool);
+
+    // nothing has been re-indexed yet
+    state->pool = NULL;
+    state->new_idx = NULL;
+    state->map_en2 = NULL;
+
+    state->map_en1 = gu_map_enum(concr->leftcorner_tok_idx, pool);
+    state->en.next = gu_fullform_enum_next;
+
+    return &state->en;
+}
+
+// Returns the string stored as the key of a full-form lexicon entry.
+GuString
+pgf_fullform_get_string(PgfFullFormEntry* entry)
+{
+    GuString* form = (GuString*) entry->key;
+    return *form;
+}
+
+// Runs pgf_morpho_iter over the production index stored as the value
+// of `entry`, reporting through `callback`. The token sequence passed
+// to the iterator is gu_null_seq.
+void
+pgf_fullform_get_analyses(PgfFullFormEntry* entry,
+                          PgfMorphoCallback* callback, GuExn* err)
+{
+    PgfProductionIdx** idx_slot = (PgfProductionIdx**) entry->value;
+    PgfMorphoFn clo = { { pgf_morpho_iter }, gu_null_seq, callback };
+
+    gu_map_iter(*idx_slot, &clo.fn, err);
+}
+
static void
pgf_parser_leftcorner_add_token(PgfConcr* concr,
PgfTokens tokens, PgfItem* item,
diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h
index 40ae1ca45..51998c7b8 100644
--- a/src/runtime/c/pgf/pgf.h
+++ b/src/runtime/c/pgf/pgf.h
@@ -126,7 +126,7 @@ pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
typedef struct PgfMorphoCallback PgfMorphoCallback;
struct PgfMorphoCallback {
- void (*callback)(PgfMorphoCallback* self, PgfTokens tokens,
+ void (*callback)(PgfMorphoCallback* self,
PgfCId lemma, GuString analysis, prob_t prob,
GuExn* err);
};
@@ -135,6 +135,18 @@ void
pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer,
PgfMorphoCallback* callback, GuExn* err);
+// One entry of the full-form lexicon; an alias for a map key/value
+// pair. Use the accessors below to read it.
+typedef GuMapKeyValue PgfFullFormEntry;
+
+// Returns an enumeration, allocated in `pool`, over the full-form
+// lexicon of `concr`. Each step yields a PgfFullFormEntry*, and NULL
+// when the enumeration is exhausted.
+GuEnum*
+pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool);
+
+// Returns the string that is the key of `entry`.
+GuString
+pgf_fullform_get_string(PgfFullFormEntry* entry);
+
+// Reports the analyses stored for `entry` through `callback`.
+void
+pgf_fullform_get_analyses(PgfFullFormEntry* entry,
+ PgfMorphoCallback* callback, GuExn* err);
+
PgfExprEnum*
pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
double heuristics,
diff --git a/src/runtime/python/pypgf.c b/src/runtime/python/pypgf.c
index 5b4ae9193..eb6725050 100644
--- a/src/runtime/python/pypgf.c
+++ b/src/runtime/python/pypgf.c
@@ -1690,7 +1690,7 @@ typedef struct {
} PyMorphoCallback;
static void
-pypgf_collect_morpho(PgfMorphoCallback* self, PgfTokens tokens,
+pypgf_collect_morpho(PgfMorphoCallback* self,
PgfCId lemma, GuString analysis, prob_t prob,
GuExn* err)
{
@@ -1765,6 +1765,73 @@ Concr_lookupMorpho(ConcrObject* self, PyObject *args, PyObject *keywds) {
return analyses;
}
+// Fetch function for the full-form lexicon iterator.
+//
+// Takes the next PgfFullFormEntry from self->res and returns the value
+// built by Py_BuildValue("OO", ...) from the entry's string and the
+// list filled by pypgf_collect_morpho. Returns NULL when the
+// enumeration is exhausted or when any step fails.
+PyObject*
+Iter_fetch_fullform(IterObject* self)
+{
+    PgfFullFormEntry* entry =
+        gu_next(self->res, PgfFullFormEntry*, self->pool);
+    if (entry == NULL)
+        return NULL;
+
+    PyObject* res = NULL;
+    PyObject* py_tokens = NULL;
+    PyObject* py_analyses = NULL;
+
+    GuString tokens =
+        pgf_fullform_get_string(entry);
+
+    py_tokens = gu2py_string(tokens);
+    if (py_tokens == NULL)
+        goto done;
+
+    py_analyses = PyList_New(0);
+    if (py_analyses == NULL)
+        goto done;
+
+    GuPool* tmp_pool = gu_local_pool();
+    GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
+
+    PyMorphoCallback callback = { { pypgf_collect_morpho }, py_analyses };
+    pgf_fullform_get_analyses(entry, &callback.fn, err);
+
+    // err is allocated in tmp_pool, so read the status first and then
+    // release the scratch pool on both the success and failure path
+    int ok = gu_ok(err);
+    gu_pool_free(tmp_pool);
+
+    if (!ok)
+        goto done;
+
+    res = Py_BuildValue("OO", py_tokens, py_analyses);
+
+done:
+    // Py_BuildValue took its own references for "O" arguments, so the
+    // local references are dropped in every case
+    Py_XDECREF(py_tokens);
+    Py_XDECREF(py_analyses);
+
+    return res;
+}
+
+// Implementation of the fullFormLexicon method: allocates an iterator
+// object whose fetch function is Iter_fetch_fullform and whose result
+// enumeration comes from pgf_fullform_lexicon(). Returns NULL if the
+// iterator cannot be allocated or the enumeration is NULL.
+static PyObject*
+Concr_fullFormLexicon(ConcrObject* self, PyObject *args)
+{
+    IterObject* iter =
+        (IterObject*) pgf_IterType.tp_alloc(&pgf_IterType, 0);
+    if (iter == NULL)
+        return NULL;
+
+    // the iterator holds its own reference to the grammar
+    iter->grammar = self->grammar;
+    Py_XINCREF(iter->grammar);
+
+    iter->container = NULL;
+    iter->fetch = Iter_fetch_fullform;
+    iter->max_count = -1;
+    iter->counter = 0;
+
+    iter->pool = gu_new_pool();
+    iter->res = pgf_fullform_lexicon(self->concr, iter->pool);
+    if (iter->res == NULL) {
+        Py_DECREF(iter);
+        return NULL;
+    }
+
+    return (PyObject*) iter;
+}
+
static PyGetSetDef Concr_getseters[] = {
{"name",
(getter)Concr_getName, NULL,
@@ -1810,6 +1877,9 @@ static PyMethodDef Concr_methods[] = {
{"lookupMorpho", (PyCFunction)Concr_lookupMorpho, METH_VARARGS | METH_KEYWORDS,
"Looks up a word in the lexicon of the grammar"
},
+ {"fullFormLexicon", (PyCFunction)Concr_fullFormLexicon, METH_VARARGS,
+ "Enumerates all words in the lexicon (useful for extracting full form lexicons)"
+ },
{NULL} /* Sentinel */
};