summaryrefslogtreecommitdiff
path: root/src/runtime/python/pgf
diff options
context:
space:
mode:
authorgregoire.detrez <gregoire.detrez@gu.se>2013-01-24 13:31:34 +0000
committergregoire.detrez <gregoire.detrez@gu.se>2013-01-24 13:31:34 +0000
commit0aae4702edbd4889159e3772b72d0a4c10b7e57a (patch)
tree4b04b948fffe0ef768c204a25b1628a7640b0b27 /src/runtime/python/pgf
parente7db50b9bdd710690c78038d095bf249a9ed7f02 (diff)
Python binding: add a parsing function that accepts directly a list of tokens.
It allows defining a tokenizer in Python (or using an existing one, from NLTK for instance).
Diffstat (limited to 'src/runtime/python/pgf')
-rw-r--r--src/runtime/python/pgf/binding.c71
1 files changed, 71 insertions, 0 deletions
diff --git a/src/runtime/python/pgf/binding.c b/src/runtime/python/pgf/binding.c
index 5efb7a0be..dbbf7eb3a 100644
--- a/src/runtime/python/pgf/binding.c
+++ b/src/runtime/python/pgf/binding.c
@@ -362,6 +362,74 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
return pyres;
}
+// Concr_parse_tokens is the same as Concr_parse above, but instead of
+// a string it expects a sequence of tokens as its argument.
+// This is useful if you want to implement your own tokenizer in
+// Python.
+//
+// Python-level arguments:
+//   tokens - any object accepted by PySequence_Fast (list, tuple or
+//            other iterable) whose items are strings
+//   cat    - optional category name; when omitted the grammar's start
+//            category (pgf_start_cat) is used
+//   n      - optional integer stored in the iterator's max_count
+//            field (defaults to -1)
+//
+// Returns a new ExprIterObject on success. On failure returns NULL
+// with a Python exception set: TypeError for a bad `tokens` argument
+// or a non-string item, OverflowError when there are more tokens than
+// fit in an int, MemoryError on allocation failure, and PGFError when
+// pgf_parse_tokens itself returns NULL.
+static ExprIterObject*
+Concr_parse_tokens(ConcrObject* self, PyObject *args, PyObject *keywds)
+{
+    static char *kwlist[] = {"tokens", "cat", "n", NULL};
+    PyObject* obj = NULL;
+    const char *catname_s = NULL;
+    int max_count = -1;
+    ExprIterObject* pyres = NULL;
+    char** tokens = NULL;
+
+    // Parsing arguments: tokens is a python object (O),
+    // cat is a string (s) and n an integer (i)
+    if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|si", kwlist,
+                                     &obj, &catname_s, &max_count))
+        return NULL;
+
+    // The python object should be a sequence. PySequence_Fast returns
+    // a new reference, or NULL with TypeError set if obj is not
+    // iterable.
+    PyObject* seq = PySequence_Fast(obj, "expected a sequence");
+    if (seq == NULL)
+        return NULL;
+
+    // Take the length from the fast sequence itself: obj may be an
+    // iterable without a length (e.g. a generator), in which case
+    // PySequence_Size(obj) would report -1.
+    Py_ssize_t len = PySequence_Fast_GET_SIZE(seq);
+    if (len > INT_MAX) {
+        PyErr_SetString(PyExc_OverflowError, "too many tokens");
+        goto done;
+    }
+
+    // Turn the (python) list of tokens into a string array. The array
+    // lives on the heap so that an empty or very long token list is
+    // handled safely (a zero-length VLA is undefined behaviour and a
+    // huge one can overflow the stack).
+    tokens = PyMem_New(char*, len);
+    if (tokens == NULL) {
+        PyErr_NoMemory();
+        goto done;
+    }
+    for (Py_ssize_t i = 0; i < len; i++) {
+        tokens[i] = PyString_AsString(PySequence_Fast_GET_ITEM(seq, i));
+        if (tokens[i] == NULL) {
+            // Note: if the list item is not a string,
+            // PyString_AsString raises TypeError itself
+            // so we just have to clean up and return
+            goto done;
+        }
+    }
+
+    // Allocate the result only once the input is known to be valid,
+    // so the error paths above have nothing else to release.
+    pyres = (ExprIterObject*)
+        pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0);
+    if (pyres == NULL)
+        goto done;
+
+    pyres->pool = gu_new_pool();
+    pyres->max_count = max_count;
+    pyres->counter = 0;
+
+    {
+        GuPool *tmp_pool = gu_local_pool();
+        GuString catname =
+            (catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool)
+                                : gu_str_string(catname_s, tmp_pool);
+
+        // The token pointers borrow the internal buffers of the string
+        // objects held by seq, so seq must stay alive across this call.
+        // NOTE(review): if the returned enumeration keeps reading the
+        // token strings lazily after this call returns, they need to be
+        // copied into pyres->pool instead - confirm against
+        // pgf_parse_tokens.
+        pyres->res =
+            pgf_parse_tokens(self->concr, catname, tokens, (int) len,
+                             pyres->pool);
+
+        gu_pool_free(tmp_pool);
+    }
+
+    if (pyres->res == NULL) {
+        Py_DECREF(pyres);
+        pyres = NULL;
+
+        PyErr_SetString(PGFError, "Something went wrong during parsing");
+    }
+
+done:
+    // PyMem_Free(NULL) is a no-op, so this is safe on every path.
+    PyMem_Free(tokens);
+    Py_DECREF(seq);
+    return pyres;
+}
+
+
static PyObject*
Concr_linearize(ConcrObject* self, PyObject *args)
{
@@ -394,6 +462,9 @@ static PyMethodDef Concr_methods[] = {
{"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS,
"Parses a string and returns an iterator over the abstract trees for this sentence"
},
+ {"parse_tokens", (PyCFunction)Concr_parse_tokens, METH_VARARGS | METH_KEYWORDS,
+ "Parses list of tokens and returns an iterator over the abstract trees for this sentence. Allows you to write your own tokenizer in python."
+ },
{"linearize", (PyCFunction)Concr_linearize, METH_VARARGS,
"Takes an abstract tree and linearizes it to a sentence"
},