Python binding: add a parsing function that directly accepts a list of tokens.

It allows defining a tokenizer in Python (or using an existing one, from nltk for instance).
author: gregoire.detrez <gregoire.detrez@gu.se> 2013-01-24 13:31:34 +0000
committer: gregoire.detrez <gregoire.detrez@gu.se> 2013-01-24 13:31:34 +0000
commit: 0aae4702edbd4889159e3772b72d0a4c10b7e57a (patch)
tree: 4b04b948fffe0ef768c204a25b1628a7640b0b27 /src
parent: e7db50b9bdd710690c78038d095bf249a9ed7f02 (diff)
3 files changed, 101 insertions, 0 deletions
diff --git a/src/runtime/c/pgf/pgf.c b/src/runtime/c/pgf/pgf.c
index ccee4bf24..2b720f093 100644
--- a/src/runtime/c/pgf/pgf.c
+++ b/src/runtime/c/pgf/pgf.c
@@ -227,6 +227,33 @@ pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
 	return pgf_parse_result(state, pool);
 }
 
+// Variant of pgf_parse that takes its input as an array of len tokens
+// rather than pulling them from a lexer
+GuEnum*
+pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char **tokens, int len, GuPool* pool)
+{
+    // Initial parse state for a sentence of the requested category
+    PgfParseState* current =
+        pgf_parser_init_state(concr, cat, 0, pool);
+
+    // Feed the tokens in order. The loop stops early as soon as the
+    // state becomes NULL, which also covers a failed initialisation.
+    int pos = 0;
+    while (current != NULL && pos < len) {
+        PgfToken word = gu_str_string(tokens[pos], pool);
+
+        current = pgf_parser_next_state(current, word, pool);
+        pos++;
+    }
+
+    if (current == NULL) {
+        return NULL;
+    }
+
+    // Enumerate the syntax trees reachable from the final state
+    return pgf_parse_result(current, pool);
+}
+
 void
 pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
 {
diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h
index 1f3947bff..afef6ec48 100644
--- a/src/runtime/c/pgf/pgf.h
+++ b/src/runtime/c/pgf/pgf.h
@@ -116,6 +116,9 @@ PgfExprEnum*
 pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool);
 
 PgfExprEnum*
+pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char* tokens[], int len, GuPool* pool);
+
+PgfExprEnum*
 pgf_generate(PgfPGF* pgf, PgfCId cat, GuPool* pool);
 
 // an experimental function. Please don't use it
diff --git a/src/runtime/python/pgf/binding.c b/src/runtime/python/pgf/binding.c
index 5efb7a0be..dbbf7eb3a 100644
--- a/src/runtime/python/pgf/binding.c
+++ b/src/runtime/python/pgf/binding.c
@@ -362,6 +362,74 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
 	return pyres;
 }
 
+// Concr_parse_tokens is the same as the above function but
+// instead of a string it expects a sequence of tokens as argument.
+// This is useful if you want to implement your own tokenizer in
+// python. Returns NULL with a python exception set on failure.
+static ExprIterObject*
+Concr_parse_tokens(ConcrObject* self, PyObject *args, PyObject *keywds)
+{
+    static char *kwlist[] = {"tokens", "cat", "n", NULL};
+    PyObject* obj;                  // the token sequence as passed in
+    const char *catname_s = NULL;   // NULL selects the start category
+    int max_count = -1;
+
+    // Parsing arguments: the tokens is a python object (O),
+    // cat is a string (s) and n an integer (i)
+    if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|si", kwlist,
+                                    &obj, &catname_s, &max_count))
+        return NULL;
+    // The python object should be a sequence; otherwise PySequence_Fast
+    // raises TypeError and returns NULL.
+    PyObject* seq = PySequence_Fast(obj, "expected a sequence");
+    if (seq == NULL)
+        return NULL;
+    int len = (int) PySequence_Fast_GET_SIZE(seq);
+
+    // Token array; one slot minimum since a zero-length VLA is undefined
+    char* tokens[len > 0 ? len : 1];
+    for (int i = 0; i < len; i++) {
+        tokens[i] = PyString_AsString(PySequence_Fast_GET_ITEM(seq, i));
+        if (tokens[i] == NULL) {
+            // PyString_AsString raises TypeError itself for non-string
+            // items, so we only have to release seq and return
+            Py_DECREF(seq);
+            return NULL;
+        }
+    }
+
+    ExprIterObject* pyres = (ExprIterObject*)
+        pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0);
+    if (pyres == NULL) {
+        Py_DECREF(seq);
+        return NULL;
+    }
+
+    pyres->pool = gu_new_pool();
+    pyres->max_count = max_count;
+    pyres->counter   = 0;
+
+    GuPool *tmp_pool = gu_local_pool();
+    GuString catname =
+        (catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool)
+                            : gu_str_string(catname_s, tmp_pool);
+
+    pyres->res =
+        pgf_parse_tokens(self->concr, catname, tokens, len, pyres->pool);
+
+    // tokens[] borrows the item strings' buffers, so seq must stay
+    // alive until pgf_parse_tokens has returned.
+    Py_DECREF(seq);
+    gu_pool_free(tmp_pool);
+
+    if (pyres->res == NULL) {
+        Py_DECREF(pyres);
+        PyErr_SetString(PGFError, "Something went wrong during parsing");
+        return NULL;
+    }
+    return pyres;
+}
+
 static PyObject*
 Concr_linearize(ConcrObject* self, PyObject *args)
 {
@@ -394,6 +462,9 @@ static PyMethodDef Concr_methods[] = {
     {"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS,
      "Parses a string and returns an iterator over the abstract trees for this sentence"
     },
+    {"parse_tokens", (PyCFunction)Concr_parse_tokens, METH_VARARGS | METH_KEYWORDS,
+     "Parses list of tokens and returns an iterator over the abstract trees for this sentence. Allows you to write your own tokenizer in python."
+    },
     {"linearize", (PyCFunction)Concr_linearize, METH_VARARGS,
      "Takes an abstract tree and linearizes it to a sentence"
     },
author	gregoire.detrez <gregoire.detrez@gu.se>	2013-01-24 13:31:34 +0000
committer	gregoire.detrez <gregoire.detrez@gu.se>	2013-01-24 13:31:34 +0000
commit	0aae4702edbd4889159e3772b72d0a4c10b7e57a (patch)
tree	4b04b948fffe0ef768c204a25b1628a7640b0b27 /src
parent	e7db50b9bdd710690c78038d095bf249a9ed7f02 (diff)