summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author kr.angelov <kr.angelov@gmail.com> 2013-06-26 07:36:03 +0000
committer kr.angelov <kr.angelov@gmail.com> 2013-06-26 07:36:03 +0000
commit 3c2d1890d0c5de25bbaa7c582c20bcd67bc47d8c (patch)
tree 9091c47c4693ab1ef14b98fc76726fec8c56c2f5 /src
parent d94b6146f2074608fba07421dd6185ac3a296680 (diff)
patch for adjustable heuristics from Python
Diffstat (limited to 'src')
-rw-r--r-- src/runtime/c/Makefile.am 6
-rw-r--r-- src/runtime/c/pgf/parser.c 18
-rw-r--r-- src/runtime/c/pgf/parser.h 4
-rw-r--r-- src/runtime/c/pgf/parseval.c 2
-rw-r--r-- src/runtime/c/pgf/pgf.c 40
-rw-r--r-- src/runtime/c/pgf/pgf.h 10
-rw-r--r-- src/runtime/c/utils/pgf-chunk.c 112
-rw-r--r-- src/runtime/c/utils/pgf-parse.c 22
-rw-r--r-- src/runtime/python/pypgf.c 17
9 files changed, 48 insertions, 183 deletions
diff --git a/src/runtime/c/Makefile.am b/src/runtime/c/Makefile.am
index e9dc866a5..47dd082fc 100644
--- a/src/runtime/c/Makefile.am
+++ b/src/runtime/c/Makefile.am
@@ -121,8 +121,7 @@ libpgf_la_SOURCES = \
bin_PROGRAMS = \
utils/pgf-print \
utils/pgf-translate \
- utils/pgf-parse \
- utils/pgf-chunk
+ utils/pgf-parse
utils_pgf_print_SOURCES = utils/pgf-print.c
utils_pgf_print_LDADD = libpgf.la libgu.la
@@ -133,9 +132,6 @@ utils_pgf_translate_LDADD = libpgf.la libgu.la
utils_pgf_parse_SOURCES = utils/pgf-parse.c
utils_pgf_parse_LDADD = libpgf.la libgu.la
-utils_pgf_chunk_SOURCES = utils/pgf-chunk.c
-utils_pgf_chunk_LDADD = libpgf.la libgu.la
-
AUTOMAKE_OPTIONS = foreign subdir-objects dist-bzip2
ACLOCAL_AMFLAGS = -I m4
include doxygen.am
diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c
index a1d8084ed..899628f6a 100644
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -1638,7 +1638,8 @@ pgf_parsing_default_beam_size(PgfConcr* concr)
}
static PgfParsing*
-pgf_new_parsing(PgfConcr* concr, GuPool* pool, GuPool* out_pool)
+pgf_new_parsing(PgfConcr* concr, double heuristics,
+ GuPool* pool, GuPool* out_pool)
{
PgfParsing* ps = gu_new(PgfParsing, pool);
ps->concr = concr;
@@ -1654,7 +1655,7 @@ pgf_new_parsing(PgfConcr* concr, GuPool* pool, GuPool* out_pool)
ps->prod_full_count = 0;
#endif
ps->free_item = NULL;
- ps->beam_size = pgf_parsing_default_beam_size(concr);
+ ps->beam_size = heuristics;
PgfExprMeta *expr_meta =
gu_new_variant(PGF_EXPR_META,
@@ -2214,6 +2215,7 @@ pgf_parse_print_chunks(PgfParseState* state)
// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat
PgfParseState*
pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx,
+ double heuristics,
GuPool* pool, GuPool* out_pool)
{
PgfCncCat* cnccat =
@@ -2223,8 +2225,12 @@ pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx,
gu_assert(lin_idx < cnccat->n_lins);
+ if (heuristics < 0) {
+ heuristics = pgf_parsing_default_beam_size(concr);
+ }
+
PgfParsing* ps =
- pgf_new_parsing(concr, pool, out_pool);
+ pgf_new_parsing(concr, heuristics, pool, out_pool);
PgfParseState* state =
pgf_new_parse_state(ps, NULL, NULL, pool);
@@ -2270,12 +2276,6 @@ pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx,
}
void
-pgf_parser_set_beam_size(PgfParseState* state, double beam_size)
-{
- state->ps->beam_size = beam_size;
-}
-
-void
pgf_parser_add_literal(PgfConcr *concr, PgfCId cat,
PgfLiteralCallback* callback)
{
diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h
index 8c4ba77e9..b49cba868 100644
--- a/src/runtime/c/pgf/parser.h
+++ b/src/runtime/c/pgf/parser.h
@@ -34,6 +34,7 @@ typedef struct PgfParseState PgfParseState;
/// Begin parsing
PgfParseState*
pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx,
+ double heuristics,
GuPool* pool, GuPool* out_pool);
/**<
* @param parser The parser to use
@@ -70,9 +71,6 @@ GuEnum*
pgf_parser_completions(PgfParseState* prev, GuString prefix);
void
-pgf_parser_set_beam_size(PgfParseState* state, double beam_size);
-
-void
pgf_parser_add_literal(PgfConcr *concr, PgfCId cat,
PgfLiteralCallback* callback);
diff --git a/src/runtime/c/pgf/parseval.c b/src/runtime/c/pgf/parseval.c
index 70b2666fd..eed216b82 100644
--- a/src/runtime/c/pgf/parseval.c
+++ b/src/runtime/c/pgf/parseval.c
@@ -157,7 +157,7 @@ pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat,
PgfMetricsLznState state;
state.funcs = &pgf_metrics_lin_funcs1;
- state.ps = pgf_parser_init_state(concr, cat, 0, pool, pool);
+ state.ps = pgf_parser_init_state(concr, cat, 0, -1, pool, pool);
state.marks = gu_new_buf(int, pool);
state.pos = 0;
state.phrases = gu_new_buf(PgfPhrase*, pool);
diff --git a/src/runtime/c/pgf/pgf.c b/src/runtime/c/pgf/pgf.c
index f1b85cae3..95b2132f5 100644
--- a/src/runtime/c/pgf/pgf.c
+++ b/src/runtime/c/pgf/pgf.c
@@ -211,9 +211,17 @@ GuEnum*
pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
GuPool* pool, GuPool* out_pool)
{
+ return pgf_parse_with_heuristics(concr, cat, lexer, -1.0, pool, out_pool);
+}
+
+GuEnum*
+pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
+ double heuristics,
+ GuPool* pool, GuPool* out_pool)
+{
// Begin parsing a sentence of the specified category
PgfParseState* state =
- pgf_parser_init_state(concr, cat, 0, pool, out_pool);
+ pgf_parser_init_state(concr, cat, 0, heuristics, pool, out_pool);
if (state == NULL) {
return NULL;
}
@@ -244,7 +252,7 @@ pgf_get_completions(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
{
// Begin parsing a sentence of the specified category
PgfParseState* state =
- pgf_parser_init_state(concr, cat, 0, pool, pool);
+ pgf_parser_init_state(concr, cat, 0, -1, pool, pool);
if (state == NULL) {
return NULL;
}
@@ -268,31 +276,3 @@ pgf_get_completions(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
// Now begin enumerating the resulting syntax trees
return pgf_parser_completions(state, prefix);
}
-
-void
-pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
-{
- // Begin parsing a sentence of the specified category
- PgfParseState* state =
- pgf_parser_init_state(concr, cat, 0, pool, pool);
- if (state == NULL) {
- printf("\n");
- return;
- }
-
- // Tokenization
- GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool);
- PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
- while (!gu_exn_is_raised(lex_err)) {
- // feed the token to get a new parse state
- state = pgf_parser_next_state(state, tok);
- if (state == NULL) {
- printf("\n");
- return;
- }
-
- tok = pgf_lexer_read_token(lexer, lex_err);
- }
-
- pgf_parse_print_chunks(state);
-}
diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h
index d83598cc0..2e7e43584 100644
--- a/src/runtime/c/pgf/pgf.h
+++ b/src/runtime/c/pgf/pgf.h
@@ -117,6 +117,11 @@ PgfExprEnum*
pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
GuPool* pool, GuPool* out_pool);
+PgfExprEnum*
+pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
+ double heuristics,
+ GuPool* pool, GuPool* out_pool);
+
GuEnum*
pgf_get_completions(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
GuString prefix, GuPool* pool);
@@ -128,11 +133,6 @@ pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat,
PgfExprEnum*
pgf_generate(PgfPGF* pgf, PgfCId cat, GuPool* pool);
-// an experimental function. Please don't use it
-void
-pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool);
-
-
/// @}
void
diff --git a/src/runtime/c/utils/pgf-chunk.c b/src/runtime/c/utils/pgf-chunk.c
deleted file mode 100644
index 5f4b8972a..000000000
--- a/src/runtime/c/utils/pgf-chunk.c
+++ /dev/null
@@ -1,112 +0,0 @@
-// Don't give too much hope to this script. It is doing the wrong thing
-// but let's see how far we can get with it.
-
-#include <gu/variant.h>
-#include <gu/map.h>
-#include <gu/dump.h>
-#include <gu/log.h>
-#include <gu/enum.h>
-#include <gu/file.h>
-#include <pgf/pgf.h>
-#include <pgf/parser.h>
-#include <pgf/lexer.h>
-#include <pgf/literals.h>
-#include <pgf/linearizer.h>
-#include <pgf/expr.h>
-#include <pgf/edsl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <locale.h>
-#include <time.h>
-
-int main(int argc, char* argv[]) {
- // Set the character locale, so we can produce proper output.
- setlocale(LC_CTYPE, "");
-
- // Create the pool that is used to allocate everything
- GuPool* pool = gu_new_pool();
- int status = EXIT_SUCCESS;
- if (argc != 4) {
- fprintf(stderr, "usage: %s pgf cat from_lang\n", argv[0]);
- status = EXIT_FAILURE;
- goto fail;
- }
- char* filename = argv[1];
-
- GuString cat = gu_str_string(argv[2], pool);
-
- GuString from_lang = gu_str_string(argv[3], pool);
-
- // Create an exception frame that catches all errors.
- GuExn* err = gu_new_exn(NULL, gu_kind(type), pool);
-
- // Read the PGF grammar.
- PgfPGF* pgf = pgf_read(filename, pool, err);
-
- // If an error occured, it shows in the exception frame
- if (!gu_ok(err)) {
- fprintf(stderr, "Reading PGF failed\n");
- status = EXIT_FAILURE;
- goto fail;
- }
-
- pgf_load_meta_child_probs(pgf, "../../../treebanks/PennTreebank/ParseEngAbs3.probs", pool, err);
- if (!gu_ok(err)) {
- fprintf(stderr, "Loading meta child probs failed\n");
- status = EXIT_FAILURE;
- goto fail;
- }
-
- // Look up the source and destination concrete categories
- PgfConcr* from_concr = pgf_get_language(pgf, from_lang);
- if (!from_concr) {
- fprintf(stderr, "Unknown language\n");
- status = EXIT_FAILURE;
- goto fail_concr;
- }
-
- // Register a callback for the literal category Symbol
- pgf_parser_add_literal(from_concr, gu_str_string("Symb", pool),
- &pgf_nerc_literal_callback);
-
- // We will keep the latest results in the 'ppool' and
- // we will iterate over them by using 'result'.
- GuPool* ppool = NULL;
-
- // The interactive translation loop.
- // XXX: This currently reads stdin directly, so it doesn't support
- // encodings properly. TODO: use a locale reader for input
- while (true) {
- char buf[4096];
- char* line = fgets(buf, sizeof(buf), stdin);
- if (line == NULL) {
- if (ferror(stdin)) {
- fprintf(stderr, "Input error\n");
- status = EXIT_FAILURE;
- }
- break;
- } else if (strcmp(line, "") == 0) {
- // End nicely on empty input
- break;
- }
-
- // We create a temporary pool for translating a single
- // sentence, so our memory usage doesn't increase over time.
- ppool = gu_new_pool();
-
- GuReader *rdr =
- gu_string_reader(gu_str_string(line, ppool), ppool);
- PgfLexer *lexer =
- pgf_new_simple_lexer(rdr, ppool);
-
- pgf_print_chunks(from_concr, cat, lexer, ppool);
-
- // Free all resources allocated during parsing and linearization
- gu_pool_free(ppool);
- }
-fail_concr:
-fail:
- gu_pool_free(pool);
- return status;
-}
diff --git a/src/runtime/c/utils/pgf-parse.c b/src/runtime/c/utils/pgf-parse.c
index a05d7988b..ba1088890 100644
--- a/src/runtime/c/utils/pgf-parse.c
+++ b/src/runtime/c/utils/pgf-parse.c
@@ -25,8 +25,8 @@ int main(int argc, char* argv[]) {
// Create the pool that is used to allocate everything
GuPool* pool = gu_new_pool();
int status = EXIT_SUCCESS;
- if (argc != 4) {
- fprintf(stderr, "usage: %s pgf-file start-cat cnc-lang\n", argv[0]);
+ if (argc < 4 || argc > 5) {
+ fprintf(stderr, "usage: %s pgf-file start-cat cnc-lang [heuristics]\n(0.0 <= heuristics < 1.0, default: 0.95)\n", argv[0]);
status = EXIT_FAILURE;
goto fail;
}
@@ -34,6 +34,11 @@ int main(int argc, char* argv[]) {
GuString cat = gu_str_string(argv[2], pool);
GuString lang = gu_str_string(argv[3], pool);
+ double heuristics = 0.95;
+ if (argc == 5) {
+ heuristics = atof(argv[4]);
+ }
+
// Create an exception frame that catches all errors.
GuExn* err = gu_new_exn(NULL, gu_kind(type), pool);
@@ -65,7 +70,7 @@ int main(int argc, char* argv[]) {
clock_t end = clock();
double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
- fprintf(stderr, "(%.0f ms) Ready to parse!\n", 1000.0 * cpu_time_used);
+ fprintf(stderr, "(%.0f ms) Ready to parse [heuristics=%.2f]!\n", 1000.0 * cpu_time_used, heuristics);
// Create an output stream for stdout
GuOut* out = gu_file_out(stdout, pool);
@@ -113,18 +118,9 @@ int main(int argc, char* argv[]) {
clock_t start = clock();
- // Begin parsing a sentence of the specified category
- PgfParseState* state =
- pgf_parser_init_state(concr, cat, 0, ppool, ppool);
- if (state == NULL) {
- fprintf(stderr, "Couldn't begin parsing\n");
- status = EXIT_FAILURE;
- break;
- }
-
GuReader *rdr = gu_string_reader(gu_str_string(line, ppool), ppool);
PgfLexer *lexer = pgf_new_simple_lexer(rdr, ppool);
- GuEnum* result = pgf_parse(concr, cat, lexer, ppool, ppool);
+ GuEnum* result = pgf_parse_with_heuristics(concr, cat, lexer, heuristics, ppool, ppool);
PgfExprProb* ep = NULL;
if (result != NULL)
diff --git a/src/runtime/python/pypgf.c b/src/runtime/python/pypgf.c
index dc2d18bfa..1c7cd5edc 100644
--- a/src/runtime/python/pypgf.c
+++ b/src/runtime/python/pypgf.c
@@ -692,15 +692,16 @@ void pypgf_container_descructor(PyObject *capsule)
static IterObject*
Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
{
- static char *kwlist[] = {"sentence", "tokens", "cat", "n", NULL};
+ static char *kwlist[] = {"sentence", "tokens", "cat", "n", "heuristics", NULL};
int len;
const uint8_t *buf = NULL;
PyObject* py_lexer = NULL;
const char *catname_s = NULL;
int max_count = -1;
- if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#Osi", kwlist,
- &buf, &len, &py_lexer, &catname_s, &max_count))
+ double heuristics = -1;
+ if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#Osid", kwlist,
+ &buf, &len, &py_lexer, &catname_s, &max_count, &heuristics))
return NULL;
if ((buf == NULL && py_lexer == NULL) ||
@@ -752,7 +753,8 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
}
pyres->res =
- pgf_parse(self->concr, catname, lexer, pyres->pool, out_pool);
+ pgf_parse_with_heuristics(self->concr, catname, lexer,
+ heuristics, pyres->pool, out_pool);
if (pyres->res == NULL) {
Py_DECREF(pyres);
@@ -1217,7 +1219,12 @@ static PyMethodDef Concr_methods[] = {
"Returns the print name of a function or category"
},
{"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS,
- "Parses a string and returns an iterator over the abstract trees for this sentence"
+ "Parses a string and returns an iterator over the abstract trees for this sentence\n\n"
+ "Named arguments:\n"
+ "- sentence (string) or tokens (list of strings)\n"
+ "- cat (string); OPTIONAL, default: the startcat of the grammar\n"
+ "- n (int), max. trees; OPTIONAL, default: extract all trees\n"
+ "- heuristics (double >= 0.0); OPTIONAL, default: taken from the flags in the grammar"
},
{"getCompletions", (PyCFunction)Concr_getCompletions, METH_VARARGS | METH_KEYWORDS,
"Parses a partial string and returns a list with the top n possible next tokens"