diff options
| author | kr.angelov <kr.angelov@gmail.com> | 2012-05-08 12:13:28 +0000 |
|---|---|---|
| committer | kr.angelov <kr.angelov@gmail.com> | 2012-05-08 12:13:28 +0000 |
| commit | a6800fc0da1d90dad0362c806037f9a92ab3e813 (patch) | |
| tree | ad383d165e5d2fe36fe10729d83ff5aa201b0f6c /src/runtime/c/utils | |
| parent | 931066f6fc004c7a193e5200d13ea651c7e02fd1 (diff) | |
a new unbiased statistical parser. it is still far from perfect use it on your own risk.
Diffstat (limited to 'src/runtime/c/utils')
| -rw-r--r-- | src/runtime/c/utils/pgf-translate.c | 158 |
1 files changed, 86 insertions, 72 deletions
diff --git a/src/runtime/c/utils/pgf-translate.c b/src/runtime/c/utils/pgf-translate.c index 878e07992..e9f482b84 100644 --- a/src/runtime/c/utils/pgf-translate.c +++ b/src/runtime/c/utils/pgf-translate.c @@ -18,6 +18,33 @@ #include <locale.h> #include <time.h> +static void +print_result(PgfExprProb* ep, PgfConcr* to_concr, + GuWriter* wtr, GuExn* err, GuPool* ppool) +{ + // Write out the abstract syntax tree + gu_printf(wtr, err, " [%f] ", ep->prob); + pgf_print_expr(ep->expr, 0, wtr, err); + gu_putc('\n', wtr, err); + + // Enumerate the concrete syntax trees corresponding + // to the abstract tree. + GuEnum* cts = pgf_lzr_concretize(to_concr, ep->expr, ppool); + while (true) { + PgfCncTree ctree = + gu_next(cts, PgfCncTree, ppool); + if (gu_variant_is_null(ctree)) { + break; + } + gu_putc(' ', wtr, err); + // Linearize the concrete tree as a simple + // sequence of strings. + pgf_lzr_linearize_simple(to_concr , ctree, 0, wtr, err); + gu_putc('\n', wtr, err); + gu_writer_flush(wtr, err); + } +} + int main(int argc, char* argv[]) { // Set the character locale, so we can produce proper output. setlocale(LC_CTYPE, ""); @@ -32,15 +59,7 @@ int main(int argc, char* argv[]) { } char* filename = argv[1]; - GuString cat; - bool robust_mode; - if (argv[2][0] == '.') { - cat = gu_str_string(argv[2]+1, pool); - robust_mode = true; - } else { - cat = gu_str_string(argv[2], pool); - robust_mode = false; - } + GuString cat = gu_str_string(argv[2], pool); GuString from_lang = gu_str_string(argv[3], pool); GuString to_lang = gu_str_string(argv[4], pool); @@ -83,10 +102,6 @@ int main(int argc, char* argv[]) { pgf_parser_add_literal(from_concr, gu_str_string("Symb", pool), &pgf_nerc_literal_callback); - // Arbitrarily choose linearization index 0. Usually the initial - // categories we are interested in only have one field. - int lin_idx = 0; - // Create an output stream for stdout GuOut* out = gu_file_out(stdout, pool); @@ -95,6 +110,11 @@ int main(int argc, char* argv[]) { // Use a writer with hard-coded utf-8 encoding for now. GuWriter* wtr = gu_new_utf8_writer(out, pool); + // We will keep the latest results in the 'ppool' and + // we will iterate over them by using 'result'. + GuPool* ppool = NULL; + GuEnum* result = NULL; + // The interactive translation loop. // XXX: This currently reads stdin directly, so it doesn't support // encodings properly. TODO: use a locale reader for input @@ -109,20 +129,49 @@ int main(int argc, char* argv[]) { status = EXIT_FAILURE; } break; - } else if (line[0] == '\0') { + } else if (strcmp(line, "") == 0) { // End nicely on empty input break; + } else if (strcmp(line, "\n") == 0) { + // Empty line -> show the next tree for the last sentence + + if (result != NULL) { + clock_t start = clock(); + + PgfExprProb* ep = gu_next(result, PgfExprProb*, ppool); + + clock_t end = clock(); + double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC; + printf("%.2f sec\n", cpu_time_used); + + // The enumerator will return a null variant at the + // end of the results. + if (ep == NULL) { + goto fail_parse; + } + + print_result(ep, to_concr, wtr, err, ppool); + } + continue; + } + + // We release the last results + if (ppool != NULL) { + gu_pool_free(ppool); + ppool = NULL; + result = NULL; } + // We create a temporary pool for translating a single // sentence, so our memory usage doesn't increase over time. - GuPool* ppool = gu_new_pool(); + ppool = gu_new_pool(); clock_t start = clock(); // Begin parsing a sentence of the specified category - PgfParse* parse = - pgf_parser_parse(from_concr, cat, lin_idx, pool); - if (parse == NULL) { + PgfParseState* state = + pgf_parser_init_state(from_concr, cat, 0, pool); + if (state == NULL) { fprintf(stderr, "Couldn't begin parsing\n"); status = EXIT_FAILURE; break; @@ -133,13 +182,13 @@ int main(int argc, char* argv[]) { PgfLexer *lexer = pgf_new_lexer(rdr, pool); - // naive tokenization + // Tokenization GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool); PgfToken tok = pgf_lexer_next_token(lexer, lex_err, pool); while (!gu_exn_is_raised(lex_err)) { // feed the token to get a new parse state - parse = pgf_parse_token(parse, tok, robust_mode, ppool); - if (!parse) { + state = pgf_parser_next_state(state, tok, ppool); + if (!state) { gu_puts("Unexpected token: \"", wtr, err); gu_string_write(tok, wtr, err); gu_puts("\"\n", wtr, err); @@ -149,64 +198,29 @@ int main(int argc, char* argv[]) { tok = pgf_lexer_next_token(lexer, lex_err, pool); } - if (robust_mode) { - PgfExpr expr = pgf_parse_best_result(parse, ppool); - - clock_t end = clock(); - - double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC; - printf("%.2f sec\n", cpu_time_used); - - if (!gu_variant_is_null(expr)) { - gu_putc(' ', wtr, err); - // Write out the abstract syntax tree - pgf_print_expr(expr, 0, wtr, err); - gu_putc('\n', wtr, err); - } - } else { - // Now begin enumerating the resulting syntax trees - GuEnum* result = pgf_parse_result(parse, ppool); - - clock_t end = clock(); + // Now begin enumerating the resulting syntax trees + result = pgf_parse_result(state, ppool); - double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC; - printf("%.2f sec\n", cpu_time_used); + PgfExprProb* ep = gu_next(result, PgfExprProb*, ppool); - while (true) { - PgfExpr expr = gu_next(result, PgfExpr, ppool); + clock_t end = clock(); + double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC; + printf("%.2f sec\n", cpu_time_used); - // The enumerator will return a null variant at the - // end of the results. - if (gu_variant_is_null(expr)) { - break; - } - gu_putc(' ', wtr, err); - // Write out the abstract syntax tree - pgf_print_expr(expr, 0, wtr, err); - gu_putc('\n', wtr, err); - - // Enumerate the concrete syntax trees corresponding - // to the abstract tree. - GuEnum* cts = pgf_lzr_concretize(to_concr, expr, ppool); - while (true) { - PgfCncTree ctree = - gu_next(cts, PgfCncTree, ppool); - if (gu_variant_is_null(ctree)) { - break; - } - gu_puts(" ", wtr, err); - // Linearize the concrete tree as a simple - // sequence of strings. - pgf_lzr_linearize_simple(to_concr , ctree, lin_idx, - wtr, err); - gu_putc('\n', wtr, err); - gu_writer_flush(wtr, err); - } - } + // The enumerator will return a null variant at the + // end of the results. + if (ep == NULL) { + goto fail_parse; } + + print_result(ep, to_concr, wtr, err, ppool); + + continue; fail_parse: // Free all resources allocated during parsing and linearization gu_pool_free(ppool); + ppool = NULL; + result = NULL; } fail_concr: fail_read: |
