summaryrefslogtreecommitdiff
path: root/src/runtime/c
diff options
context:
space:
mode:
authorkr.angelov <kr.angelov@gmail.com>2013-10-09 12:08:51 +0000
committerkr.angelov <kr.angelov@gmail.com>2013-10-09 12:08:51 +0000
commit8cf03bc5b6895568eb08be1415985a64793bb81c (patch)
tree814e95e1b0422362066fef389e8b03741e8478ad /src/runtime/c
parent20e4970ec19949da10b86facd3c6a5a4abb03acb (diff)
a major redesign in the C runtime. The parser and the linearizer now fully support BIND. The following things are still broken: parseval, word completion, handling 'pre', the robust mode
Diffstat (limited to 'src/runtime/c')
-rw-r--r--src/runtime/c/Makefile.am3
-rw-r--r--src/runtime/c/gu/exn.h2
-rw-r--r--src/runtime/c/gu/utf8.h2
-rw-r--r--src/runtime/c/pgf/data.h46
-rw-r--r--src/runtime/c/pgf/lexer.c128
-rw-r--r--src/runtime/c/pgf/lexer.h31
-rw-r--r--src/runtime/c/pgf/linearizer.c23
-rw-r--r--src/runtime/c/pgf/linearizer.h2
-rw-r--r--src/runtime/c/pgf/literals.c266
-rw-r--r--src/runtime/c/pgf/parser.c1901
-rw-r--r--src/runtime/c/pgf/parser.h113
-rw-r--r--src/runtime/c/pgf/parseval.c35
-rw-r--r--src/runtime/c/pgf/pgf.c76
-rw-r--r--src/runtime/c/pgf/pgf.h32
-rw-r--r--src/runtime/c/pgf/printer.c32
-rw-r--r--src/runtime/c/pgf/reader.c32
-rw-r--r--src/runtime/c/utils/pgf-parse.c7
-rw-r--r--src/runtime/c/utils/pgf-translate.c23
18 files changed, 1171 insertions, 1583 deletions
diff --git a/src/runtime/c/Makefile.am b/src/runtime/c/Makefile.am
index a8d507d9d..6716ec042 100644
--- a/src/runtime/c/Makefile.am
+++ b/src/runtime/c/Makefile.am
@@ -39,7 +39,6 @@ pgfinclude_HEADERS = \
pgf/reader.h \
pgf/linearizer.h \
pgf/parser.h \
- pgf/lexer.h \
pgf/literals.h \
pgf/graphviz.h \
pgf/pgf.h
@@ -93,8 +92,6 @@ libpgf_la_SOURCES = \
pgf/parser.h \
pgf/jit.c \
pgf/parseval.c \
- pgf/lexer.c \
- pgf/lexer.h \
pgf/literals.c \
pgf/literals.h \
pgf/reader.h \
diff --git a/src/runtime/c/gu/exn.h b/src/runtime/c/gu/exn.h
index 9ee70438a..c043c4b8a 100644
--- a/src/runtime/c/gu/exn.h
+++ b/src/runtime/c/gu/exn.h
@@ -91,7 +91,7 @@ gu_exn_clear(GuExn* err) {
GuType*
gu_exn_caught(GuExn* err);
-inline const void*
+static inline const void*
gu_exn_caught_data(GuExn* err)
{
return err->data.data;
diff --git a/src/runtime/c/gu/utf8.h b/src/runtime/c/gu/utf8.h
index 3118c4071..db7dccaf9 100644
--- a/src/runtime/c/gu/utf8.h
+++ b/src/runtime/c/gu/utf8.h
@@ -37,7 +37,7 @@ gu_utf8_decode(const uint8_t** utf8);
void
gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err);
-bool
+static inline bool
gu_is_space(uint8_t c) {
return (c == '\t' || c == '\n' || c == '\v' ||
c == '\f' || c == '\r' || c == ' ');
diff --git a/src/runtime/c/pgf/data.h b/src/runtime/c/pgf/data.h
index 2562af651..0dae42491 100644
--- a/src/runtime/c/pgf/data.h
+++ b/src/runtime/c/pgf/data.h
@@ -128,14 +128,16 @@ typedef struct {
extern GU_DECLARE_TYPE(PgfCncCat, abstract);
+typedef GuString PgfToken;
+typedef GuSeq PgfTokens;
+
bool
pgf_tokens_equal(PgfTokens* t1, PgfTokens* t2);
-typedef GuSeq PgfSequence; // -> PgfSymbol
-typedef GuSeq PgfSequences;
+typedef GuSeq PgfSymbols;
typedef struct {
- PgfSequence* form;
+ PgfSymbols* form;
/**< The form of this variant as a list of tokens. */
GuStrings* prefixes;
@@ -154,19 +156,8 @@ extern GU_DECLARE_TYPE(PgfCncFunOverloadMap, GuStringMap);
typedef GuMap PgfCncOverloadMap;
extern GU_DECLARE_TYPE(PgfCncOverloadMap, GuMap);
-typedef GuMap PgfProductionIdx;
-extern GU_DECLARE_TYPE(PgfProductionIdx, GuMap);
-
-typedef GuMap PgfLeftcornerTokIdx;
-extern GU_DECLARE_TYPE(PgfLeftcornerTokIdx, GuMap);
-
typedef struct PgfItem PgfItem;
-typedef struct {
- bool (*match)(PgfConcr* concr, PgfItem* item, PgfToken tok,
- PgfExprProb** out_ep, GuPool *pool);
-} PgfLiteralCallback;
-
typedef GuMap PgfCallbacksMap;
extern GU_DECLARE_TYPE(PgfCallbacksMap, GuMap);
@@ -197,7 +188,7 @@ typedef struct PgfSymbolKP
/** A prefix-dependent symbol. The form that this symbol takes
* depends on the form of a prefix of the following symbol. */
{
- PgfSequence* default_form;
+ PgfSymbols* default_form;
/**< Default form that this symbol takes if none of of the
* variant forms is triggered. */
@@ -214,6 +205,21 @@ typedef struct {
} PgfSymbolBIND;
typedef struct {
+ PgfExprProb* (*match)(PgfConcr* concr, PgfSymbol* psym, size_t lin_idx,
+ GuString sentence, size_t* poffset,
+ GuPool *pool, GuPool *out_pool);
+} PgfLiteralCallback;
+
+typedef GuBuf PgfProductionIdx;
+
+typedef struct {
+ PgfSymbols* syms; // -> PgfSymbol
+ PgfProductionIdx* idx;
+} PgfSequence;
+
+typedef GuSeq PgfSequences;
+
+typedef struct {
PgfAbsFun* absfun;
PgfExprProb *ep;
int funid;
@@ -230,8 +236,6 @@ struct PgfConcr {
GuMap* ccats;
PgfCncFunOverloadMap* fun_indices;
PgfCncOverloadMap* coerce_idx;
- PgfProductionIdx* epsilon_idx;
- PgfLeftcornerTokIdx* leftcorner_tok_idx;
PgfCncFuns* cncfuns;
PgfSequences* sequences;
PgfCIdMap* cnccats;
@@ -274,7 +278,6 @@ typedef struct PgfProductionCoerce
} PgfProductionCoerce;
typedef struct {
- PgfLiteralCallback *callback;
PgfExprProb *ep;
GuSeq* lins;
} PgfProductionExtern;
@@ -287,8 +290,11 @@ typedef struct {
typedef GuSeq PgfProductionSeq;
extern GU_DECLARE_TYPE(PgfProductionSeq, abstract);
-typedef GuBuf PgfProductionBuf;
-extern GU_DECLARE_TYPE(PgfProductionBuf, abstract);
+typedef struct {
+ PgfCCat* ccat;
+ size_t lin_idx;
+ PgfProductionApply* papp;
+} PgfProductionIdxEntry;
struct PgfCCat {
PgfCncCat* cnccat;
diff --git a/src/runtime/c/pgf/lexer.c b/src/runtime/c/pgf/lexer.c
deleted file mode 100644
index dda0bf6e0..000000000
--- a/src/runtime/c/pgf/lexer.c
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <gu/utf8.h>
-#include <pgf/pgf.h>
-#include <pgf/data.h>
-#include <wctype.h>
-
-typedef struct {
- PgfLexer base;
- GuIn* in;
- GuPool* pool;
- GuUCS ucs;
-} PgfSimpleLexer;
-
-static void
-pgf_lexer_read_ucs(PgfSimpleLexer *lexer, GuExn* err)
-{
- lexer->ucs = gu_in_utf8(lexer->in, err);
- if (gu_exn_is_raised(err)) {
- gu_exn_clear(err);
- lexer->ucs = ' ';
- }
-}
-
-static PgfToken
-pgf_simple_lexer_read_token(PgfLexer *base, GuExn* err)
-{
- PgfSimpleLexer* lexer = (PgfSimpleLexer*) base;
- GuPool* tmp_pool = gu_new_pool();
-
- GuStringBuf* buf = gu_string_buf(tmp_pool);
- GuOut* out = gu_string_buf_out(buf);
-
- while (iswspace(lexer->ucs)) {
- lexer->ucs = gu_in_utf8(lexer->in, err);
- if (gu_exn_is_raised(err))
- goto stop;
- }
-
- if (iswalpha(lexer->ucs) ||
- lexer->ucs == '\'' ||
- lexer->ucs == '_') {
- int counter = 0;
- do {
- gu_out_utf8(lexer->ucs, out, err);
- if (gu_exn_is_raised(err))
- goto stop;
- counter++;
- pgf_lexer_read_ucs(lexer, err);
-
- if (lexer->ucs == '.' && counter < 4) {
- // perhaps an abreviation
- gu_out_utf8(lexer->ucs, out, err);
- if (gu_exn_is_raised(err))
- goto stop;
- counter = 0;
- pgf_lexer_read_ucs(lexer, err);
- }
- } while (iswalnum(lexer->ucs) ||
- lexer->ucs == '\'' ||
- lexer->ucs == '_');
- } else if (iswdigit(lexer->ucs) || lexer->ucs == '-') {
- if (lexer->ucs == '-') {
- gu_out_utf8(lexer->ucs, out, err);
- if (gu_exn_is_raised(err))
- goto stop;
-
- pgf_lexer_read_ucs(lexer, err);
- if (!iswdigit(lexer->ucs))
- goto stop;
- }
-
- do {
- gu_out_utf8(lexer->ucs, out, err);
- if (gu_exn_is_raised(err))
- goto stop;
-
- pgf_lexer_read_ucs(lexer, err);
- } while (iswdigit(lexer->ucs));
-
- if (lexer->ucs == '.') {
- gu_out_utf8(lexer->ucs, out, err);
- if (gu_exn_is_raised(err))
- goto stop;
-
- pgf_lexer_read_ucs(lexer, err);
- while (iswdigit(lexer->ucs)) {
- gu_out_utf8(lexer->ucs, out, err);
- if (gu_exn_is_raised(err))
- goto stop;
- pgf_lexer_read_ucs(lexer, err);
- }
- }
- } else {
- gu_out_utf8(lexer->ucs, out, err);
- if (gu_exn_is_raised(err))
- goto stop;
- pgf_lexer_read_ucs(lexer, err);
- }
-
-stop:
- lexer->base.tok = gu_string_buf_freeze(buf, lexer->pool);
-
- gu_pool_free(tmp_pool);
- return lexer->base.tok;
-}
-
-PgfLexer*
-pgf_new_simple_lexer(GuIn *in, GuPool *pool)
-{
- PgfSimpleLexer* lexer = gu_new(PgfSimpleLexer, pool);
- lexer->base.read_token = pgf_simple_lexer_read_token;
- lexer->base.tok = "";
- lexer->in = in;
- lexer->pool = pool;
- lexer->ucs = ' ';
- return ((PgfLexer*) lexer);
-}
-
-PgfToken
-pgf_lexer_read_token(PgfLexer *lexer, GuExn* err)
-{
- return lexer->read_token(lexer, err);
-}
-
-PgfToken
-pgf_lexer_current_token(PgfLexer *lexer)
-{
- return lexer->tok;
-}
diff --git a/src/runtime/c/pgf/lexer.h b/src/runtime/c/pgf/lexer.h
deleted file mode 100644
index b55c07930..000000000
--- a/src/runtime/c/pgf/lexer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef PGF_LEXER_H_
-#define PGF_LEXER_H_
-
-#include <gu/in.h>
-#include <pgf/expr.h>
-
-/// A single lexical token
-typedef GuString PgfToken;
-typedef GuSeq PgfTokens; // -> PgfToken
-
-typedef struct {
- prob_t prob;
- PgfCId cat;
- PgfToken tok;
-} PgfTokenProb;
-
-typedef struct {
- PgfToken (*read_token)();
- PgfToken tok;
-} PgfLexer;
-
-PgfLexer*
-pgf_new_simple_lexer(GuIn *in, GuPool *pool);
-
-PgfToken
-pgf_lexer_read_token(PgfLexer *lexer, GuExn* err);
-
-PgfToken
-pgf_lexer_current_token(PgfLexer *lexer);
-
-#endif // PGF_LEXER_H_
diff --git a/src/runtime/c/pgf/linearizer.c b/src/runtime/c/pgf/linearizer.c
index aa47bd825..0465b6e1c 100644
--- a/src/runtime/c/pgf/linearizer.c
+++ b/src/runtime/c/pgf/linearizer.c
@@ -455,14 +455,13 @@ pgf_lzr_concretize(PgfConcr* concr, PgfExpr expr, GuPool* pool)
}
void
-pgf_lzr_linearize_sequence(PgfConcr* concr, PgfCncTreeApp* fapp,
- PgfSequence* seq, uint16_t seq_idx,
- PgfLinFuncs** fnsp)
+pgf_lzr_linearize_symbols(PgfConcr* concr, PgfCncTreeApp* fapp,
+ PgfSymbols* syms, uint16_t sym_idx,
+ PgfLinFuncs** fnsp)
{
- size_t nsyms = gu_seq_length(seq);
- PgfSymbol* syms = gu_seq_data(seq);
- for (size_t i = seq_idx; i < nsyms; i++) {
- PgfSymbol sym = syms[i];
+ size_t nsyms = gu_seq_length(syms);
+ for (size_t i = sym_idx; i < nsyms; i++) {
+ PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i);
GuVariantInfo sym_i = gu_variant_open(sym);
switch (sym_i.tag) {
case PGF_SYMBOL_CAT:
@@ -488,7 +487,7 @@ pgf_lzr_linearize_sequence(PgfConcr* concr, PgfCncTreeApp* fapp,
case PGF_SYMBOL_KP: {
// TODO: correct prefix-dependencies
PgfSymbolKP* kp = sym_i.data;
- pgf_lzr_linearize_sequence(concr, fapp, kp->default_form, 0, fnsp);
+ pgf_lzr_linearize_symbols(concr, fapp, kp->default_form, 0, fnsp);
break;
}
case PGF_SYMBOL_NE: {
@@ -528,9 +527,7 @@ pgf_lzr_linearize(PgfConcr* concr, PgfCncTree ctree, size_t lin_idx, PgfLinFuncs
}
gu_require(lin_idx < fun->n_lins);
-
- PgfSequence* seq = fun->lins[lin_idx];
- pgf_lzr_linearize_sequence(concr, fapp, seq, 0, fnsp);
+ pgf_lzr_linearize_symbols(concr, fapp, fun->lins[lin_idx]->syms, 0, fnsp);
if (fns->end_phrase) {
fns->end_phrase(fnsp,
@@ -681,7 +678,7 @@ pgf_lzr_linearize_simple(PgfConcr* concr, PgfCncTree ctree,
}
GuString
-pgf_get_tokens(PgfSequence* seq, uint16_t seq_idx, GuPool* pool)
+pgf_get_tokens(PgfSymbols* syms, uint16_t sym_idx, GuPool* pool)
{
GuPool* tmp_pool = gu_new_pool();
GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
@@ -695,7 +692,7 @@ pgf_get_tokens(PgfSequence* seq, uint16_t seq_idx, GuPool* pool)
.err = err
};
- pgf_lzr_linearize_sequence(NULL, NULL, seq, seq_idx, &flin.funcs);
+ pgf_lzr_linearize_symbols(NULL, NULL, syms, sym_idx, &flin.funcs);
GuString tokens = gu_ok(err) ? gu_string_buf_freeze(sbuf, pool)
: "";
diff --git a/src/runtime/c/pgf/linearizer.h b/src/runtime/c/pgf/linearizer.h
index 63ab2705a..1cc244575 100644
--- a/src/runtime/c/pgf/linearizer.h
+++ b/src/runtime/c/pgf/linearizer.h
@@ -76,5 +76,5 @@ pgf_lzr_linearize_simple(PgfConcr* concr, PgfCncTree ctree,
#ifdef PGF_PARSER_H_
// Used internally in the parser
GuString
-pgf_get_tokens(PgfSequence* seq, uint16_t seq_idx, GuPool* pool);
+pgf_get_tokens(PgfSymbols* sym, uint16_t sym_idx, GuPool* pool);
#endif
diff --git a/src/runtime/c/pgf/literals.c b/src/runtime/c/pgf/literals.c
index ad8d900cc..2043dd24c 100644
--- a/src/runtime/c/pgf/literals.c
+++ b/src/runtime/c/pgf/literals.c
@@ -11,48 +11,41 @@ GU_DEFINE_TYPE(PgfCallbacksMap, GuMap,
gu_ptr_type(PgfLiteralCallback), &gu_null_struct);
-static bool
-pgf_match_string_lit(PgfConcr* concr, PgfItem* item, PgfToken tok,
- PgfExprProb** out_ep, GuPool *pool)
+static PgfExprProb*
+pgf_match_string_lit(PgfConcr* concr, PgfSymbol* psym, size_t lin_idx,
+ GuString sentence, size_t* poffset,
+ GuPool *pool, GuPool *out_pool)
{
- GuPool* tmp_pool = gu_new_pool();
-
- size_t lin_idx;
- PgfSequence* seq;
- pgf_item_sequence(item, &lin_idx, &seq, tmp_pool);
gu_assert(lin_idx == 0);
- bool accepted = false;
- int n_syms = gu_seq_length(seq);
- if (n_syms == 0) {
- *out_ep = NULL;
- accepted = true;
- } else if (n_syms == 1) {
- PgfExprProb* ep = gu_new(PgfExprProb, pool);
+ size_t offset = *poffset;
+ while (!gu_is_space(sentence[offset]))
+ offset++;
+
+ size_t len = offset - *poffset;
+ if (len > 0) {
+ PgfExprProb* ep = gu_new(PgfExprProb, out_pool);
ep->prob = 0;
- PgfSymbolKS* sks =
- gu_variant_data(gu_seq_get(seq, PgfSymbol, 0));
-
PgfExprLit *expr_lit =
gu_new_variant(PGF_EXPR_LIT,
PgfExprLit,
- &ep->expr, pool);
+ &ep->expr, out_pool);
+
PgfLiteralStr *lit_str =
gu_new_flex_variant(PGF_LITERAL_STR,
PgfLiteralStr,
- val, strlen(sks->token)+1,
- &expr_lit->lit, pool);
- strcpy(lit_str->val, sks->token);
-
- *out_ep = ep;
- accepted = false;
+ val, len+1,
+ &expr_lit->lit, out_pool);
+ memcpy(lit_str->val, sentence+*poffset, len);
+ lit_str->val[len] = 0;
+
+ pgf_add_extern_tok(psym, lit_str->val, pool);
+ *poffset = offset;
+ return ep;
} else {
- *out_ep = NULL;
+ return NULL;
}
-
- gu_pool_free(tmp_pool);
- return accepted;
}
static PgfLiteralCallback pgf_string_literal_callback =
@@ -60,55 +53,46 @@ static PgfLiteralCallback pgf_string_literal_callback =
-static bool
-pgf_match_int_lit(PgfConcr* concr, PgfItem* item, PgfToken tok,
- PgfExprProb** out_ep, GuPool *pool)
+static PgfExprProb*
+pgf_match_int_lit(PgfConcr* concr, PgfSymbol* psym, size_t lin_idx,
+ GuString sentence, size_t* poffset,
+ GuPool *pool, GuPool *out_pool)
{
- GuPool* tmp_pool = gu_new_pool();
-
- size_t lin_idx;
- PgfSequence* seq;
- pgf_item_sequence(item, &lin_idx, &seq, tmp_pool);
gu_assert(lin_idx == 0);
- bool accepted = false;
- int n_syms = gu_seq_length(seq);
- if (n_syms == 0) {
- int val;
+ size_t offset = *poffset;
+ while (!gu_is_space(sentence[offset]))
+ offset++;
- *out_ep = NULL;
- accepted = gu_string_to_int(tok, &val);
- } else if (n_syms == 1) {
- PgfSymbolKS* sks =
- gu_variant_data(gu_seq_get(seq, PgfSymbol, 0));
+ size_t len = offset - *poffset;
+ if (len > 0) {
+ PgfToken tok = gu_malloc(pool, len+1);
+ memcpy((char*) tok, sentence+*poffset, len);
+ ((char*) tok)[len] = 0;
int val;
- if (!gu_string_to_int(sks->token, &val)) {
- *out_ep = NULL;
- } else {
- PgfExprProb* ep = gu_new(PgfExprProb, pool);
- ep->prob = 0;
-
- PgfExprLit *expr_lit =
- gu_new_variant(PGF_EXPR_LIT,
- PgfExprLit,
- &ep->expr, pool);
- PgfLiteralInt *lit_int =
- gu_new_variant(PGF_LITERAL_INT,
- PgfLiteralInt,
- &expr_lit->lit, pool);
- lit_int->val = val;
-
- *out_ep = ep;
- }
-
- accepted = false;
+ if (!gu_string_to_int(tok, &val))
+ return NULL;
+
+ PgfExprProb* ep = gu_new(PgfExprProb, pool);
+ ep->prob = 0;
+
+ PgfExprLit *expr_lit =
+ gu_new_variant(PGF_EXPR_LIT,
+ PgfExprLit,
+ &ep->expr, pool);
+ PgfLiteralInt *lit_int =
+ gu_new_variant(PGF_LITERAL_INT,
+ PgfLiteralInt,
+ &expr_lit->lit, pool);
+ lit_int->val = val;
+
+ pgf_add_extern_tok(psym, tok, pool);
+ *poffset = offset;
+ return ep;
} else {
- *out_ep = NULL;
+ return NULL;
}
-
- gu_pool_free(tmp_pool);
- return accepted;
}
static PgfLiteralCallback pgf_int_literal_callback =
@@ -116,55 +100,46 @@ static PgfLiteralCallback pgf_int_literal_callback =
-static bool
-pgf_match_float_lit(PgfConcr* concr, PgfItem* item, PgfToken tok,
- PgfExprProb** out_ep, GuPool *pool)
+static PgfExprProb*
+pgf_match_float_lit(PgfConcr* concr, PgfSymbol* psym, size_t lin_idx,
+ GuString sentence, size_t* poffset,
+ GuPool *pool, GuPool *out_pool)
{
- GuPool* tmp_pool = gu_new_pool();
-
- size_t lin_idx;
- PgfSequence* seq;
- pgf_item_sequence(item, &lin_idx, &seq, tmp_pool);
gu_assert(lin_idx == 0);
- bool accepted = false;
- int n_syms = gu_seq_length(seq);
- if (n_syms == 0) {
- double val;
+ size_t offset = *poffset;
+ while (!gu_is_space(sentence[offset]))
+ offset++;
- *out_ep = NULL;
- accepted = gu_string_to_double(tok, &val);
- } else if (n_syms == 1) {
- PgfSymbolKS* sks =
- gu_variant_data(gu_seq_get(seq, PgfSymbol, 0));
+ size_t len = offset - *poffset;
+ if (len > 0) {
+ PgfToken tok = gu_malloc(pool, len+1);
+ memcpy((char*) tok, sentence+*poffset, len);
+ ((char*) tok)[len] = 0;
double val;
- if (!gu_string_to_double(sks->token, &val)) {
- *out_ep = NULL;
- } else {
- PgfExprProb* ep = gu_new(PgfExprProb, pool);
- ep->prob = 0;
-
- PgfExprLit *expr_lit =
- gu_new_variant(PGF_EXPR_LIT,
- PgfExprLit,
- &ep->expr, pool);
- PgfLiteralFlt *lit_flt =
- gu_new_variant(PGF_LITERAL_FLT,
- PgfLiteralFlt,
- &expr_lit->lit, pool);
- lit_flt->val = val;
-
- *out_ep = ep;
- }
-
- accepted = false;
+ if (!gu_string_to_double(tok, &val))
+ return NULL;
+
+ PgfExprProb* ep = gu_new(PgfExprProb, pool);
+ ep->prob = 0;
+
+ PgfExprLit *expr_lit =
+ gu_new_variant(PGF_EXPR_LIT,
+ PgfExprLit,
+ &ep->expr, pool);
+ PgfLiteralFlt *lit_flt =
+ gu_new_variant(PGF_LITERAL_FLT,
+ PgfLiteralFlt,
+ &expr_lit->lit, pool);
+ lit_flt->val = val;
+
+ pgf_add_extern_tok(psym, tok, pool);
+ *poffset = offset;
+ return ep;
} else {
- *out_ep = NULL;
+ return NULL;
}
-
- gu_pool_free(tmp_pool);
- return accepted;
}
static PgfLiteralCallback pgf_float_literal_callback =
@@ -172,45 +147,49 @@ static PgfLiteralCallback pgf_float_literal_callback =
-static bool
-pgf_match_name_lit(PgfConcr* concr, PgfItem* item, PgfToken tok,
- PgfExprProb** out_ep, GuPool *pool)
+static PgfExprProb*
+pgf_match_name_lit(PgfConcr* concr, PgfSymbol* psym, size_t lin_idx,
+ GuString sentence, size_t* poffset,
+ GuPool *pool, GuPool *out_pool)
{
+ gu_assert(lin_idx == 0);
+
GuPool* tmp_pool = gu_new_pool();
+ GuStringBuf *sbuf = gu_string_buf(tmp_pool);
+ GuOut* out = gu_string_buf_out(sbuf);
+ GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
- size_t lin_idx;
- PgfSequence* seq;
- pgf_item_sequence(item, &lin_idx, &seq, tmp_pool);
+ size_t offset = *poffset;
- gu_assert(lin_idx == 0);
+ int i = 0;
+ while (iswupper(sentence[offset])) {
+ size_t len = 0;
+ while (!gu_is_space(sentence[offset+len])) {
+ len++;
+ }
- GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
-
- bool iscap = false;
- if (strcmp(tok, "-") == 0) {
- iscap = true;
- } else if (*tok) {
- GuIn* in = gu_string_in(tok, tmp_pool);
- iscap = iswupper(gu_in_utf8(in, err));
- }
+ PgfToken tok = gu_malloc(pool, len+1);
+ memcpy((char*) tok, sentence+offset, len);
+ ((char*) tok)[len] = 0;
- size_t n_syms = gu_seq_length(seq);
- if (!iscap && n_syms > 0) {
- GuStringBuf *sbuf = gu_string_buf(tmp_pool);
- GuOut* out = gu_string_buf_out(sbuf);
+ pgf_add_extern_tok(psym, tok, pool);
- for (size_t i = 0; i < n_syms; i++) {
- if (i > 0)
- gu_putc(' ', out, err);
+ if (i > 0)
+ gu_putc(' ', out, err);
+ gu_string_write(tok, out, err);
+
+ i++;
- PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
- gu_assert(gu_variant_tag(sym) == PGF_SYMBOL_KS);
- PgfSymbolKS* sks = gu_variant_data(sym);
+ offset += len;
+ *poffset = offset;
- gu_string_write(sks->token, out, err);
- }
+ while (gu_is_space(sentence[offset]))
+ offset++;
+ }
- PgfExprProb* ep = gu_new(PgfExprProb, pool);
+ PgfExprProb* ep = NULL;
+ if (i > 0) {
+ ep = gu_new(PgfExprProb, pool);
ep->prob = 0;
PgfExprApp *expr_app =
@@ -235,14 +214,11 @@ pgf_match_name_lit(PgfConcr* concr, PgfItem* item, PgfToken tok,
val, strlen(val)+1,
&expr_lit->lit, pool);
strcpy(lit_str->val, val);
- *out_ep = ep;
- } else {
- *out_ep = NULL;
}
gu_pool_free(tmp_pool);
-
- return iscap;
+
+ return ep;
}
PgfLiteralCallback pgf_nerc_literal_callback =
diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c
index 0ab757a62..476dda827 100644
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -4,6 +4,7 @@
#include <gu/assert.h>
#include <gu/file.h>
+#include <gu/utf8.h>
#include <math.h>
#include <stdlib.h>
@@ -14,6 +15,8 @@
typedef GuBuf PgfItemBuf;
static GU_DEFINE_TYPE(PgfItemBuf, abstract, _);
+typedef struct PgfParseState PgfParseState;
+
struct PgfItemConts {
PgfCCat* ccat;
size_t lin_idx;
@@ -44,10 +47,14 @@ typedef struct {
PgfConcr* concr;
GuPool* pool; // this pool is used for structures internal to the parser
GuPool* out_pool; // this pool is used for the allocating the final abstract trees
- GuBuf* expr_queue;
+ GuString sentence; // the sentence to be parsed
+ GuBuf* expr_queue; // during the extraction of abstract trees we push them in this queue
PgfExpr meta_var;
PgfProduction meta_prod;
int max_fid;
+ PgfParseState *before;
+ PgfParseState *after;
+ PgfExprEnum en; // enumeration for the generated trees
#ifdef PGF_COUNTS_DEBUG
int item_full_count;
int item_real_count;
@@ -59,33 +66,15 @@ typedef struct {
prob_t beam_size;
} PgfParsing;
-typedef struct {
- PgfCCat* ccat;
- size_t lin_idx;
-} PgfCFCat;
-
-static GU_DEFINE_TYPE(PgfCFCat, struct,
- GU_MEMBER(PgfCFCat, ccat, PgfCCat),
- GU_MEMBER(PgfCFCat, lin_idx, size_t));
-
-extern GuHasher pgf_cfcat_hasher;
-
-GU_DEFINE_TYPE(PgfProductionIdx, GuMap,
- gu_type(PgfCFCat), &pgf_cfcat_hasher,
- gu_ptr_type(PgfProductionBuf), &gu_null_struct);
-
-typedef struct PgfTokenState PgfTokenState;
+typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE;
typedef struct {
- bool (*match_token)(PgfTokenState* ts, PgfToken tok, PgfItem* item);
- PgfToken (*get_token)(PgfTokenState* ts);
- PgfProductionIdx* (*get_lexicon_idx)(PgfTokenState* ts);
-} PgfTokenFn;
-
-struct PgfTokenState {
- PgfTokenFn* fn;
- prob_t lexical_prob;
-};
+ PgfProductionIdx* idx;
+ BIND_TYPE bind_type;
+ size_t offset;
+} PgfLexiconIdxEntry;
+
+typedef GuBuf PgfLexiconIdx;
struct PgfParseState {
PgfParseState* next;
@@ -94,12 +83,14 @@ struct PgfParseState {
PgfItem* meta_item;
PgfContsMap* conts_map;
PgfGenCatMap* generated_cats;
- unsigned short offset;
+
+ bool needs_bind;
+ size_t start_offset;
+ size_t end_offset;
prob_t viterbi_prob;
- PgfParsing* ps;
- PgfTokenState* ts;
+ PgfLexiconIdx* lexicon_idx;
};
typedef struct PgfAnswers {
@@ -115,13 +106,6 @@ typedef struct {
size_t arg_idx;
} PgfExprState;
-typedef struct PgfParseResult PgfParseResult;
-
-struct PgfParseResult {
- PgfParseState* state;
- PgfExprEnum en;
-};
-
typedef struct PgfItemBase PgfItemBase;
struct PgfItem {
@@ -133,15 +117,12 @@ struct PgfItem {
PgfProduction prod;
PgfPArgs* args;
PgfSymbol curr_sym;
- uint16_t seq_idx;
+ uint16_t sym_idx;
uint8_t alt_idx; // position in the pre alternative
uint8_t alt; // the number of the alternative
prob_t inside_prob;
};
-GU_DEFINE_TYPE(PgfLeftcornerTokIdx, GuStringMap,
- gu_ptr_type(PgfProductionIdx), &gu_null_struct);
-
static PgfSymbol
pgf_prev_extern_sym(PgfSymbol sym)
{
@@ -151,10 +132,11 @@ pgf_prev_extern_sym(PgfSymbol sym)
return *((PgfSymbol*) (((PgfSymbolCat*) i.data)+1));
case PGF_SYMBOL_KP:
return *((PgfSymbol*) (((PgfSymbolKP*) i.data)+1));
- case PGF_SYMBOL_KS:
+ case PGF_SYMBOL_KS: {
PgfSymbolKS* sks = (PgfSymbolKS*) i.data;
size_t tok_len = strlen(sks->token);
return *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1));
+ }
case PGF_SYMBOL_LIT:
return *((PgfSymbol*) (((PgfSymbolLit*) i.data)+1));
case PGF_SYMBOL_VAR:
@@ -169,30 +151,53 @@ pgf_prev_extern_sym(PgfSymbol sym)
}
}
-size_t
-pgf_item_lin_idx(PgfItem* item) {
- return item->conts->lin_idx;
+void
+pgf_add_extern_tok(PgfSymbol* psym, PgfToken tok, GuPool* pool) {
+ PgfSymbol new_sym;
+ size_t tok_len = strlen(tok);
+ PgfSymbolKS* sks = (PgfSymbolKS*)
+ gu_alloc_variant(PGF_SYMBOL_KS,
+ sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1,
+ gu_alignof(PgfSymbolKS),
+ &new_sym, pool);
+ strcpy(sks->token, tok);
+ *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = *psym;
+ *psym = new_sym;
}
-int
-pgf_item_sequence_length(PgfItem* item)
+void
+pgf_add_extern_cat(PgfSymbol* psym, int d, int r, GuPool* pool) {
+ PgfSymbol new_sym;
+ PgfSymbolCat* scat = (PgfSymbolCat*)
+ gu_alloc_variant(PGF_SYMBOL_CAT,
+ sizeof(PgfSymbolCat)+sizeof(PgfSymbol),
+ gu_alignof(PgfSymbolCat),
+ &new_sym, pool);
+ *((PgfSymbol*) (scat+1)) = *psym;
+ scat->d = d;
+ scat->r = r;
+ *psym = new_sym;
+}
+
+static size_t
+pgf_item_symbols_length(PgfItem* item)
{
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
- return gu_seq_length(papp->fun->lins[item->conts->lin_idx]);
+ return gu_seq_length(papp->fun->lins[item->conts->lin_idx]->syms);
}
case PGF_PRODUCTION_COERCE: {
return 1;
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = i.data;
- PgfSequence* seq;
-
+ PgfSymbols* syms;
+
if (pext->lins != NULL &&
- (seq = gu_seq_get(pext->lins,PgfSequence*,item->conts->lin_idx)) != NULL) {
- return gu_seq_length(seq);
+ (syms = gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx)) != NULL) {
+ return gu_seq_length(syms);
} else {
int seq_len = 0;
PgfSymbol sym = item->curr_sym;
@@ -220,33 +225,33 @@ pgf_item_sequence_length(PgfItem* item)
}
}
-static PgfSequence*
-pgf_extern_seq_get(PgfItem* item, GuPool* pool)
+static PgfSymbols*
+pgf_extern_syms_get(PgfItem* item, GuPool* pool)
{
- int seq_len = pgf_item_sequence_length(item);
+ int syms_len = pgf_item_symbols_length(item);
- PgfSequence* seq =
- gu_new_seq(PgfSymbol, seq_len, pool);
+ PgfSymbols* syms =
+ gu_new_seq(PgfSymbol, syms_len, pool);
PgfSymbol sym = item->curr_sym;
while (!gu_variant_is_null(sym)) {
- gu_seq_set(seq, PgfSymbol, --seq_len, sym);
+ gu_seq_set(syms, PgfSymbol, --syms_len, sym);
sym = pgf_prev_extern_sym(sym);
}
- return seq;
+ return syms;
}
-void
-pgf_item_sequence(PgfItem* item,
- size_t* lin_idx, PgfSequence** seq,
- GuPool* pool) {
+static void
+pgf_item_symbols(PgfItem* item,
+ size_t* lin_idx, PgfSymbols** syms,
+ GuPool* pool) {
*lin_idx = item->conts->lin_idx;
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
- *seq = papp->fun->lins[item->conts->lin_idx];
+ *syms = papp->fun->lins[item->conts->lin_idx]->syms;
break;
}
case PGF_PRODUCTION_COERCE: {
@@ -254,21 +259,21 @@ pgf_item_sequence(PgfItem* item,
gu_new_variant_i(pool, PGF_SYMBOL_CAT,
PgfSymbolCat,
.d = 0, .r = item->conts->lin_idx);
- *seq = gu_new_seq(PgfSequence*, 1, pool);
- gu_seq_set(*seq, PgfSymbol, 0, sym);
+ *syms = gu_new_seq(PgfSymbol, 1, pool);
+ gu_seq_set(*syms, PgfSymbol, 0, sym);
break;
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = i.data;
if (pext->lins == NULL ||
- (*seq = gu_seq_get(pext->lins, PgfSequence*, item->conts->lin_idx)) == NULL) {
- *seq = pgf_extern_seq_get(item, pool);
+ (*syms = gu_seq_get(pext->lins, PgfSymbols*, item->conts->lin_idx)) == NULL) {
+ *syms = pgf_extern_syms_get(item, pool);
}
break;
}
case PGF_PRODUCTION_META: {
- *seq = pgf_extern_seq_get(item, pool);
+ *syms = pgf_extern_syms_get(item, pool);
break;
}
default:
@@ -285,7 +290,7 @@ pgf_print_production_args(PgfPArgs* args,
for (size_t j = 0; j < n_args; j++) {
if (j > 0)
gu_putc(',',out,err);
-
+
PgfPArg arg = gu_seq_get(args, PgfPArg, j);
if (arg.hypos != NULL &&
@@ -351,31 +356,38 @@ pgf_print_item_seq(PgfItem *item,
GuOut *out, GuExn* err, GuPool* pool)
{
size_t lin_idx;
- PgfSequence seq;
- pgf_item_sequence(item, &lin_idx, &seq, pool);
+ PgfSymbols* syms = NULL;
+ pgf_item_symbols(item, &lin_idx, &syms, pool);
gu_printf(out, err, "%d : ",lin_idx);
size_t index;
- for (index = 0; index < gu_seq_length(seq); index++) {
- if (item->seq_idx == index)
+ for (index = 0; index < gu_seq_length(syms); index++) {
+ if (item->sym_idx == index)
gu_printf(out, err, " . ");
- PgfSymbol *sym = gu_seq_index(seq, PgfSymbol, index);
- pgf_print_symbol(*sym, out, err);
+ PgfSymbol sym = gu_seq_get(syms, PgfSymbol, index);
+ pgf_print_symbol(sym, out, err);
}
- if (item->seq_idx == index)
+ if (item->sym_idx == index)
gu_printf(out, err, " .");
}
static void
+pgf_print_range(PgfParseState* start, PgfParseState* end, GuOut* out, GuExn* err)
+{
+ gu_printf(out, err, "%d-%d",
+ (start != NULL) ? start->end_offset : 0,
+ (start == end) ? end->end_offset : end->start_offset);
+}
+
+static void
pgf_print_item(PgfItem* item, PgfParseState* state, GuOut* out, GuExn* err, GuPool* pool)
{
- gu_printf(out, err, "[%d-%d; C%d -> ",
- item->conts->state ? item->conts->state->offset : 0,
- state ? state->offset : 0,
- item->conts->ccat->fid);
+ gu_printf(out, err, "[");
+ pgf_print_range(item->conts->state, state, out, err);
+ gu_printf(out, err, "; C%d -> ", item->conts->ccat->fid);
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
@@ -483,6 +495,45 @@ pgf_print_expr_state0(PgfExprState* st,
#endif
static int
+cmp_string(GuString* psent, size_t* plen, GuString tok)
+{
+ GuString sent = *psent;
+ size_t len = *plen;
+
+ while (*tok != 0) {
+ if (len == 0)
+ return -1;
+
+ if (((uint8_t) *sent) > ((uint8_t) *tok))
+ return 1;
+ else if (((uint8_t) *sent) < ((uint8_t) *tok))
+ return -2;
+
+ tok++;
+ sent++;
+ len--;
+ }
+
+ *psent = sent;
+ *plen = len;
+ return 0;
+}
+
+static bool
+skip_space(GuString* psent, size_t* plen)
+{
+ if (*plen == 0)
+ return false;
+
+ char c = **psent;
+ if (!gu_is_space(c))
+ return false;
+
+ (*psent)++;
+ return true;
+}
+
+static int
cmp_item_prob(GuOrder* self, const void* a, const void* b)
{
PgfItem *item1 = *((PgfItem **) a);
@@ -500,32 +551,31 @@ cmp_item_prob(GuOrder* self, const void* a, const void* b)
}
static GuOrder
-pgf_item_prob_order = { cmp_item_prob };
+pgf_item_prob_order[1] = { { cmp_item_prob } };
static PgfItemContss*
-pgf_parsing_get_contss(PgfContsMap* conts_map, PgfCCat* cat, GuPool *pool)
+pgf_parsing_get_contss(PgfParseState* state, PgfCCat* cat, GuPool *pool)
{
- PgfItemContss* contss = gu_map_get(conts_map, cat, PgfItemContss*);
+ PgfItemContss* contss = gu_map_get(state->conts_map, cat, PgfItemContss*);
if (contss == NULL) {
size_t n_lins = cat->cnccat->n_lins;
contss = gu_new_seq(PgfItemConts*, n_lins, pool);
for (size_t i = 0; i < n_lins; i++) {
gu_seq_set(contss, PgfItemConts*, i, NULL);
}
- gu_map_put(conts_map, cat, PgfItemContss*, contss);
+ gu_map_put(state->conts_map, cat, PgfItemContss*, contss);
}
return contss;
}
static PgfItemConts*
-pgf_parsing_get_conts(PgfContsMap* conts_map,
- PgfCCat* ccat, size_t lin_idx,
- PgfParseState* state,
+pgf_parsing_get_conts(PgfParseState* state,
+ PgfCCat* ccat, size_t lin_idx,
GuPool *pool)
{
gu_require(lin_idx < ccat->cnccat->n_lins);
PgfItemContss* contss =
- pgf_parsing_get_contss(conts_map, ccat, pool);
+ pgf_parsing_get_contss(state, ccat, pool);
PgfItemConts* conts = gu_seq_get(contss, PgfItemConts*, lin_idx);
if (!conts) {
conts = gu_new(PgfItemConts, pool);
@@ -555,13 +605,14 @@ gu_ccat_fini(GuFinalizer* fin)
}
static PgfCCat*
-pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts,
+pgf_parsing_create_completed(PgfParsing* ps, PgfParseState* state,
+ PgfItemConts* conts,
prob_t viterbi_prob)
{
- PgfCCat* cat = gu_new_flex(state->ps->pool, PgfCCat, fin, 1);
+ PgfCCat* cat = gu_new_flex(ps->pool, PgfCCat, fin, 1);
cat->cnccat = conts->ccat->cnccat;
cat->viterbi_prob = viterbi_prob;
- cat->fid = state->ps->max_fid++;
+ cat->fid = ps->max_fid++;
cat->conts = conts;
cat->answers = NULL;
cat->prods = NULL;
@@ -569,7 +620,7 @@ pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts,
gu_map_put(state->generated_cats, conts, PgfCCat*, cat);
cat->fin[0].fn = gu_ccat_fini;
- gu_pool_finally(state->ps->pool, cat->fin);
+ gu_pool_finally(ps->pool, cat->fin);
#ifdef PGF_COUNTS_DEBUG
state->ps->ccat_full_count++;
@@ -578,15 +629,6 @@ pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts,
return cat;
}
-static void
-pgf_parsing_add_production(PgfCCat* ccat, PgfProduction prod)
-{
- if (ccat->prods == NULL || ccat->n_synprods >= gu_seq_length(ccat->prods)) {
- ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1);
- }
- gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods++, prod);
-}
-
static PgfCCat*
pgf_parsing_get_completed(PgfParseState* state, PgfItemConts* conts)
{
@@ -602,18 +644,18 @@ pgf_item_set_curr_symbol(PgfItem* item, GuPool* pool)
PgfProductionApply* papp = i.data;
PgfCncFun* fun = papp->fun;
gu_assert(item->conts->lin_idx < fun->n_lins);
- PgfSequence* seq = fun->lins[item->conts->lin_idx];
- gu_assert(item->seq_idx <= gu_seq_length(seq));
- if (item->seq_idx == gu_seq_length(seq)) {
+ PgfSymbols* syms = fun->lins[item->conts->lin_idx]->syms;
+ gu_assert(item->sym_idx <= gu_seq_length(syms));
+ if (item->sym_idx == gu_seq_length(syms)) {
item->curr_sym = gu_null_variant;
} else {
- item->curr_sym = gu_seq_get(seq, PgfSymbol, item->seq_idx);
+ item->curr_sym = gu_seq_get(syms, PgfSymbol, item->sym_idx);
}
break;
}
case PGF_PRODUCTION_COERCE: {
- gu_assert(item->seq_idx <= 1);
- if (item->seq_idx == 1) {
+ gu_assert(item->sym_idx <= 1);
+ if (item->sym_idx == 1) {
item->curr_sym = gu_null_variant;
} else {
item->curr_sym = gu_new_variant_i(pool, PGF_SYMBOL_CAT,
@@ -634,12 +676,11 @@ pgf_item_set_curr_symbol(PgfItem* item, GuPool* pool)
}
static PgfItem*
-pgf_new_item(PgfItemConts* conts, PgfProduction prod,
- GuPool* pool, PgfParsing* ps)
+pgf_new_item(PgfParsing* ps, PgfItemConts* conts, PgfProduction prod)
{
PgfItem* item;
if (ps == NULL || ps->free_item == NULL)
- item = gu_new(PgfItem, pool);
+ item = gu_new(PgfItem, ps->pool);
else {
item = ps->free_item;
ps->free_item = ps->free_item->next;
@@ -661,7 +702,7 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod,
}
case PGF_PRODUCTION_COERCE: {
PgfProductionCoerce* pcoerce = pi.data;
- item->args = gu_new_seq(PgfPArg, 1, pool);
+ item->args = gu_new_seq(PgfPArg, 1, ps->pool);
PgfPArg* parg = gu_seq_index(item->args, PgfPArg, 0);
parg->hypos = NULL;
parg->ccat = pcoerce->coerce;
@@ -671,13 +712,7 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod,
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = pi.data;
item->args = gu_empty_seq();
- item->inside_prob = pext->ep ? pext->ep->prob : 0;
-
- int n_args = gu_seq_length(item->args);
- for (int i = 0; i < n_args; i++) {
- PgfPArg *arg = gu_seq_index(item->args, PgfPArg, i);
- item->inside_prob += arg->ccat->viterbi_prob;
- }
+ item->inside_prob = pext->ep->prob;
break;
}
case PGF_PRODUCTION_META: {
@@ -698,13 +733,13 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod,
item->conts = conts;
item->prod = prod;
item->curr_sym = gu_null_variant;
- item->seq_idx = 0;
+ item->sym_idx = 0;
item->alt_idx = 0;
item->alt = 0;
conts->ref_count++;
- pgf_item_set_curr_symbol(item, pool);
+ pgf_item_set_curr_symbol(item, ps->pool);
#ifdef PGF_COUNTS_DEBUG
if (ps != NULL) {
@@ -717,11 +752,11 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod,
}
static PgfItem*
-pgf_item_copy(PgfItem* item, GuPool* pool, PgfParsing* ps)
+pgf_item_copy(PgfItem* item, PgfParsing* ps)
{
PgfItem* copy;
if (ps == NULL || ps->free_item == NULL)
- copy = gu_new(PgfItem, pool);
+ copy = gu_new(PgfItem, ps->pool);
else {
copy = ps->free_item;
ps->free_item = ps->free_item->next;
@@ -742,14 +777,14 @@ pgf_item_copy(PgfItem* item, GuPool* pool, PgfParsing* ps)
static PgfItem*
pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *new_ccat,
- GuPool* pool, PgfParsing *ps)
+ PgfParsing *ps)
{
PgfCCat *old_ccat =
gu_seq_index(item->args, PgfPArg, d)->ccat;
- PgfItem* new_item = pgf_item_copy(item, pool, ps);
+ PgfItem* new_item = pgf_item_copy(item, ps);
size_t nargs = gu_seq_length(item->args);
- new_item->args = gu_new_seq(PgfPArg, nargs, pool);
+ new_item->args = gu_new_seq(PgfPArg, nargs, ps->pool);
memcpy(gu_seq_data(new_item->args), gu_seq_data(item->args),
nargs * sizeof(PgfPArg));
gu_seq_set(new_item->args, PgfPArg, d,
@@ -763,8 +798,9 @@ pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *new_ccat,
static void
pgf_item_advance(PgfItem* item, GuPool* pool)
{
+
if (GU_LIKELY(item->alt == 0)) {
- item->seq_idx++;
+ item->sym_idx++;
pgf_item_set_curr_symbol(item, pool);
}
else
@@ -772,8 +808,7 @@ pgf_item_advance(PgfItem* item, GuPool* pool)
}
static void
-pgf_item_free(PgfParseState* before, PgfParseState* after,
- PgfItem* item)
+pgf_item_free(PgfParsing* ps, PgfItem* item)
{
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
@@ -797,37 +832,21 @@ pgf_item_free(PgfParseState* before, PgfParseState* after,
if (cont == NULL)
continue;
- pgf_item_free(before, after, cont);
+ pgf_item_free(ps, cont);
}
}
#ifdef PGF_PARSER_DEBUG
memset(item, 0, sizeof(*item));
#endif
- item->next = before->ps->free_item;
- before->ps->free_item = item;
+ item->next = ps->free_item;
+ ps->free_item = item;
#ifdef PGF_COUNTS_DEBUG
- before->ps->item_real_count--;
+ ps->item_real_count--;
#endif
}
static void
-pgf_parsing_add_transition(PgfParseState* before, PgfParseState* after,
- PgfToken tok, PgfItem* item)
-{
- if (after->ts->fn->match_token(after->ts, tok, item)) {
- if (after->next == NULL) {
- after->viterbi_prob =
- item->inside_prob+item->conts->outside_prob;
- }
-
- gu_buf_heap_push(after->agenda, &pgf_item_prob_order, &item);
- } else {
- pgf_item_free(before, after, item);
- }
-}
-
-static void
pgf_result_predict(PgfParsing* ps,
PgfExprState* cont, PgfCCat* ccat);
@@ -836,12 +855,13 @@ pgf_result_production(PgfParsing* ps,
PgfAnswers* answers, PgfProduction prod);
static void
-pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
+pgf_parsing_combine(PgfParsing* ps,
+ PgfParseState* before, PgfParseState* after,
PgfItem* cont, PgfCCat* cat, int lin_idx)
{
if (cont == NULL) {
- if (after == NULL) {
- pgf_result_predict(before->ps, NULL, cat);
+ if (before->end_offset == strlen(ps->sentence)) {
+ pgf_result_predict(ps, NULL, cat);
}
return;
}
@@ -850,29 +870,29 @@ pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
switch (gu_variant_tag(cont->curr_sym)) {
case PGF_SYMBOL_CAT: {
PgfSymbolCat* scat = gu_variant_data(cont->curr_sym);
- item = pgf_item_update_arg(cont, scat->d, cat, before->ps->pool, before->ps);
+ item = pgf_item_update_arg(cont, scat->d, cat, ps);
break;
}
case PGF_SYMBOL_LIT: {
PgfSymbolLit* slit = gu_variant_data(cont->curr_sym);
- item = pgf_item_update_arg(cont, slit->d, cat, before->ps->pool, before->ps);
+ item = pgf_item_update_arg(cont, slit->d, cat, ps);
break;
}
default:
gu_impossible();
}
- pgf_item_advance(item, before->ps->pool);
- gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
+ pgf_item_advance(item, ps->pool);
+ gu_buf_heap_push(before->agenda, pgf_item_prob_order, &item);
}
static void
-pgf_parsing_production(PgfParseState* state,
+pgf_parsing_production(PgfParsing* ps, PgfParseState* state,
PgfItemConts* conts, PgfProduction prod)
{
PgfItem* item =
- pgf_new_item(conts, prod, state->ps->pool, state->ps);
- gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
+ pgf_new_item(ps, conts, prod);
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
}
static PgfProduction
@@ -904,9 +924,9 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
PgfProductionExtern* pext = i.data;
if (pext->lins == NULL ||
- gu_seq_get(pext->lins,PgfSequence*,item->conts->lin_idx) == NULL) {
- PgfSequence* seq =
- pgf_extern_seq_get(item, pool);
+ gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx) == NULL) {
+ PgfSymbols* syms =
+ pgf_extern_syms_get(item, pool);
size_t n_lins = item->conts->ccat->cnccat->n_lins;
@@ -914,22 +934,20 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
gu_new_variant(PGF_PRODUCTION_EXTERN,
PgfProductionExtern,
&prod, pool);
- new_pext->callback = pext->callback;
new_pext->ep = ep;
- new_pext->lins = gu_new_seq(PgfSequence*, n_lins, pool);
+ new_pext->lins = gu_new_seq(PgfSymbols*, n_lins, pool);
if (pext->lins == NULL) {
for (size_t i = 0; i < n_lins; i++) {
- gu_seq_set(new_pext->lins,PgfSequence*,i,
- NULL);
+ gu_seq_set(new_pext->lins,PgfSymbols*,i,NULL);
}
} else {
for (size_t i = 0; i < n_lins; i++) {
- gu_seq_set(new_pext->lins,PgfSequence*,i,
- gu_seq_get(pext->lins,PgfSequence*,i));
+ gu_seq_set(new_pext->lins,PgfSymbols*,i,
+ gu_seq_get(pext->lins,PgfSymbols*,i));
}
}
- gu_seq_set(new_pext->lins,PgfSequence*,item->conts->lin_idx,seq);
+ gu_seq_set(new_pext->lins,PgfSymbols*,item->conts->lin_idx,syms);
} else {
prod = item->prod;
}
@@ -952,42 +970,41 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
}
static void
-pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
- PgfItem* item, PgfExprProb *ep)
-{
- PgfProduction prod =
- pgf_parsing_new_production(item, ep, before->ps->pool);
-#ifdef PGF_COUNTS_DEBUG
- before->ps->prod_full_count++;
-#endif
-
- PgfCCat* tmp_cat = pgf_parsing_get_completed(before, item->conts);
- PgfCCat* cat = tmp_cat;
- if (cat == NULL) {
- cat = pgf_parsing_create_completed(before, item->conts,
- item->inside_prob);
+pgf_parsing_add_production(PgfParsing* ps,
+ PgfParseState* before, PgfParseState* after,
+ PgfItemConts* conts, PgfProduction prod,
+ prob_t inside_prob)
+{
+ PgfCCat* tmp_ccat = pgf_parsing_get_completed(before, conts);
+ PgfCCat* ccat = tmp_ccat;
+ if (ccat == NULL) {
+ ccat = pgf_parsing_create_completed(ps, before, conts, inside_prob);
}
- pgf_parsing_add_production(cat, prod);
+ if (ccat->prods == NULL || ccat->n_synprods >= gu_seq_length(ccat->prods)) {
+ ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1);
+ }
+ gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods++, prod);
#ifdef PGF_PARSER_DEBUG
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
- if (tmp_cat == NULL)
- gu_printf(out, err, "[%d-%d; C%d; %d; C%d]\n",
- item->conts->state ? item->conts->state->offset : 0,
- before->offset,
- item->conts->ccat->fid,
- item->conts->lin_idx,
- cat->fid);
- pgf_print_production(cat->fid, prod, out, err, tmp_pool);
+ if (tmp_ccat == NULL) {
+ gu_printf(out, err, "[");
+ pgf_print_range(conts->state, before, out, err);
+ gu_printf(out, err, "; C%d; %d; C%d]\n",
+ conts->ccat->fid,
+ conts->lin_idx,
+ ccat->fid);
+ }
+ pgf_print_production(ccat->fid, prod, out, err, tmp_pool);
gu_pool_free(tmp_pool);
#endif
- if (tmp_cat != NULL) {
+ if (tmp_ccat != NULL) {
PgfItemContss* contss =
- pgf_parsing_get_contss(before->conts_map, cat, before->ps->pool);
+ pgf_parsing_get_contss(before, ccat, ps->pool);
size_t n_contss = gu_seq_length(contss);
for (size_t i = 0; i < n_contss; i++) {
PgfItemConts* conts2 = gu_seq_get(contss, PgfItemConts*, i);
@@ -997,7 +1014,7 @@ pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
* production immediately to the agenda,
* i.e. process it. */
if (conts2) {
- pgf_parsing_production(before, conts2, prod);
+ pgf_parsing_production(ps, before, conts2, prod);
}
}
@@ -1006,7 +1023,7 @@ pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
PgfParseState* state = after;
while (state != NULL) {
PgfItemContss* contss =
- pgf_parsing_get_contss(state->conts_map, cat, state->ps->pool);
+ pgf_parsing_get_contss(state, ccat, ps->pool);
size_t n_contss = gu_seq_length(contss);
for (size_t i = 0; i < n_contss; i++) {
PgfItemConts* conts2 = gu_seq_get(contss, PgfItemConts*, i);
@@ -1016,33 +1033,249 @@ pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
* production immediately to the agenda,
* i.e. process it. */
if (conts2) {
- pgf_parsing_production(state, conts2, prod);
+ pgf_parsing_production(ps, state, conts2, prod);
}
}
state = state->next;
}
- if (cat->answers != NULL) {
- pgf_result_production(before->ps, cat->answers, prod);
+ if (ccat->answers != NULL) {
+ pgf_result_production(ps, ccat->answers, prod);
}
} else {
- size_t n_conts = gu_buf_length(item->conts->items);
+ size_t n_conts = gu_buf_length(conts->items);
for (size_t i = 0; i < n_conts; i++) {
- PgfItem* cont = gu_buf_get(item->conts->items, PgfItem*, i);
- pgf_parsing_combine(before, after, cont, cat, item->conts->lin_idx);
+ PgfItem* cont = gu_buf_get(conts->items, PgfItem*, i);
+ pgf_parsing_combine(ps, before, after, cont, ccat, conts->lin_idx);
}
}
}
static void
-pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
+pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
+{
+ PgfProduction prod =
+ pgf_parsing_new_production(item, ep, ps->pool);
+#ifdef PGF_COUNTS_DEBUG
+ before->ps->prod_full_count++;
+#endif
+
+ pgf_parsing_add_production(ps, ps->before, ps->after, item->conts, prod, item->inside_prob);
+}
+
+static int
+pgf_symbols_cmp(GuString* psent, size_t sent_len, BIND_TYPE* pbind, PgfSymbols* syms)
+{
+ GuString sent = *psent;
+
+ size_t n_syms = gu_seq_length(syms);
+ for (size_t i = 0; i < n_syms; i++) {
+ PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i);
+
+ GuVariantInfo inf = gu_variant_open(sym);
+ switch (inf.tag) {
+ case PGF_SYMBOL_CAT:
+ case PGF_SYMBOL_LIT:
+ case PGF_SYMBOL_VAR: {
+ if (sent_len == 0)
+ return -1;
+ return 1;
+ }
+ case PGF_SYMBOL_KS: {
+ PgfSymbolKS* pks = inf.data;
+ if (sent_len == 0)
+ return -1;
+
+ if (*pbind == BIND_HARD)
+ *pbind = BIND_NONE;
+ else {
+ if (*pbind != BIND_SOFT && !skip_space(&sent, &sent_len))
+ return 1;
+
+ while (*sent != 0) {
+ if (!skip_space(&sent, &sent_len))
+ break;
+ }
+ }
+
+ int cmp = cmp_string(&sent, &sent_len, pks->token);
+ if (cmp != 0)
+ return cmp;
+ break;
+ }
+ case PGF_SYMBOL_KP: {
+ return -2;
+ }
+ case PGF_SYMBOL_BIND: {
+ *pbind = BIND_HARD;
+ break;
+ }
+ case PGF_SYMBOL_NE: {
+ return -2;
+ }
+ default:
+ gu_impossible();
+ }
+ }
+
+ *psent = sent;
+ return 0;
+}
+
+static void
+pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state)
+{
+ PgfSequence* epsilon_seq =
+ gu_seq_index(ps->concr->sequences, PgfSequence, 0);
+ if (gu_seq_length(epsilon_seq->syms) == 0) {
+ // Since the sequences are sorted, the epsilon sequence will
+ // always be the first if there is any at all. We should
+ // always add the epsilon in the index, because we do
+ // bottom up prediction for epsilons.
+ PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
+ entry->idx = epsilon_seq->idx;
+ entry->bind_type = BIND_NONE;
+ entry->offset = state->start_offset;
+ }
+
+ size_t i = 0;
+ size_t j = gu_seq_length(ps->concr->sequences)-1;
+ size_t s = j;
+ size_t n = 1;
+ size_t sent_len = strlen(ps->sentence);
+
+ while (state->end_offset + n <= sent_len) {
+ while (i <= j) {
+ size_t k = (i+j) / 2;
+ PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
+
+ GuString current = ps->sentence + state->end_offset;
+ BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
+ switch (pgf_symbols_cmp(&current, n, &bind_type, seq->syms)) {
+ case -2:
+ j = k-1;
+ s = j;
+ break;
+ case -1:
+ j = k-1;
+ break;
+ case 0: {
+ if (seq->idx != NULL) {
+ PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
+ entry->idx = seq->idx;
+ entry->bind_type = bind_type;
+ entry->offset = (current - ps->sentence);
+ }
+ i = k+1;
+ goto next;
+ }
+ case 1:
+ i = k+1;
+ break;
+ }
+ }
+
+next:
+ j = s;
+ n++;
+ }
+}
+
+static PgfParseState*
+pgf_new_parse_state(PgfParsing* ps, size_t start_offset, BIND_TYPE bind_type)
+{
+ PgfParseState** pstate;
+ if (ps->before == NULL && start_offset == 0)
+ pstate = &ps->before;
+ else {
+ if (bind_type != BIND_NONE) {
+ if (ps->before->start_offset == start_offset &&
+ ps->before->end_offset == start_offset &&
+ !ps->before->needs_bind)
+ return ps->before;
+ } else {
+ if (ps->before->start_offset == start_offset)
+ return ps->before;
+ }
+
+ pstate = &ps->after;
+ while (*pstate != NULL) {
+ if (bind_type != BIND_NONE) {
+ if (ps->before->start_offset == start_offset &&
+ ps->before->end_offset == start_offset &&
+ !ps->before->needs_bind)
+ return ps->before;
+ } else {
+ if ((*pstate)->start_offset == start_offset)
+ return *pstate;
+ }
+ if ((*pstate)->start_offset > start_offset)
+ break;
+ pstate = &(*pstate)->next;
+ }
+ }
+
+ size_t end_offset = start_offset;
+ GuString current = ps->sentence + start_offset;
+ size_t len = strlen(current);
+ while (skip_space(&current, &len)) {
+ end_offset++;
+ }
+
+ if (bind_type == BIND_HARD && start_offset != end_offset)
+ return NULL;
+
+ PgfParseState* state = gu_new(PgfParseState, ps->pool);
+ state->next = *pstate;
+ state->agenda = gu_new_buf(PgfItem*, ps->pool);
+ state->meta_item = NULL;
+ state->generated_cats = gu_map_type_new(PgfGenCatMap, ps->pool);
+ state->conts_map = gu_map_type_new(PgfContsMap, ps->pool);
+ state->needs_bind = (bind_type == BIND_NONE) &&
+ (start_offset == end_offset);
+ state->start_offset = start_offset;
+ state->end_offset = end_offset;
+ state->viterbi_prob = 0;
+ state->lexicon_idx =
+ gu_new_buf(PgfLexiconIdxEntry, ps->pool);
+
+ if (ps->before == NULL && start_offset == 0)
+ state->needs_bind = false;
+
+ pgf_parsing_lookahead(ps, state);
+
+ *pstate = state;
+
+ return state;
+}
+
+static void
+pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
+{
+ GuString current = ps->sentence + ps->before->end_offset;
+ size_t len = strlen(current);
+
+ if (!ps->before->needs_bind && cmp_string(&current, &len, tok) == 0) {
+ PgfParseState* state =
+ pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE);
+ if (state->next == NULL) {
+ state->viterbi_prob =
+ item->inside_prob+item->conts->outside_prob;
+ }
+
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
+ } else {
+ pgf_item_free(ps, item);
+ }
+}
+
+static void
+pgf_parsing_td_predict(PgfParsing* ps,
PgfItem* item, PgfCCat* ccat, size_t lin_idx)
{
PgfItemConts* conts =
- pgf_parsing_get_conts(before->conts_map,
- ccat, lin_idx, before,
- before->ps->pool);
+ pgf_parsing_get_conts(ps->before, ccat, lin_idx, ps->pool);
gu_buf_push(conts->items, PgfItem*, item);
if (gu_buf_length(conts->items) == 1) {
/* First time we encounter this linearization
@@ -1053,71 +1286,52 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
item->inside_prob-conts->ccat->viterbi_prob+
item->conts->outside_prob;
- size_t n_prods = ccat->n_synprods;
- PgfProductionIdx* lexicon_idx = NULL;
- if (after != NULL) {
- lexicon_idx = after->ts->fn->get_lexicon_idx(after->ts);
-
- // we don't know the current token.
- // probably we just compute the list of completions
- if (lexicon_idx == NULL && ccat->fid < after->ps->concr->total_cats)
- n_prods = gu_seq_length(ccat->prods);
- }
-
// Top-down prediction for syntactic rules
- for (size_t i = 0; i < n_prods; i++) {
+ for (size_t i = 0; i < ccat->n_synprods; i++) {
PgfProduction prod =
gu_seq_get(ccat->prods, PgfProduction, i);
- pgf_parsing_production(before, conts, prod);
+ pgf_parsing_production(ps, ps->before, conts, prod);
}
- // Bottom-up prediction for lexical rules
-
- if (lexicon_idx != NULL) {
- PgfCFCat cfc = {ccat, lin_idx};
- PgfProductionBuf* tok_prods =
- gu_map_get(lexicon_idx, &cfc, PgfProductionBuf*);
-
- if (tok_prods != NULL) {
- size_t n_prods = gu_buf_length(tok_prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_buf_get(tok_prods, PgfProduction, i);
-
- pgf_parsing_production(before, conts, prod);
+ // Bottom-up prediction for lexical and epsilon rules
+ size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
+ for (size_t i = 0; i < n_idcs; i++) {
+ PgfLexiconIdxEntry* lentry =
+ gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
+ PgfParseState* state =
+ pgf_new_parse_state(ps, lentry->offset, lentry->bind_type);
+
+ if (state != NULL) {
+ size_t n_entries = gu_buf_length(lentry->idx);
+ for (size_t j = 0; j < n_entries; j++) {
+ PgfProductionIdxEntry* pentry =
+ gu_buf_index(lentry->idx, PgfProductionIdxEntry, j);
+
+ if (pentry->ccat == ccat && pentry->lin_idx == lin_idx) {
+ GuVariantInfo i = { PGF_PRODUCTION_APPLY, pentry->papp };
+ PgfProduction prod = gu_variant_close(i);
+ pgf_parsing_add_production(ps, state, state->next,
+ conts, prod,
+ pentry->papp->fun->absfun->ep.prob);
+ }
}
}
}
-
- // Bottom-up prediction for epsilon rules
- PgfCFCat cfc = {ccat, lin_idx};
- PgfProductionBuf* eps_prods =
- gu_map_get(before->ps->concr->epsilon_idx, &cfc, PgfProductionBuf*);
-
- if (eps_prods != NULL) {
- size_t n_prods = gu_buf_length(eps_prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_buf_get(eps_prods, PgfProduction, i);
-
- pgf_parsing_production(before, conts, prod);
- }
- }
} else {
/* If it has already been completed, combine. */
PgfCCat* completed =
- pgf_parsing_get_completed(before, conts);
+ pgf_parsing_get_completed(ps->before, conts);
if (completed) {
- pgf_parsing_combine(before, after, item, completed, lin_idx);
+ pgf_parsing_combine(ps, ps->before, ps->after, item, completed, lin_idx);
}
- PgfParseState* state = after;
+ PgfParseState* state = ps->after;
while (state != NULL) {
PgfCCat* completed =
pgf_parsing_get_completed(state, conts);
if (completed) {
- pgf_parsing_combine(state, state->next, item, completed, lin_idx);
+ pgf_parsing_combine(ps, state, state->next, item, completed, lin_idx);
}
state = state->next;
@@ -1126,32 +1340,24 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
}
static void
-pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after,
+pgf_parsing_meta_scan(PgfParsing* ps,
PgfItem* meta_item, prob_t meta_prob)
{
- PgfToken tok = after->ts->fn->get_token(after->ts);
+/* PgfToken tok = after->ts->fn->get_token(after->ts);
if (*tok == 0) {
- PgfItem* item = pgf_item_copy(meta_item, before->ps->pool, before->ps);
+ PgfItem* item = pgf_item_copy(meta_item, before->ps);
item->inside_prob += meta_prob;
- PgfSymbol prev = item->curr_sym;
- size_t tok_len = strlen(tok);
- PgfSymbolKS* sks = (PgfSymbolKS*)
- gu_alloc_variant(PGF_SYMBOL_KS,
- sizeof(PgfSymbolKS)+tok_len+1+sizeof(PgfSymbol),
- gu_alignof(PgfSymbolKS),
- &item->curr_sym, after->ps->pool);
- strcpy((char*) sks->token, (char*) tok);
- *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = prev;
+ pgf_add_extern_tok(item, tok, ps->pool);
- gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
- }
+ gu_buf_heap_push(before->agenda, pgf_item_prob_order, &item);
+ }*/
}
typedef struct {
GuMapItor fn;
- PgfParseState* state;
+ PgfParsing* ps;
PgfItem* meta_item;
} PgfMetaPredictFn;
@@ -1163,11 +1369,11 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err
PgfAbsCat* abscat = (PgfAbsCat*) key;
prob_t meta_prob = *((prob_t*) value);
PgfMetaPredictFn* clo = (PgfMetaPredictFn*) fn;
- PgfParseState* state = clo->state;
+ PgfParsing* ps = clo->ps;
PgfItem* meta_item = clo->meta_item;
PgfCncCat* cnccat =
- gu_map_get(state->ps->concr->cnccats, abscat->name, PgfCncCat*);
+ gu_map_get(ps->concr->cnccats, abscat->name, PgfCncCat*);
if (cnccat == NULL)
return;
@@ -1181,35 +1387,27 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err
for (size_t lin_idx = 0; lin_idx < cnccat->n_lins; lin_idx++) {
PgfItem* item =
- pgf_item_copy(meta_item, state->ps->pool, state->ps);
+ pgf_item_copy(meta_item, ps);
item->inside_prob +=
ccat->viterbi_prob+meta_prob;
size_t nargs = gu_seq_length(meta_item->args);
- item->args = gu_new_seq(PgfPArg, nargs+1, state->ps->pool);
+ item->args = gu_new_seq(PgfPArg, nargs+1, ps->pool);
memcpy(gu_seq_data(item->args), gu_seq_data(meta_item->args),
nargs * sizeof(PgfPArg));
gu_seq_set(item->args, PgfPArg, nargs,
((PgfPArg) { .hypos = NULL, .ccat = ccat }));
- PgfSymbol prev = item->curr_sym;
- PgfSymbolCat* scat = (PgfSymbolCat*)
- gu_alloc_variant(PGF_SYMBOL_CAT,
- sizeof(PgfSymbolCat)+sizeof(PgfSymbol),
- gu_alignof(PgfSymbolCat),
- &item->curr_sym, state->ps->pool);
- *((PgfSymbol*) (scat+1)) = prev;
- scat->d = nargs;
- scat->r = lin_idx;
-
- gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
+ pgf_add_extern_cat(&item->curr_sym, nargs, lin_idx, ps->pool);
+
+ gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
}
}
}
static void
-pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
- PgfItem* item, PgfSymbol sym) {
+pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
+{
switch (gu_variant_tag(sym)) {
case PGF_SYMBOL_CAT: {
PgfSymbolCat* scat = gu_variant_data(sym);
@@ -1217,124 +1415,136 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
if (parg->ccat->prods == NULL) {
// empty category
- pgf_item_free(before, after, item);
+ pgf_item_free(ps, item);
return;
}
- pgf_parsing_td_predict(before, after, item, parg->ccat, scat->r);
+ pgf_parsing_td_predict(ps, item, parg->ccat, scat->r);
break;
}
case PGF_SYMBOL_KS: {
- if (after != NULL) {
- PgfSymbolKS* sks = gu_variant_data(sym);
- pgf_item_advance(item, after->ps->pool);
- pgf_parsing_add_transition(before, after, sks->token, item);
- }
+ PgfSymbolKS* sks = gu_variant_data(sym);
+ pgf_item_advance(item, ps->pool);
+ pgf_parsing_add_transition(ps, sks->token, item);
break;
}
case PGF_SYMBOL_KP: {
- if (after != NULL) {
+ if (ps->after != NULL) {
PgfSymbolKP* skp = gu_variant_data(sym);
PgfSymbol sym;
if (item->alt == 0) {
PgfItem* new_item;
- new_item = pgf_item_copy(item, after->ps->pool, after->ps);
+ new_item = pgf_item_copy(item, ps);
new_item->alt = 1;
new_item->alt_idx = 0;
sym = gu_seq_get(skp->default_form, PgfSymbol, new_item->alt_idx);
- pgf_parsing_symbol(before, after, new_item, sym);
+ pgf_parsing_symbol(ps, new_item, sym);
for (size_t i = 0; i < skp->n_forms; i++) {
- PgfSequence* syms = skp->forms[i].form;
- PgfSequence* syms2 = skp->default_form;
+ PgfSymbols* syms = skp->forms[i].form;
+ PgfSymbols* syms2 = skp->default_form;
bool skip = false; /*pgf_tokens_equal(toks, toks2);
for (size_t j = 0; j < i; j++) {
PgfTokens* toks2 = skp->forms[j].form;
skip |= pgf_tokens_equal(toks, toks2);
}*/
if (!skip) {
- new_item = pgf_item_copy(item, after->ps->pool, after->ps);
+ new_item = pgf_item_copy(item, ps);
new_item->alt = i+2;
new_item->alt_idx = 0;
sym = gu_seq_get(syms, PgfSymbol, new_item->alt_idx);
- pgf_parsing_symbol(before, after, new_item, sym);
+ pgf_parsing_symbol(ps, new_item, sym);
}
}
} else {
- PgfSequence* syms =
+ PgfSymbols* syms =
(item->alt == 1) ? skp->default_form :
skp->forms[item->alt-2].form;
if (item->alt_idx < gu_seq_length(syms)) {
sym = gu_seq_get(syms, PgfSymbol, item->alt_idx);
- pgf_parsing_symbol(before, after, item, sym);
+ pgf_parsing_symbol(ps, item, sym);
} else {
item->alt = 0;
- pgf_item_advance(item, after->ps->pool);
- gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
+ pgf_item_advance(item, ps->pool);
+ gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
}
}
}
break;
}
case PGF_SYMBOL_LIT: {
- if (after != NULL) {
- PgfSymbolLit* slit = gu_variant_data(sym);
- PgfPArg* parg = gu_seq_index(item->args, PgfPArg, slit->d);
+ PgfSymbolLit* slit = gu_variant_data(sym);
+ PgfPArg* parg = gu_seq_index(item->args, PgfPArg, slit->d);
- if (parg->ccat->fid > 0 &&
- parg->ccat->fid >= before->ps->concr->total_cats) {
- pgf_parsing_td_predict(before, after, item, parg->ccat, slit->r);
- }
- else {
- PgfItemConts* conts =
- pgf_parsing_get_conts(before->conts_map,
- parg->ccat, slit->r, before,
- before->ps->pool);
- gu_buf_push(conts->items, PgfItem*, item);
-
- if (gu_buf_length(conts->items) == 1) {
- /* This is the first time when we encounter this
- * literal category so we must call the callback */
-
- PgfLiteralCallback* callback =
- gu_map_get(before->ps->concr->callbacks,
- parg->ccat->cnccat,
- PgfLiteralCallback*);
-
- if (callback != NULL) {
+ if (parg->ccat->fid >= ps->concr->total_cats) {
+ pgf_parsing_td_predict(ps, item, parg->ccat, slit->r);
+ }
+ else {
+ PgfItemConts* conts =
+ pgf_parsing_get_conts(ps->before,
+ parg->ccat, slit->r,
+ ps->pool);
+ gu_buf_push(conts->items, PgfItem*, item);
+
+ if (gu_buf_length(conts->items) == 1) {
+ /* This is the first time when we encounter this
+ * literal category so we must call the callback */
+
+ PgfLiteralCallback* callback =
+ gu_map_get(ps->concr->callbacks,
+ parg->ccat->cnccat,
+ PgfLiteralCallback*);
+
+ if (callback != NULL) {
+ size_t offset = ps->before->end_offset;
+ PgfSymbol curr_sym = gu_null_variant;
+ PgfExprProb *ep =
+ callback->match(ps->concr, &curr_sym,
+ item->conts->lin_idx,
+ ps->sentence, &offset,
+ ps->pool, ps->out_pool);
+
+ if (ep != NULL) {
PgfProduction prod;
PgfProductionExtern* pext =
gu_new_variant(PGF_PRODUCTION_EXTERN,
- PgfProductionExtern,
- &prod, before->ps->pool);
- pext->callback = callback;
- pext->ep = NULL;
+ PgfProductionExtern,
+ &prod, ps->pool);
+ pext->ep = ep;
pext->lins = NULL;
- pgf_parsing_production(before, conts, prod);
+ PgfParseState* state =
+ pgf_new_parse_state(ps, offset, BIND_NONE);
+ PgfItem* item =
+ pgf_new_item(ps, conts, prod);
+ item->curr_sym = curr_sym;
+ item->sym_idx = pgf_item_symbols_length(item);
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
}
} else {
- /* If it has already been completed, combine. */
+ pgf_item_free(ps, item);
+ }
+ } else {
+ /* If it has already been completed, combine. */
+ PgfCCat* completed =
+ pgf_parsing_get_completed(ps->before, conts);
+ if (completed) {
+ pgf_parsing_combine(ps, ps->before, ps->after, item, completed, slit->r);
+ }
+
+ PgfParseState* state = ps->after;
+ while (state != NULL) {
PgfCCat* completed =
- pgf_parsing_get_completed(before, conts);
+ pgf_parsing_get_completed(state, conts);
if (completed) {
- pgf_parsing_combine(before, after, item, completed, slit->r);
- }
-
- PgfParseState* state = after;
- while (state != NULL) {
- PgfCCat* completed =
- pgf_parsing_get_completed(state, conts);
- if (completed) {
- pgf_parsing_combine(state, state->next, item, completed, slit->r);
- }
-
- state = state->next;
+ pgf_parsing_combine(ps, state, state->next, item, completed, slit->r);
}
+
+ state = state->next;
}
}
}
@@ -1344,11 +1554,29 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
// XXX TODO proper support
break;
case PGF_SYMBOL_NE: {
- pgf_item_free(before, after, item);
+ // Nothing can match with a non-existant token
+ pgf_item_free(ps, item);
break;
}
case PGF_SYMBOL_BIND: {
- pgf_item_free(before, after, item);
+ if (ps->before->start_offset == ps->before->end_offset &&
+ ps->before->needs_bind) {
+ PgfParseState* state =
+ pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD);
+ if (state != NULL) {
+ if (state->next == NULL) {
+ state->viterbi_prob =
+ item->inside_prob+item->conts->outside_prob;
+ }
+
+ pgf_item_advance(item, ps->pool);
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
+ } else {
+ pgf_item_free(ps, item);
+ }
+ } else {
+ pgf_item_free(ps, item);
+ }
break;
}
default:
@@ -1357,47 +1585,47 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
}
static void
-pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
+pgf_parsing_item(PgfParsing* ps, PgfItem* item)
{
#ifdef PGF_PARSER_DEBUG
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
- pgf_print_item(item, before, out, err, tmp_pool);
+ pgf_print_item(item, ps->before, out, err, tmp_pool);
gu_pool_free(tmp_pool);
#endif
-
+
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
PgfCncFun* fun = papp->fun;
- PgfSequence* seq = fun->lins[item->conts->lin_idx];
- if (item->seq_idx == gu_seq_length(seq)) {
- pgf_parsing_complete(before, after, item, NULL);
- pgf_item_free(before, after, item);
+ PgfSymbols* syms = fun->lins[item->conts->lin_idx]->syms;
+ if (item->sym_idx == gu_seq_length(syms)) {
+ pgf_parsing_complete(ps, item, NULL);
+ pgf_item_free(ps, item);
} else {
- pgf_parsing_symbol(before, after, item, item->curr_sym);
+ pgf_parsing_symbol(ps, item, item->curr_sym);
}
break;
}
case PGF_PRODUCTION_COERCE: {
PgfProductionCoerce* pcoerce = i.data;
- switch (item->seq_idx) {
+ switch (item->sym_idx) {
case 0:
if (pcoerce->coerce->prods == NULL) {
// empty category
- pgf_item_free(before, after, item);
+ pgf_item_free(ps, item);
return;
}
- pgf_parsing_td_predict(before, after, item,
+ pgf_parsing_td_predict(ps, item,
pcoerce->coerce,
item->conts->lin_idx);
break;
case 1:
- pgf_parsing_complete(before, after, item, NULL);
- pgf_item_free(before, after, item);
+ pgf_parsing_complete(ps, item, NULL);
+ pgf_item_free(ps, item);
break;
default:
gu_impossible();
@@ -1406,85 +1634,56 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = i.data;
-
- PgfSequence* seq;
+
+ PgfSymbols* syms;
if (pext->lins != NULL &&
- (seq = gu_seq_get(pext->lins,PgfSequence*,item->conts->lin_idx)) != NULL) {
- if (item->seq_idx == gu_seq_length(seq)) {
- pgf_parsing_complete(before, after, item, NULL);
- pgf_item_free(before, after, item);
- } else {
+ (syms = gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx)) != NULL) {
+ if (item->sym_idx == gu_seq_length(syms)) {
+ pgf_parsing_complete(ps, item, NULL);
+ pgf_item_free(ps, item);
+ } else {
PgfSymbol sym =
- gu_seq_get(seq, PgfSymbol, item->seq_idx);
- pgf_parsing_symbol(before, after, item, sym);
+ gu_seq_get(syms, PgfSymbol, item->sym_idx);
+ pgf_parsing_symbol(ps, item, sym);
}
} else {
- PgfToken tok = (after != NULL)
- ? after->ts->fn->get_token(after->ts)
- : "";
-
- PgfExprProb *ep = NULL;
- bool accepted =
- pext->callback->match(before->ps->concr, item,
- tok,
- &ep, before->ps->out_pool);
-
- if (ep != NULL)
- pgf_parsing_complete(before, after, item, ep);
-
- if (accepted) {
- if (after != NULL) {
- PgfSymbol prev = item->curr_sym;
- size_t tok_len = strlen(tok);
- PgfSymbolKS* sks = (PgfSymbolKS*)
- gu_alloc_variant(PGF_SYMBOL_KS,
- sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1,
- gu_alignof(PgfSymbolKS),
- &item->curr_sym, after->ps->pool);
- strcpy((char*) sks->token, (char*) tok);
- *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = prev;
-
- item->seq_idx++;
- pgf_parsing_add_transition(before, after, tok, item);
- }
- } else {
- pgf_item_free(before, after, item);
- }
+ pgf_parsing_complete(ps, item, pext->ep);
+ pgf_item_free(ps, item);
}
break;
}
case PGF_PRODUCTION_META: {
- if (item->seq_idx == pgf_item_sequence_length(item)) {
- if (before->meta_item != NULL)
+ if (item->sym_idx == pgf_item_symbols_length(item)) {
+ if (ps->before->meta_item != NULL)
break;
- before->meta_item = item;
+ ps->before->meta_item = item;
- if (after == NULL) {
- PgfExprProb *ep = gu_new(PgfExprProb, before->ps->pool);
- ep->expr = before->ps->meta_var;
+ if (ps->after == NULL) {
+ PgfExprProb *ep = gu_new(PgfExprProb, ps->pool);
+ ep->expr = ps->meta_var;
ep->prob = item->inside_prob;
size_t n_args = gu_seq_length(item->args);
for (size_t i = 0; i < n_args; i++) {
PgfPArg* arg = gu_seq_index(item->args, PgfPArg, i);
ep->prob -= arg->ccat->viterbi_prob;
}
- pgf_parsing_complete(before, after, item, ep);
+ pgf_parsing_complete(ps, item, ep);
} else {
prob_t meta_token_prob =
item->conts->ccat->cnccat->abscat->meta_token_prob;
if (meta_token_prob != INFINITY) {
- pgf_parsing_meta_scan(before, after, item, meta_token_prob);
+ pgf_parsing_meta_scan(ps, item, meta_token_prob);
}
PgfCIdMap* meta_child_probs =
item->conts->ccat->cnccat->abscat->meta_child_probs;
if (meta_child_probs != NULL) {
- PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, before, item };
+ PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, ps, item };
gu_map_iter(meta_child_probs, &clo.fn, NULL);
}
}
} else {
- pgf_parsing_symbol(before, after, item, item->curr_sym);
+ pgf_parsing_symbol(ps, item, item->curr_sym);
}
break;
}
@@ -1493,66 +1692,6 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
}
}
-static bool
-pgf_parsing_proceed(PgfParseState* state)
-{
- prob_t best_prob = INFINITY;
- if (gu_buf_length(state->ps->expr_queue) > 0) {
- best_prob = gu_buf_get(state->ps->expr_queue, PgfExprState*, 0)->ep.prob;
- }
-
- PgfParseState* before = NULL;
-
- prob_t delta_prob = 0;
- PgfParseState* st = state;
- while (st != NULL) {
- if (gu_buf_length(st->agenda) > 0) {
- PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0);
- prob_t item_prob =
- item->inside_prob+item->conts->outside_prob+delta_prob;
- if (item_prob < best_prob) {
- best_prob = item_prob;
- before = st;
- }
- }
-
- prob_t state_delta =
- (st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))*
- state->ps->beam_size;
- //prob_t lexical_prob =
- // st->ts ? st->ts->lexical_prob : 0;
- delta_prob += state_delta; /*fmax(state_delta, lexical_prob)*/; // the calculation of lexical_prob doesn't work properly.
- st = st->next;
- }
-
- if (before == NULL)
- return false;
-
- PgfParseState* after = NULL;
-
- st = state;
- while (st != before) {
- PgfParseState* tmp = st->next;
- st->next = after;
- after = st;
- st = tmp;
- }
-
- PgfItem* item;
- gu_buf_heap_pop(before->agenda, &pgf_item_prob_order, &item);
- pgf_parsing_item(before, after, item);
-
- while (after != NULL) {
- PgfParseState* tmp = after->next;
- after->next = before;
- before = after;
- after = tmp;
- }
- state = before;
-
- return true;
-}
-
static prob_t
pgf_parsing_default_beam_size(PgfConcr* concr)
{
@@ -1567,15 +1706,19 @@ pgf_parsing_default_beam_size(PgfConcr* concr)
}
static PgfParsing*
-pgf_new_parsing(PgfConcr* concr, double heuristics,
+pgf_new_parsing(PgfConcr* concr,
+ GuString sentence, double heuristics,
GuPool* pool, GuPool* out_pool)
{
PgfParsing* ps = gu_new(PgfParsing, pool);
ps->concr = concr;
ps->pool = pool;
ps->out_pool = out_pool;
+ ps->sentence = sentence;
ps->expr_queue = gu_new_buf(PgfExprState*, pool);
ps->max_fid = concr->total_cats;
+ ps->before = NULL;
+ ps->after = NULL;
#ifdef PGF_COUNTS_DEBUG
ps->item_full_count = 0;
ps->item_real_count = 0;
@@ -1602,68 +1745,6 @@ pgf_new_parsing(PgfConcr* concr, double heuristics,
return ps;
}
-static PgfParseState*
-pgf_new_parse_state(PgfParsing* ps,
- PgfParseState* next,
- PgfTokenState* ts,
- GuPool* pool)
-{
- PgfParseState* state = gu_new(PgfParseState, pool);
- state->next = next;
- state->agenda = gu_new_buf(PgfItem*, pool);
- state->meta_item = NULL;
- state->generated_cats = gu_map_type_new(PgfGenCatMap, pool);
- state->conts_map = gu_map_type_new(PgfContsMap, pool);
- state->offset = next ? next->offset+1 : 0;
- state->viterbi_prob = 0;
- state->ps = ps;
- state->ts = ts;
- return state;
-}
-
-typedef struct {
- GuMapItor fn;
- PgfTokenState* ts;
-} PgfLexiconFn;
-
-static void
-pgf_parser_compute_lexicon_prob(GuMapItor* fn, const void* key, void* value, GuExn* err)
-{
- PgfTokenState* ts = ((PgfLexiconFn*) fn)->ts;
- PgfProductionBuf* prods = *((PgfProductionBuf**) value);
-
- if (prods == NULL)
- return;
-
- size_t n_prods = gu_buf_length(prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_buf_get(prods, PgfProduction, i);
-
- GuVariantInfo pi = gu_variant_open(prod);
- switch (pi.tag) {
- case PGF_PRODUCTION_APPLY: {
- PgfProductionApply* papp = pi.data;
- if (ts->lexical_prob > papp->fun->ep->prob) {
- ts->lexical_prob = papp->fun->ep->prob;
- }
- break;
- }
- }
- }
-}
-
-#define pgf_new_token_state(ty, pool) \
- (ty*) pgf_new_token_state_(&pgf_tsfn_##ty, (PgfTokenState*) gu_new(ty, pool))
-
-static PgfTokenState*
-pgf_new_token_state_(PgfTokenFn* fn, PgfTokenState* ts)
-{
- ts->fn = fn;
- ts->lexical_prob = INFINITY;
- return ts;
-}
-
#ifdef PGF_COUNTS_DEBUG
void pgf_parsing_print_counts(PgfParsing* ps)
{
@@ -1676,6 +1757,8 @@ void pgf_parsing_print_counts(PgfParsing* ps)
}
#endif
+/*static bool
+*************
typedef struct {
PgfTokenState ts;
PgfToken tok;
@@ -1744,6 +1827,7 @@ typedef struct {
} PgfPrefixTokenState;
static bool
+^ ^ ^ ^ ^ ^ ^
pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item)
{
PgfPrefixTokenState* ts =
@@ -1754,9 +1838,18 @@ pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item)
PgfSequence* seq;
pgf_item_sequence(item, &lin_idx, &seq, ts->pool);
+ uint16_t seq_idx = item->seq_idx;
+ uint8_t tok_idx = item->tok_idx;
+
+ // go one token back
+ if (tok_idx > 0)
+ tok_idx--;
+ else
+ seq_idx--;
+
ts->tp = gu_new(PgfTokenProb, ts->pool);
ts->tp->tok =
- pgf_get_tokens(seq, item->seq_idx-1, ts->pool);
+ pgf_get_tokens(seq, seq_idx, tok_idx, ts->pool);
ts->tp->cat = item->conts->ccat->cnccat->abscat->name;
ts->tp->prob = item->inside_prob+item->conts->outside_prob;
}
@@ -1794,24 +1887,25 @@ pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool)
}
*((PgfTokenProb**)to) = ts->tp;
-}
+}*/
GuEnum*
-pgf_parser_completions(PgfParseState* prev, GuString prefix)
+pgf_parsing_completions(PgfParsing* ps, GuString prefix)
{
#ifdef PGF_COUNTS_DEBUG
- pgf_parsing_print_counts(prev->ps);
+ pgf_parsing_print_counts(ps);
#endif
- PgfPrefixTokenState* ts =
+/* PgfPrefixTokenState* ts =
pgf_new_token_state(PgfPrefixTokenState, prev->ps->pool);
ts->en.next = pgf_parser_completions_next;
ts->prefix = prefix;
ts->tp = NULL;
ts->state =
- pgf_new_parse_state(prev->ps, prev, &ts->ts, prev->ps->pool);
+ pgf_new_parse_state(prev->ps, prev, &ts->ts);
- return &ts->en;
+ return &ts->en;*/
+ return NULL;
}
static int
@@ -1981,17 +2075,134 @@ pgf_parse_result_is_new(PgfExprState* st)
return true;
}
+// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat
+static PgfParsing*
+pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx,
+ GuString sentence, double heuristics,
+ GuExn* err,
+ GuPool* pool, GuPool* out_pool)
+{
+ PgfCncCat* cnccat =
+ gu_map_get(concr->cnccats, cat, PgfCncCat*);
+ if (!cnccat) {
+ GuExnData* exn = gu_raise(err, PgfExn);
+ exn->data = "Unknown start category";
+ return NULL;
+ }
+
+ gu_assert(lin_idx < cnccat->n_lins);
+
+ if (heuristics < 0) {
+ heuristics = pgf_parsing_default_beam_size(concr);
+ }
+
+ PgfParsing* ps =
+ pgf_new_parsing(concr, sentence, heuristics, pool, out_pool);
+ PgfParseState* state =
+ pgf_new_parse_state(ps, 0, BIND_SOFT);
+
+ size_t n_ccats = gu_seq_length(cnccat->cats);
+ for (size_t i = 0; i < n_ccats; i++) {
+ PgfCCat* ccat = gu_seq_get(cnccat->cats, PgfCCat*, i);
+ if (ccat != NULL) {
+ if (ccat->prods == NULL) {
+ // Empty category
+ continue;
+ }
+
+ PgfItemConts* conts =
+ pgf_parsing_get_conts(state, ccat, lin_idx, ps->pool);
+ gu_buf_push(conts->items, PgfItem*, NULL);
+
+#ifdef PGF_COUNTS_DEBUG
+ ps->cont_full_count++;
+#endif
+
+ size_t n_prods = gu_seq_length(ccat->prods);
+ for (size_t i = 0; i < n_prods; i++) {
+ PgfProduction prod =
+ gu_seq_get(ccat->prods, PgfProduction, i);
+ PgfItem* item =
+ pgf_new_item(ps, conts, prod);
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
+ }
+
+ PgfItem *item =
+ pgf_new_item(ps, conts, ps->meta_prod);
+ item->inside_prob =
+ ccat->cnccat->abscat->meta_prob;
+ gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
+ }
+ }
+
+ return ps;
+}
+
+static bool
+pgf_parsing_proceed(PgfParsing* ps)
+{
+ bool has_progress = false;
+
+ prob_t best_prob = INFINITY;
+ if (gu_buf_length(ps->expr_queue) > 0) {
+ best_prob = gu_buf_get(ps->expr_queue, PgfExprState*, 0)->ep.prob;
+ }
+
+ prob_t delta_prob = 0;
+ PgfParseState* st = ps->before;
+ while (st != NULL) {
+ if (gu_buf_length(st->agenda) > 0) {
+ PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0);
+ prob_t item_prob =
+ item->inside_prob+item->conts->outside_prob+delta_prob;
+ if (item_prob < best_prob) {
+ best_prob = item_prob;
+
+ while (st != ps->before) {
+ PgfParseState* tmp = ps->before->next;
+ ps->before->next = ps->after;
+ ps->after = ps->before;
+ ps->before = tmp;
+ }
+
+ has_progress = true;
+ }
+ }
+
+ prob_t state_delta =
+ (st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))*
+ ps->beam_size;
+ delta_prob += state_delta;
+ st = st->next;
+ }
+
+ if (has_progress) {
+ PgfItem* item;
+ gu_buf_heap_pop(ps->before->agenda, pgf_item_prob_order, &item);
+ pgf_parsing_item(ps, item);
+ }
+
+ while (ps->after != NULL) {
+ PgfParseState* tmp = ps->after->next;
+ ps->after->next = ps->before;
+ ps->before = ps->after;
+ ps->after = tmp;
+ }
+
+ return has_progress;
+}
+
static PgfExprProb*
-pgf_parse_result_next(PgfParseResult* pr)
+pgf_parse_result_next(PgfParsing* ps)
{
for (;;) {
- while (pgf_parsing_proceed(pr->state));
+ while (pgf_parsing_proceed(ps));
- if (gu_buf_length(pr->state->ps->expr_queue) == 0)
+ if (gu_buf_length(ps->expr_queue) == 0)
break;
PgfExprState* st;
- gu_buf_heap_pop(pr->state->ps->expr_queue, &pgf_expr_state_order, &st);
+ gu_buf_heap_pop(ps->expr_queue, &pgf_expr_state_order, &st);
#ifdef PGF_PARSER_DEBUG
#ifdef PGF_RESULT_DEBUG
@@ -2008,16 +2219,16 @@ pgf_parse_result_next(PgfParseResult* pr)
PgfCCat* ccat =
gu_seq_index(st->args, PgfPArg, st->arg_idx)->ccat;
- if (ccat->fid < pr->state->ps->concr->total_cats) {
+ if (ccat->fid < ps->concr->total_cats) {
st->ep.expr =
- gu_new_variant_i(pr->state->ps->out_pool,
+ gu_new_variant_i(ps->out_pool,
PGF_EXPR_APP, PgfExprApp,
.fun = st->ep.expr,
- .arg = pr->state->ps->meta_var);
+ .arg = ps->meta_var);
st->arg_idx++;
- gu_buf_heap_push(pr->state->ps->expr_queue, &pgf_expr_state_order, &st);
+ gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &st);
} else {
- pgf_result_predict(pr->state->ps, st, ccat);
+ pgf_result_predict(ps, st, ccat);
}
} else if (pgf_parse_result_is_new(st)) {
gu_buf_push(st->answers->exprs, PgfExprProb*, &st->ep);
@@ -2030,10 +2241,10 @@ pgf_parse_result_next(PgfParseResult* pr)
return &st->ep;
}
- PgfExprState* st3 = gu_new(PgfExprState, pr->state->ps->pool);
+ PgfExprState* st3 = gu_new(PgfExprState, ps->pool);
st3->answers = st2->answers;
st3->ep.expr =
- gu_new_variant_i(pr->state->ps->out_pool,
+ gu_new_variant_i(ps->out_pool,
PGF_EXPR_APP, PgfExprApp,
.fun = st2->ep.expr,
.arg = st->ep.expr);
@@ -2041,7 +2252,7 @@ pgf_parse_result_next(PgfParseResult* pr)
st3->args = st2->args;
st3->arg_idx = st2->arg_idx+1;
- gu_buf_heap_push(pr->state->ps->expr_queue, &pgf_expr_state_order, &st3);
+ gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &st3);
}
}
}
@@ -2052,181 +2263,111 @@ pgf_parse_result_next(PgfParseResult* pr)
static void
pgf_parse_result_enum_next(GuEnum* self, void* to, GuPool* pool)
{
- PgfParseResult* pr = gu_container(self, PgfParseResult, en);
- *(PgfExprProb**)to = pgf_parse_result_next(pr);
+ PgfParsing* ps = gu_container(self, PgfParsing, en);
+ *(PgfExprProb**)to = pgf_parse_result_next(ps);
}
-PgfExprEnum*
-pgf_parse_result(PgfParseState* state)
-{
-#ifdef PGF_COUNTS_DEBUG
- pgf_parsing_print_counts(state->ps);
-#endif
+static GuString
+pgf_parsing_last_token(PgfParsing* ps, GuPool* pool)
+{
+ if (ps->before == NULL)
+ return "";
- PgfParseResult* res = gu_new(PgfParseResult, state->ps->pool);
- res->state = state;
- res->en.next = pgf_parse_result_enum_next;
- return &res->en;
-}
+ size_t start = ps->before->end_offset;
+ while (start > 0) {
+ char c = ps->sentence[start-1];
+ if (gu_is_space(c))
+ break;
+ start--;
+ }
-void
-pgf_parse_print_chunks(PgfParseState* state)
-{
-/* if (state->ps->completed == NULL) {
- while (state->ps->completed == NULL) {
- if (!pgf_parsing_proceed(state))
- break;
- }
- if (state->ps->completed == NULL)
- return;
+ size_t end = ps->before->end_offset;
+ while (ps->sentence[end] != 0) {
+ char c = ps->sentence[end];
+ if (gu_is_space(c))
+ break;
+ end++;
}
-
- GuPool* tmp_pool = gu_new_pool();
- GuOut* out = gu_file_out(stdout, tmp_pool);
- GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
- GuExn* err = gu_exn(NULL, type, tmp_pool);
- PgfCCat* completed = state->ps->completed;
- if (gu_seq_length(completed->prods) == 0)
- return;
+ char* tok = gu_malloc(pool, end-start+1);
+ memcpy(tok, ps->sentence+start, (end-start));
+ tok[end-start] = 0;
+ return tok;
+}
- size_t n_args = 0;
- size_t arg_idx = 0;
- PgfCCat* ccat = NULL;
- PgfProductionMeta* pmeta = NULL;
+GU_DEFINE_TYPE(PgfParseError, abstract, _);
- PgfProduction prod = gu_seq_get(completed->prods, PgfProduction, 0);
- GuVariantInfo pi = gu_variant_open(prod);
- switch (pi.tag) {
- case PGF_PRODUCTION_APPLY:
- n_args = 1;
- arg_idx = 0;
- ccat = completed;
- break;
- case PGF_PRODUCTION_META:
- pmeta = pi.data;
- n_args = gu_seq_length(pmeta->args);
- arg_idx = 0;
- ccat = gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat;
- break;
- }
+GuEnum*
+pgf_parse(PgfConcr* concr, PgfCId cat, GuString sentence,
+ GuExn* err,
+ GuPool* pool, GuPool* out_pool)
+{
+ return pgf_parse_with_heuristics(concr, cat, sentence, -1.0, err, pool, out_pool);
+}
- PgfParseState* next = NULL;
- while (state != NULL) {
- PgfParseState* tmp = state->next;
- state->next = next;
- next = state;
- state = tmp;
+GuEnum*
+pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, GuString sentence,
+ double heuristics,
+ GuExn* err,
+ GuPool* pool, GuPool* out_pool)
+{
+ // Begin parsing a sentence with the specified category
+ PgfParsing* ps =
+ pgf_parsing_init(concr, cat, 0, sentence, heuristics, err, pool, out_pool);
+ if (ps == NULL) {
+ return NULL;
}
- int offset = 0;
-
- state = next;
- next = NULL;
- while (state != NULL) {
- if (state->ts != NULL)
- {
- if (ccat != NULL &&
- offset == ((ccat->conts->state != NULL) ? ccat->conts->state->offset : 0)) {
- PgfCCat *ccat2 = ccat;
- while (ccat2->conts != NULL) {
- ccat2 = ccat2->conts->ccat;
- }
-
- gu_putc('(', wtr, err);
- gu_string_write(ccat2->cnccat->abscat->name, wtr, err);
- gu_putc(' ', wtr, err);
- }
+#ifdef PGF_COUNTS_DEBUG
+ pgf_parsing_print_counts(ps);
+#endif
- gu_string_write(state->ts->tok, wtr, err);
- offset++;
-
- if (ccat != NULL &&
- ccat ==
- gu_map_get(state->generated_cats, ccat->conts, PgfCCat*)) {
- gu_putc(')', wtr, err);
-
- arg_idx++;
- ccat =
- (arg_idx >= n_args) ?
- NULL :
- gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat;
- }
-
- gu_putc(' ', wtr, err);
+ while (gu_buf_length(ps->expr_queue) == 0) {
+ if (!pgf_parsing_proceed(ps)) {
+ GuExnData* exn = gu_raise(err, PgfParseError);
+ exn->data = (void*) pgf_parsing_last_token(ps, exn->pool);
+ return NULL;
}
- PgfParseState* tmp = state->next;
- state->next = next;
- next = state;
- state = tmp;
+#ifdef PGF_COUNTS_DEBUG
+ pgf_parsing_print_counts(ps);
+#endif
}
- gu_putc('\n', wtr, err);
- gu_pool_free(tmp_pool);*/
+ // Now begin enumerating the resulting syntax trees
+ ps->en.next = pgf_parse_result_enum_next;
+ return &ps->en;
}
-// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat
-PgfParseState*
-pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx,
- double heuristics,
- GuPool* pool, GuPool* out_pool)
+GuEnum*
+pgf_complete(PgfConcr* concr, PgfCId cat, GuString sentence,
+ GuString prefix, GuExn *err, GuPool* pool)
{
- PgfCncCat* cnccat =
- gu_map_get(concr->cnccats, cat, PgfCncCat*);
- if (!cnccat)
+ // Begin parsing a sentence of the specified category
+ PgfParsing* ps =
+ pgf_parsing_init(concr, cat, 0, sentence, -1, err, pool, pool);
+ if (ps == NULL) {
return NULL;
-
- gu_assert(lin_idx < cnccat->n_lins);
-
- if (heuristics < 0) {
- heuristics = pgf_parsing_default_beam_size(concr);
}
- PgfParsing* ps =
- pgf_new_parsing(concr, heuristics, pool, out_pool);
- PgfParseState* state =
- pgf_new_parse_state(ps, NULL, NULL, pool);
-
- size_t n_ccats = gu_seq_length(cnccat->cats);
- for (size_t i = 0; i < n_ccats; i++) {
- PgfCCat* ccat = gu_seq_get(cnccat->cats, PgfCCat*, i);
- if (ccat != NULL) {
- if (ccat->prods == NULL) {
- // Empty category
- continue;
- }
-
- PgfItemConts* conts = gu_new(PgfItemConts, pool);
- conts->ccat = ccat;
- conts->lin_idx = lin_idx;
- conts->state = NULL;
- conts->items = gu_new_buf(PgfItem*, pool);
- conts->outside_prob = 0;
- conts->ref_count = 0;
- gu_buf_push(conts->items, PgfItem*, NULL);
+ // Tokenization
+ GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool);
+/* PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
+ while (!gu_exn_is_raised(lex_err)) {
+ // feed the token to get a new parse state
+ state = pgf_parser_next_state(state, tok);
+ if (state == NULL) {
+ return NULL;
+ }
-#ifdef PGF_COUNTS_DEBUG
- ps->cont_full_count++;
-#endif
+ tok = pgf_lexer_read_token(lexer, lex_err);
+ }*/
- size_t n_prods = gu_seq_length(ccat->prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_seq_get(ccat->prods, PgfProduction, i);
- PgfItem* item =
- pgf_new_item(conts, prod, pool, ps);
- gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
- }
+ if (gu_exn_caught(lex_err) != gu_type(GuEOF))
+ return NULL;
- PgfItem *item =
- pgf_new_item(conts, ps->meta_prod, pool, ps);
- item->inside_prob =
- ccat->cnccat->abscat->meta_prob;
- gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
- }
- }
- return state;
+ // Now begin enumerating the resulting syntax trees
+ return pgf_parsing_completions(ps, prefix);
}
void
@@ -2242,306 +2383,114 @@ pgf_parser_add_literal(PgfConcr *concr, PgfCId cat,
PgfLiteralCallback*, callback);
}
-typedef struct {
- GuMapItor fn;
- PgfTokens* tokens;
- PgfMorphoCallback* callback;
-} PgfMorphoFn;
-
static void
-pgf_morpho_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
-{
- PgfMorphoFn* clo = (PgfMorphoFn*) fn;
- PgfCFCat cfc = *((PgfCFCat*) key);
- PgfProductionBuf* prods = *((PgfProductionBuf**) value);
-
- if (prods == NULL)
- return;
-
- GuString analysis = cfc.ccat->cnccat->labels[cfc.lin_idx];
-
- size_t n_prods = gu_buf_length(prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_buf_get(prods, PgfProduction, i);
-
- GuVariantInfo i = gu_variant_open(prod);
- switch (i.tag) {
- case PGF_PRODUCTION_APPLY: {
- PgfProductionApply* papp = i.data;
-
- if (clo->tokens != NULL) {
- // match the tokens with the production
- size_t pos = 0;
- PgfSequence* seq = papp->fun->lins[cfc.lin_idx];
- size_t len = gu_seq_length(seq);
- for (size_t i = 0; i < len; i++) {
- PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
-
- GuVariantInfo i = gu_variant_open(sym);
- switch (i.tag) {
- case PGF_SYMBOL_KS: {
- PgfSymbolKS* symks = i.data;
-
- if (pos >= gu_seq_length(clo->tokens))
- goto cont;
-
- PgfToken tok1 = symks->token;
- PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++);
-
- if (strcmp(tok1, tok2) != 0)
- goto cont;
- }
- default:
- continue;
- }
- }
-
- if (pos != gu_seq_length(clo->tokens))
- goto cont;
- }
-
- PgfCId lemma = papp->fun->absfun->name;
- prob_t prob = papp->fun->absfun->ep.prob;
- clo->callback->callback(clo->callback,
- lemma, analysis, prob, err);
- }
- }
- cont:;
+pgf_morpho_iter(PgfProductionIdx* idx,
+ PgfMorphoCallback* callback,
+ GuExn* err)
+{
+ size_t n_entries = gu_buf_length(idx);
+ for (size_t i = 0; i < n_entries; i++) {
+ PgfProductionIdxEntry* entry =
+ gu_buf_index(idx, PgfProductionIdxEntry, i);
+
+ PgfCId lemma = entry->papp->fun->absfun->name;
+ GuString analysis = entry->ccat->cnccat->labels[entry->lin_idx];
+ prob_t prob = entry->papp->fun->absfun->ep.prob;
+ callback->callback(callback,
+ lemma, analysis, prob, err);
+ if (!gu_ok(err))
+ return;
}
}
-void
-pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer,
- PgfMorphoCallback* callback, GuExn* err)
+static int
+pgf_sequence_cmp_fn(GuOrder* self, const void* p1, const void* p2)
{
- GuPool* tmp_pool = gu_local_pool();
-
- GuBuf* tokens = gu_new_buf(PgfToken, tmp_pool);
- GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
+ (void) self;
+ GuString sent = (GuString) p1;
+ const PgfSequence* sp2 = p2;
- PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
- if (gu_exn_is_raised(lex_err)) {
- gu_raise(err, PgfExn);
- gu_pool_free(tmp_pool);
- return;
+ BIND_TYPE bind = BIND_HARD;
+ int res = pgf_symbols_cmp(&sent, strlen(sent), &bind, sp2->syms);
+ if (res == 0 && *sent != 0) {
+ res = 1;
}
- PgfProductionIdx* lexicon_idx =
- gu_map_get(concr->leftcorner_tok_idx, tok, PgfProductionIdx*);
- if (lexicon_idx == NULL) {
- gu_pool_free(tmp_pool);
- return;
- }
+ return res;
+}
- do {
- gu_buf_push(tokens, PgfToken, tok);
- tok = pgf_lexer_read_token(lexer, lex_err);
- } while (!gu_exn_is_raised(lex_err));
+static GuOrder pgf_sequence_order[1] = { { pgf_sequence_cmp_fn } };
- PgfMorphoFn clo = { { pgf_morpho_iter }, gu_buf_data_seq(tokens), callback };
- gu_map_iter(lexicon_idx, &clo.fn, err);
+void
+pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
+ PgfMorphoCallback* callback, GuExn* err)
+{
+ PgfSequence* seq = (PgfSequence*)
+ gu_seq_binsearch_(concr->sequences, pgf_sequence_order,
+ sizeof(PgfSequence), 0, (void*) sentence);
- gu_pool_free(tmp_pool);
+ if (seq != NULL && seq->idx != NULL)
+ pgf_morpho_iter(seq->idx, callback, err);
}
typedef struct {
GuEnum en;
- GuEnum* map_en1;
- GuEnum* map_en2;
-
- GuMapItor fn;
- PgfLeftcornerTokIdx* new_idx;
-
- GuPool* pool;
+ PgfSequences* sequences;
+ size_t seq_idx;
} PgfFullFormState;
-static void
-pgf_fullform_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
-{
- PgfFullFormState* st = gu_container(fn, PgfFullFormState, fn);
- PgfCFCat cfc = *((PgfCFCat*) key);
- PgfProductionBuf* prods = *((PgfProductionBuf**) value);
-
- if (prods == NULL)
- return;
-
- size_t n_prods = gu_buf_length(prods);
- for (size_t i = 0; i < n_prods; i++) {
- PgfProduction prod =
- gu_buf_get(prods, PgfProduction, i);
-
- GuVariantInfo i = gu_variant_open(prod);
- switch (i.tag) {
- case PGF_PRODUCTION_APPLY: {
- PgfProductionApply* papp = i.data;
-
- PgfSequence* seq = papp->fun->lins[cfc.lin_idx];
- GuString tokens = pgf_get_tokens(seq, 0, st->pool);
-
- // create a new production index with keys that
- // are multiword units
- PgfProductionIdx* lexicon_idx =
- gu_map_get(st->new_idx, tokens, PgfProductionIdx*);
- if (lexicon_idx == NULL) {
- lexicon_idx = gu_map_type_new(PgfProductionIdx, st->pool);
- gu_map_put(st->new_idx, tokens, PgfProductionIdx*, lexicon_idx);
- }
-
- PgfProductionBuf* prods =
- gu_map_get(lexicon_idx, &cfc, PgfProductionBuf*);
- if (prods == NULL) {
- prods = gu_new_buf(PgfProduction, st->pool);
- gu_map_put(lexicon_idx, &cfc, PgfProductionBuf*, prods);
- }
-
- gu_buf_push(prods, PgfProduction, prod);
- }
- }
- }
-}
+struct PgfFullFormEntry {
+ GuString tokens;
+ PgfProductionIdx* idx;
+};
static void
gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
{
PgfFullFormState* st = gu_container(self, PgfFullFormState, en);
-
- for (;;) {
- if (st->new_idx == NULL) {
- GuMapKeyValue* kv = gu_next(st->map_en1, GuMapKeyValue*, pool);
- if (kv == NULL) {
- *((PgfFullFormEntry**)to) = NULL;
- return;
- }
-
- PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) kv->value);
-
- // we have an index by the first token but we must re-index
- // by taking into account the multiword units
- st->pool = pool;
- st->new_idx = gu_map_type_new(PgfLeftcornerTokIdx, pool);
- st->fn.fn = pgf_fullform_iter;
- gu_map_iter(lexicon_idx, &st->fn, NULL);
-
- st->map_en2 = gu_map_enum(st->new_idx, pool);
- }
- PgfFullFormEntry* entry =
- gu_next(st->map_en2, PgfFullFormEntry*, pool);
- if (entry != NULL) {
- *((PgfFullFormEntry**)to) = entry;
+ PgfFullFormEntry* entry = NULL;
+
+ size_t n_seqs = gu_seq_length(st->sequences);
+ while (st->seq_idx < n_seqs) {
+ PgfSymbols* syms = gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->syms;
+ GuString tokens = pgf_get_tokens(syms, 0, pool);
+ if (strlen(tokens) > 0 &&
+ gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->idx != NULL) {
+ entry = gu_new(PgfFullFormEntry, pool);
+ entry->tokens = tokens;
+ entry->idx = gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->idx;
+
+ st->seq_idx++;
break;
}
-
- st->new_idx = NULL;
+
+ st->seq_idx++;
}
+
+ *((PgfFullFormEntry**) to) = entry;
}
GuEnum*
pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool)
{
PgfFullFormState* st = gu_new(PgfFullFormState, pool);
- st->en.next = gu_fullform_enum_next;
- st->map_en1 = gu_map_enum(concr->leftcorner_tok_idx, pool);
- st->map_en2 = NULL;
- st->new_idx = NULL;
- st->pool = NULL;
+ st->en.next = gu_fullform_enum_next;
+ st->sequences = concr->sequences;
+ st->seq_idx = 0;
return &st->en;
}
GuString
pgf_fullform_get_string(PgfFullFormEntry* entry)
{
- return (GuString) entry->key;
+ return entry->tokens;
}
void
pgf_fullform_get_analyses(PgfFullFormEntry* entry,
PgfMorphoCallback* callback, GuExn* err)
{
- PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) entry->value);
- PgfMorphoFn clo = { { pgf_morpho_iter }, NULL, callback };
- gu_map_iter(lexicon_idx, &clo.fn, err);
-}
-
-static void
-pgf_parser_index_token(PgfConcr* concr,
- PgfToken tok,
- PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
- GuPool *pool)
-{
- PgfProductionIdx* set =
- gu_map_get(concr->leftcorner_tok_idx, tok, PgfProductionIdx*);
- if (set == NULL) {
- set = gu_map_type_new(PgfProductionIdx, pool);
- gu_map_put(concr->leftcorner_tok_idx, tok, PgfProductionIdx*, set);
- }
-
- PgfCFCat cfc = {ccat, lin_idx};
- PgfProductionBuf* prods = gu_map_get(set, &cfc, PgfProductionBuf*);
- if (prods == NULL) {
- prods = gu_new_buf(PgfProduction, pool);
- gu_map_put(set, &cfc, PgfProductionBuf*, prods);
- }
-
- gu_buf_push(prods, PgfProduction, prod);
-}
-
-static void
-pgf_parser_index_epsilon(PgfConcr* concr,
- PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
- GuPool *pool)
-{
- PgfCFCat cfc = {ccat, lin_idx};
- PgfProductionBuf* prods =
- gu_map_get(concr->epsilon_idx, &cfc, PgfProductionBuf*);
- if (prods == NULL) {
- prods = gu_new_buf(PgfProduction, pool);
- gu_map_put(concr->epsilon_idx, &cfc, PgfProductionBuf*, prods);
- }
-
- gu_buf_push(prods, PgfProduction, prod);
-}
-
-static void
-pgf_parser_index_symbol(PgfConcr* concr, PgfSymbol sym,
- PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
- GuPool *pool)
-{
- GuVariantInfo i = gu_variant_open(sym);
- switch (i.tag) {
- case PGF_SYMBOL_KS: {
- PgfSymbolKS* sks = i.data;
- pgf_parser_index_token(concr,
- sks->token,
- ccat, lin_idx, prod,
- pool);
- break;
- }
- case PGF_SYMBOL_KP: {
- PgfSymbolKP* skp = i.data;
- PgfSymbol sym =
- gu_seq_get(skp->default_form, PgfSymbol, 0);
- pgf_parser_index_symbol(concr, sym,
- ccat, lin_idx, prod,
- pool);
- for (size_t i = 0; i < skp->n_forms; i++) {
- sym = gu_seq_get(skp->forms[i].form, PgfSymbol, 0);
- pgf_parser_index_symbol(concr, sym,
- ccat, lin_idx, prod,
- pool);
- }
- break;
- }
- case PGF_SYMBOL_CAT:
- case PGF_SYMBOL_LIT:
- case PGF_SYMBOL_NE:
- case PGF_SYMBOL_BIND:
- case PGF_SYMBOL_VAR:
- // Nothing to be done here
- break;
- default:
- gu_impossible();
- }
+ pgf_morpho_iter(entry->idx, callback, err);
}
void
@@ -2559,15 +2508,14 @@ pgf_parser_index(PgfConcr* concr,
break;
PgfSequence* seq = papp->fun->lins[lin_idx];
- if (gu_seq_length(seq) > 0) {
- pgf_parser_index_symbol(concr, gu_seq_get(seq, PgfSymbol, 0),
- ccat, lin_idx, prod,
- pool);
- } else {
- pgf_parser_index_epsilon(concr,
- ccat, lin_idx, prod,
- pool);
+ if (seq->idx == NULL) {
+ seq->idx = gu_new_buf(PgfProductionIdxEntry, pool);
}
+
+ PgfProductionIdxEntry* entry = gu_buf_extend(seq->idx);
+ entry->ccat = ccat;
+ entry->lin_idx = lin_idx;
+ entry->papp = papp;
}
break;
case PGF_PRODUCTION_COERCE:
@@ -2631,24 +2579,3 @@ pgf_ccat_set_viterbi_prob(PgfCCat* ccat) {
return ccat->viterbi_prob;
}
-
-static bool
-pgf_cfcat_eq_fn(GuEquality* self, const void* a, const void* b)
-{
- PgfCFCat *x = (PgfCFCat *) a;
- PgfCFCat *y = (PgfCFCat *) b;
-
- return (x->ccat->fid == y->ccat->fid && x->lin_idx == y->lin_idx);
-}
-
-static GuHash
-pgf_cfcat_hash_fn(GuHasher* self, const void* a)
-{
- PgfCFCat *x = (PgfCFCat *) a;
- return ((x->ccat->fid << 16) ^ x->lin_idx);
-}
-
-GuHasher pgf_cfcat_hasher = {
- { pgf_cfcat_eq_fn },
- pgf_cfcat_hash_fn
-};
diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h
index 5e88b0742..93addbf7f 100644
--- a/src/runtime/c/pgf/parser.h
+++ b/src/runtime/c/pgf/parser.h
@@ -5,119 +5,14 @@
#include <pgf/data.h>
#include <pgf/expr.h>
-/// Parsing
-/** @file
- *
- * @todo Querying the parser for expected continuations
- *
- * @todo Literals and custom categories
- *
- * @todo HOAS, dependent types...
- */
-
-typedef struct PgfParseState PgfParseState;
-
-/** @}
- *
- * @name Parsing a sentence
- *
- * The progress of parsing is controlled by the client code. Firstly, the
- * parsing of a sentence is initiated with #pgf_parser_parse. This returns an
- * initial #PgfParse object, which represents the state of the parsing. A new
- * parse state is obtained by feeding a token with #pgf_parse_token. The old
- * parse state is unaffected by this, so backtracking - and even branching -
- * can be accomplished by retaining the earlier #PgfParse objects.
- *
- * @{
- */
-
-/// Begin parsing
-PgfParseState*
-pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx,
- double heuristics,
- GuPool* pool, GuPool* out_pool);
-/**<
- * @param parser The parser to use
- *
- * @param cat The identifier of the abstract category to parse
- *
- * @param lin_idx The index of the field of the concrete category to parse
- *
- * @pool
- *
- * @return An initial parsing state.
-*/
-
-
-/// Feed a token to the parser
-PgfParseState*
-pgf_parser_next_state(PgfParseState* prev, PgfToken tok);
-/**<
- * @param parse The current parse state
- *
- * @param tok The token to feed
- *
- * @pool
- *
- * @return A new parse state obtained by feeding \p tok as an input to \p
- * parse, or \c NULL if the token was unexpected.
- *
- * @note The new parse state partially depends on the old one, so it doesn't
- * make sense to use a \p pool argument with a longer lifetime than that of
- * the pool used to create \parse.
- */
-
-GuEnum*
-pgf_parser_completions(PgfParseState* prev, GuString prefix);
-
void
-pgf_parser_add_literal(PgfConcr *concr, PgfCId cat,
- PgfLiteralCallback* callback);
-
-/** @}
- * @name Retrieving abstract syntax trees
- *
- * After the desired tokens have been fed to the parser, the resulting parse
- * state can be queried for completed results. The #pgf_parse_result function
- * returns an enumeration (#GuEnum) of possible abstract syntax trees whose
- * linearization is the sequence of tokens fed so far.
- *
- * @{
- */
+pgf_add_extern_tok(PgfSymbol* psym, PgfToken tok, GuPool* pool);
-/// Retrieve the current parses from the parse state.
-PgfExprEnum*
-pgf_parse_result(PgfParseState* state);
-/**<
- * @param parse A parse state
- *
- * @pool
- *
- * @return An enumeration of #PgfExpr elements representing the abstract
- * syntax trees that would linearize to the sequence of tokens fed to produce
- * \p parse. The enumeration may yield zero, one or more abstract syntax
- * trees, depending on whether the parse was unsuccesful, unambiguously
- * succesful, or ambiguously successful.
- */
-
-// Use this procedure only on your own risk.
-// It is dirty and it will probably be removed or replaced
-// with something else. Currently it is here only for experimental
-// purposes.
void
-pgf_parse_print_chunks(PgfParseState* state);
-
-size_t
-pgf_item_lin_idx(PgfItem* item);
+pgf_add_extern_cat(PgfSymbol* psym, int d, int r, GuPool* pool);
void
-pgf_item_sequence(PgfItem* item,
- size_t* lin_idx, PgfSequence** seq,
- GuPool* pool);
-
-int
-pgf_item_sequence_length(PgfItem* item);
-
-/** @} */
+pgf_parser_add_literal(PgfConcr *concr, PgfCId cat,
+ PgfLiteralCallback* callback);
#endif // PGF_PARSER_H_
diff --git a/src/runtime/c/pgf/parseval.c b/src/runtime/c/pgf/parseval.c
index ebd52d4ce..4feadbf07 100644
--- a/src/runtime/c/pgf/parseval.c
+++ b/src/runtime/c/pgf/parseval.c
@@ -1,8 +1,9 @@
#include <pgf/pgf.h>
+#include <pgf/data.h>
#include <pgf/linearizer.h>
#include <pgf/parser.h>
-typedef struct {
+/*typedef struct {
int start, end;
PgfCId cat;
int lin_idx;
@@ -123,24 +124,50 @@ pgf_metrics_lzn_end_phrase2(PgfLinFuncs** funcs, PgfCId cat, int fid, int lin_id
}
static PgfLinFuncs pgf_metrics_lin_funcs1 = {
+v v v v v v v
+ .symbol_tokens = pgf_metrics_lzn_symbol_tokens,
+ .expr_literal = pgf_metrics_lzn_expr_literal,
+ .begin_phrase = pgf_metrics_lzn_begin_phrase,
+ .end_phrase = pgf_metrics_lzn_end_phrase1
+=============
.symbol_token = pgf_metrics_lzn_symbol_token,
.expr_literal = pgf_metrics_lzn_expr_literal,
.begin_phrase = pgf_metrics_lzn_begin_phrase,
.end_phrase = pgf_metrics_lzn_end_phrase1
+*************
+ .symbol_tokens = pgf_metrics_lzn_symbol_tokens,
+ .expr_literal = pgf_metrics_lzn_expr_literal,
+ .begin_phrase = pgf_metrics_lzn_begin_phrase,
+ .end_phrase = pgf_metrics_lzn_end_phrase1,
+ .symbol_glue = NULL
+^ ^ ^ ^ ^ ^ ^
};
static PgfLinFuncs pgf_metrics_lin_funcs2 = {
+v v v v v v v
+ .symbol_tokens = pgf_metrics_lzn_symbol_tokens,
+ .expr_literal = pgf_metrics_lzn_expr_literal,
+ .begin_phrase = pgf_metrics_lzn_begin_phrase,
+ .end_phrase = pgf_metrics_lzn_end_phrase2
+=============
.symbol_token = pgf_metrics_lzn_symbol_token,
.expr_literal = pgf_metrics_lzn_expr_literal,
.begin_phrase = pgf_metrics_lzn_begin_phrase,
.end_phrase = pgf_metrics_lzn_end_phrase2
+*************
+ .symbol_tokens = pgf_metrics_lzn_symbol_tokens,
+ .expr_literal = pgf_metrics_lzn_expr_literal,
+ .begin_phrase = pgf_metrics_lzn_begin_phrase,
+ .end_phrase = pgf_metrics_lzn_end_phrase2,
+ .symbol_glue = NULL
+^ ^ ^ ^ ^ ^ ^
};
-
+*/
bool
pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat,
double *precision, double *recall, double *exact)
{
- GuPool* pool = gu_new_pool();
+/* GuPool* pool = gu_new_pool();
GuEnum* en_lins1 =
pgf_lzr_concretize(concr, expr, pool);
@@ -190,6 +217,6 @@ pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat,
*precision = ((double) state.matches)/((double) state.found);
*recall = ((double) state.matches)/((double) gu_buf_length(state.phrases));
*exact = pgf_expr_eq(expr, ep->expr) ? 1 : 0;
-
+*/
return true;
}
diff --git a/src/runtime/c/pgf/pgf.c b/src/runtime/c/pgf/pgf.c
index a1eb62c95..39b032adb 100644
--- a/src/runtime/c/pgf/pgf.c
+++ b/src/runtime/c/pgf/pgf.c
@@ -4,7 +4,6 @@
#include <pgf/reader.h>
#include <pgf/linearizer.h>
#include <pgf/parser.h>
-#include <pgf/lexer.h>
#include <gu/file.h>
#include <gu/string.h>
#include <gu/enum.h>
@@ -61,7 +60,8 @@ pgf_load_meta_child_probs(PgfPGF* pgf, const char* fpath,
PgfAbsCat* abscat1 =
gu_map_get(pgf->abstract.cats, cat1, PgfAbsCat*);
if (abscat1 == NULL) {
- gu_raise(err, PgfExn);
+ GuExnData* exn = gu_raise(err, PgfExn);
+ exn->data = "Unknown category name";
goto close;
}
@@ -73,6 +73,8 @@ pgf_load_meta_child_probs(PgfPGF* pgf, const char* fpath,
PgfAbsCat* abscat2 = gu_map_get(pgf->abstract.cats, cat2, PgfAbsCat*);
if (abscat2 == NULL) {
gu_raise(err, PgfExn);
+ GuExnData* exn = gu_raise(err, PgfExn);
+ exn->data = "Unknown category name";
goto close;
}
@@ -228,73 +230,3 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuOut* out, GuExn* err)
gu_pool_free(tmp_pool);
}
-
-GuEnum*
-pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
- GuPool* pool, GuPool* out_pool)
-{
- return pgf_parse_with_heuristics(concr, cat, lexer, -1.0, pool, out_pool);
-}
-
-GuEnum*
-pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
- double heuristics,
- GuPool* pool, GuPool* out_pool)
-{
- // Begin parsing a sentence of the specified category
- PgfParseState* state =
- pgf_parser_init_state(concr, cat, 0, heuristics, pool, out_pool);
- if (state == NULL) {
- return NULL;
- }
-
- // Tokenization
- GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool);
- PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
- while (!gu_exn_is_raised(lex_err)) {
- // feed the token to get a new parse state
- state = pgf_parser_next_state(state, tok);
- if (state == NULL) {
- return NULL;
- }
-
- tok = pgf_lexer_read_token(lexer, lex_err);
- }
-
- if (gu_exn_caught(lex_err) != gu_type(GuEOF))
- return NULL;
-
- // Now begin enumerating the resulting syntax trees
- return pgf_parse_result(state);
-}
-
-GuEnum*
-pgf_complete(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
- GuString prefix, GuPool* pool)
-{
- // Begin parsing a sentence of the specified category
- PgfParseState* state =
- pgf_parser_init_state(concr, cat, 0, -1, pool, pool);
- if (state == NULL) {
- return NULL;
- }
-
- // Tokenization
- GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool);
- PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
- while (!gu_exn_is_raised(lex_err)) {
- // feed the token to get a new parse state
- state = pgf_parser_next_state(state, tok);
- if (state == NULL) {
- return NULL;
- }
-
- tok = pgf_lexer_read_token(lexer, lex_err);
- }
-
- if (gu_exn_caught(lex_err) != gu_type(GuEOF))
- return NULL;
-
- // Now begin enumerating the resulting syntax trees
- return pgf_parser_completions(state, prefix);
-}
diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h
index 9c95f6f90..b586eb7b0 100644
--- a/src/runtime/c/pgf/pgf.h
+++ b/src/runtime/c/pgf/pgf.h
@@ -37,6 +37,7 @@ extern GU_DECLARE_TYPE(PgfCId, typedef);
extern GU_DECLARE_TYPE(PgfExn, abstract);
+extern GU_DECLARE_TYPE(PgfParseError, abstract);
/// @name PGF Grammar objects
/// @{
@@ -50,7 +51,6 @@ typedef struct PgfConcr PgfConcr;
*/
#include <pgf/expr.h>
-#include <pgf/lexer.h>
#include <pgf/graphviz.h>
/// An enumeration of #PgfExpr elements.
@@ -120,8 +120,16 @@ pgf_print_name(PgfConcr*, PgfCId id);
void
pgf_linearize(PgfConcr* concr, PgfExpr expr, GuOut* out, GuExn* err);
+bool
+pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat,
+ double *precision, double *recall, double *exact);
+
+PgfExprEnum*
+pgf_generate_all(PgfPGF* pgf, PgfCId cat, GuPool* pool);
+
PgfExprEnum*
-pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
+pgf_parse(PgfConcr* concr, PgfCId cat, GuString sentence,
+ GuExn* err,
GuPool* pool, GuPool* out_pool);
typedef struct PgfMorphoCallback PgfMorphoCallback;
@@ -132,10 +140,10 @@ struct PgfMorphoCallback {
};
void
-pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer,
+pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
PgfMorphoCallback* callback, GuExn* err);
-typedef GuMapKeyValue PgfFullFormEntry;
+typedef struct PgfFullFormEntry PgfFullFormEntry;
GuEnum*
pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool);
@@ -148,20 +156,14 @@ pgf_fullform_get_analyses(PgfFullFormEntry* entry,
PgfMorphoCallback* callback, GuExn* err);
PgfExprEnum*
-pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
- double heuristics,
+pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat,
+ GuString sentence, double heuristics,
+ GuExn* err,
GuPool* pool, GuPool* out_pool);
GuEnum*
-pgf_complete(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
- GuString prefix, GuPool* pool);
-
-bool
-pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat,
- double *precision, double *recall, double *exact);
-
-PgfExprEnum*
-pgf_generate_all(PgfPGF* pgf, PgfCId cat, GuPool* pool);
+pgf_complete(PgfConcr* concr, PgfCId cat, GuString string,
+ GuString prefix, GuExn* err, GuPool* pool);
/// @}
diff --git a/src/runtime/c/pgf/printer.c b/src/runtime/c/pgf/printer.c
index 1b2775b94..9ce74d495 100644
--- a/src/runtime/c/pgf/printer.c
+++ b/src/runtime/c/pgf/printer.c
@@ -169,21 +169,14 @@ pgf_print_cncfun(PgfCncFun *cncfun, PgfSequences* sequences,
GuOut *out, GuExn *err)
{
gu_printf(out,err," F%d := (", cncfun->funid);
-
- size_t n_seqs = gu_seq_length(sequences);
-
+
for (size_t i = 0; i < cncfun->n_lins; i++) {
if (i > 0) gu_putc(',', out, err);
- PgfSequence* seq = cncfun->lins[i];
- for (size_t seqid = 0; seqid < n_seqs; seqid++) {
- if (gu_seq_data(gu_seq_get(sequences, PgfSequence*, seqid)) == gu_seq_data(seq)) {
- gu_printf(out,err,"S%d", seqid);
- break;
- }
- }
+ PgfSequence* seq = cncfun->lins[i];
+ gu_printf(out,err,"S%d", (seq - ((PgfSequence*) gu_seq_data(sequences))));
}
-
+
gu_puts(")", out, err);
if (cncfun->absfun != NULL) {
@@ -204,7 +197,7 @@ pgf_print_token(PgfToken tok, GuOut *out, GuExn *err)
}
static void
-pgf_print_sequence(PgfSequence* seq, GuOut *out, GuExn *err);
+pgf_print_symbols(PgfSymbols* syms, GuOut *out, GuExn *err);
void
pgf_print_symbol(PgfSymbol sym, GuOut *out, GuExn *err)
@@ -224,11 +217,11 @@ pgf_print_symbol(PgfSymbol sym, GuOut *out, GuExn *err)
PgfSymbolKP* skp = gu_variant_data(sym);
gu_puts("pre {", out, err);
- pgf_print_sequence(skp->default_form, out, err);
+ pgf_print_symbols(skp->default_form, out, err);
for (size_t i = 0; i < skp->n_forms; i++) {
gu_puts("; ", out, err);
- pgf_print_sequence(skp->forms[i].form, out, err);
+ pgf_print_symbols(skp->forms[i].form, out, err);
gu_puts(" / ", out, err);
size_t n_prefixes = gu_seq_length(skp->forms[i].prefixes);
@@ -269,13 +262,13 @@ pgf_print_symbol(PgfSymbol sym, GuOut *out, GuExn *err)
}
static void
-pgf_print_sequence(PgfSequence* seq, GuOut *out, GuExn *err)
+pgf_print_symbols(PgfSymbols* syms, GuOut *out, GuExn *err)
{
- int n_syms = gu_seq_length(seq);
+ int n_syms = gu_seq_length(syms);
for (int i = 0; i < n_syms; i++) {
if (i > 0) gu_putc(' ', out, err);
- PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
+ PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i);
pgf_print_symbol(sym, out, err);
}
}
@@ -338,10 +331,9 @@ pgf_print_concrete(PgfCId cncname, PgfConcr* concr,
gu_puts(" sequences\n", out, err);
size_t n_seqs = gu_seq_length(concr->sequences);
for (size_t i = 0; i < n_seqs; i++) {
- PgfSequence* seq = gu_seq_get(concr->sequences, PgfSequence*, i);
-
gu_printf(out,err," S%d := ", i);
- pgf_print_sequence(seq, out, err);
+ PgfSymbols* syms = gu_seq_index(concr->sequences, PgfSequence, i)->syms;
+ pgf_print_symbols(syms, out, err);
gu_putc('\n', out, err);
}
diff --git a/src/runtime/c/pgf/reader.c b/src/runtime/c/pgf/reader.c
index 9f08662e3..1e29c119c 100644
--- a/src/runtime/c/pgf/reader.c
+++ b/src/runtime/c/pgf/reader.c
@@ -606,13 +606,13 @@ pgf_read_printnames(PgfReader* rdr)
return printnames;
}
-static PgfSequence*
-pgf_read_sequence(PgfReader* rdr);
+static PgfSymbols*
+pgf_read_symbols(PgfReader* rdr);
static void
pgf_read_alternative(PgfReader* rdr, PgfAlternative* alt)
{
- alt->form = pgf_read_sequence(rdr);
+ alt->form = pgf_read_symbols(rdr);
gu_return_on_exn(rdr->err,);
size_t n_prefixes = pgf_read_len(rdr);
@@ -692,7 +692,7 @@ pgf_read_symbol(PgfReader* rdr)
break;
}
case PGF_SYMBOL_KP: {
- PgfSequence* default_form = pgf_read_sequence(rdr);
+ PgfSymbols* default_form = pgf_read_symbols(rdr);
gu_return_on_exn(rdr->err, gu_null_variant);
size_t n_forms = pgf_read_len(rdr);
@@ -732,21 +732,21 @@ pgf_read_symbol(PgfReader* rdr)
return sym;
}
-static PgfSequence*
-pgf_read_sequence(PgfReader* rdr)
+static PgfSymbols*
+pgf_read_symbols(PgfReader* rdr)
{
size_t len = pgf_read_len(rdr);
gu_return_on_exn(rdr->err, NULL);
- PgfSequence* seq = gu_new_seq(PgfSymbol, len, rdr->opool);
+ PgfSymbols* syms = gu_new_seq(PgfSymbol, len, rdr->opool);
for (size_t i = 0; i < len; i++) {
PgfSymbol sym = pgf_read_symbol(rdr);
gu_return_on_exn(rdr->err, NULL);
- gu_seq_set(seq, PgfSymbol, i, sym);
+ gu_seq_set(syms, PgfSymbol, i, sym);
}
- return seq;
+ return syms;
}
static PgfSequences*
@@ -755,12 +755,14 @@ pgf_read_sequences(PgfReader* rdr)
size_t len = pgf_read_len(rdr);
gu_return_on_exn(rdr->err, NULL);
- PgfSequences* seqs = gu_new_seq(PgfSequence*, len, rdr->opool);
+ PgfSequences* seqs = gu_new_seq(PgfSequence, len, rdr->opool);
for (size_t i = 0; i < len; i++) {
- PgfSequence* seq = pgf_read_sequence(rdr);
+ PgfSymbols* syms = pgf_read_symbols(rdr);
gu_return_on_exn(rdr->err, NULL);
- gu_seq_set(seqs, PgfSequence*, i, seq);
+
+ gu_seq_index(seqs, PgfSequence, i)->syms = syms;
+ gu_seq_index(seqs, PgfSequence, i)->idx = NULL;
}
return seqs;
@@ -793,7 +795,7 @@ pgf_read_cncfun(PgfReader* rdr, PgfAbstr* abstr, PgfConcr* concr, int funid)
return NULL;
}
- cncfun->lins[i] = gu_seq_get(concr->sequences, PgfSequence*, seqid);
+ cncfun->lins[i] = gu_seq_index(concr->sequences, PgfSequence, seqid);
}
return cncfun;
@@ -1149,8 +1151,6 @@ pgf_read_concrete(PgfReader* rdr, PgfAbstr* abstr)
gu_new_int_map(PgfCCat*, &gu_null_struct, rdr->opool);
concr->fun_indices = gu_map_type_new(PgfCncFunOverloadMap, rdr->opool);
concr->coerce_idx = gu_map_type_new(PgfCncOverloadMap, rdr->opool);
- concr->epsilon_idx = gu_map_type_new(PgfProductionIdx, rdr->opool);
- concr->leftcorner_tok_idx = gu_map_type_new(PgfLeftcornerTokIdx,rdr->opool);
pgf_read_lindefs(rdr, concr);
pgf_read_ccats(rdr, concr);
concr->cnccats = pgf_read_cnccats(rdr, abstr, concr);
@@ -1200,7 +1200,7 @@ pgf_read_pgf(PgfReader* rdr) {
pgf_read_abstract(rdr, &pgf->abstract);
gu_return_on_exn(rdr->err, NULL);
-
+
pgf->concretes = pgf_read_concretes(rdr, &pgf->abstract);
gu_return_on_exn(rdr->err, NULL);
diff --git a/src/runtime/c/utils/pgf-parse.c b/src/runtime/c/utils/pgf-parse.c
index 2790e49de..cb5fc6c7e 100644
--- a/src/runtime/c/utils/pgf-parse.c
+++ b/src/runtime/c/utils/pgf-parse.c
@@ -110,12 +110,11 @@ int main(int argc, char* argv[]) {
clock_t start = clock();
- GuIn *in = gu_string_in(line, ppool);
- PgfLexer *lexer = pgf_new_simple_lexer(in, ppool);
- GuEnum* result = pgf_parse_with_heuristics(concr, cat, lexer, heuristics, ppool, ppool);
+ GuExn* parse_err = gu_new_exn(NULL, gu_kind(type), ppool);
+ GuEnum* result = pgf_parse_with_heuristics(concr, cat, line, heuristics, parse_err, ppool, ppool);
PgfExprProb* ep = NULL;
- if (result != NULL)
+ if (gu_ok(parse_err))
ep = gu_next(result, PgfExprProb*, ppool);
clock_t end = clock();
diff --git a/src/runtime/c/utils/pgf-translate.c b/src/runtime/c/utils/pgf-translate.c
index aa98f1a7a..abd22267c 100644
--- a/src/runtime/c/utils/pgf-translate.c
+++ b/src/runtime/c/utils/pgf-translate.c
@@ -2,6 +2,7 @@
#include <gu/map.h>
#include <gu/enum.h>
#include <gu/file.h>
+#include <gu/exn.h>
#include <pgf/pgf.h>
#include <pgf/parser.h>
#include <pgf/literals.h>
@@ -153,23 +154,19 @@ int main(int argc, char* argv[]) {
// sentence, so our memory usage doesn't increase over time.
ppool = gu_new_pool();
- GuIn *in =
- gu_string_in(line, ppool);
- PgfLexer *lexer =
- pgf_new_simple_lexer(in, ppool);
-
clock_t start = clock();
+ GuExn* parse_err = gu_new_exn(NULL, gu_kind(type), ppool);
result =
- pgf_parse(from_concr, cat, lexer, ppool, ppool);
- if (result == NULL) {
- PgfToken tok =
- pgf_lexer_current_token(lexer);
-
- if (*tok == 0)
- gu_puts("Couldn't begin parsing", out, err);
- else {
+ pgf_parse(from_concr, cat, line, parse_err, ppool, ppool);
+ if (!gu_ok(parse_err)) {
+ if (gu_exn_caught(parse_err) == gu_type(PgfExn)) {
+ GuString msg = gu_exn_caught_data(parse_err);
+ gu_string_write(msg, out, err);
+ gu_putc('\n', out, err);
+ } else if (gu_exn_caught(parse_err) == gu_type(PgfParseError)) {
gu_puts("Unexpected token: \"", out, err);
+ GuString tok = gu_exn_caught_data(parse_err);
gu_string_write(tok, out, err);
gu_puts("\"\n", out, err);
}