summaryrefslogtreecommitdiff
path: root/src/runtime
diff options
context:
space:
mode:
authorkr.angelov <kr.angelov@gmail.com>2014-12-29 10:59:20 +0000
committerkr.angelov <kr.angelov@gmail.com>2014-12-29 10:59:20 +0000
commit3bd40dbab68c8354d8cfceb6dad32d24b13bc723 (patch)
treeb313cba147c811a720b1f17d5a7c1277fab68a72 /src/runtime
parent8fd24c3839e7d171e0c4170ae17b26c7ff5aec1b (diff)
API for word alignment in the C runtime and in the Haskell binding
Diffstat (limited to 'src/runtime')
-rw-r--r--src/runtime/c/Makefile.am1
-rw-r--r--src/runtime/c/gu/string.c6
-rw-r--r--src/runtime/c/gu/string.h3
-rw-r--r--src/runtime/c/pgf/aligner.c214
-rw-r--r--src/runtime/c/pgf/linearizer.c4
-rw-r--r--src/runtime/c/pgf/pgf.h10
-rw-r--r--src/runtime/haskell-bind/PGF2.hsc29
-rw-r--r--src/runtime/haskell-bind/PGF2/FFI.hs4
8 files changed, 269 insertions, 2 deletions
diff --git a/src/runtime/c/Makefile.am b/src/runtime/c/Makefile.am
index 4129c6157..726f00080 100644
--- a/src/runtime/c/Makefile.am
+++ b/src/runtime/c/Makefile.am
@@ -77,6 +77,7 @@ libpgf_la_SOURCES = \
pgf/hopu.c \
pgf/printer.c \
pgf/graphviz.c \
+ pgf/aligner.c \
pgf/pgf.c \
pgf/pgf.h \
libpgf_la_LDFLAGS = "-no-undefined"
diff --git a/src/runtime/c/gu/string.c b/src/runtime/c/gu/string.c
index d380fca49..0947cf9e0 100644
--- a/src/runtime/c/gu/string.c
+++ b/src/runtime/c/gu/string.c
@@ -82,6 +82,12 @@ gu_string_buf_freeze(GuStringBuf* sb, GuPool* pool)
return p;
}
+// Flush the buffer that backs the string buffer `sb`.
+// This simply forwards to gu_buf_flush() on sb->buf.
+void
+gu_string_buf_flush(GuStringBuf* sb)
+{
+ gu_buf_flush(sb->buf);
+}
+
GuIn*
gu_string_in(GuString s, GuPool* pool)
{
diff --git a/src/runtime/c/gu/string.h b/src/runtime/c/gu/string.h
index 111050606..e4729239c 100644
--- a/src/runtime/c/gu/string.h
+++ b/src/runtime/c/gu/string.h
@@ -33,6 +33,9 @@ gu_string_buf_out(GuStringBuf* sb);
GuString
gu_string_buf_freeze(GuStringBuf* sb, GuPool* pool);
+// Flush the buffer that backs the string buffer `sb`
+// (the definition forwards to gu_buf_flush on sb->buf).
+void
+gu_string_buf_flush(GuStringBuf* sb);
+
GuString
gu_format_string_v(const char* fmt, va_list args, GuPool* pool);
diff --git a/src/runtime/c/pgf/aligner.c b/src/runtime/c/pgf/aligner.c
new file mode 100644
index 000000000..a3eb4e2c0
--- /dev/null
+++ b/src/runtime/c/pgf/aligner.c
@@ -0,0 +1,214 @@
+#include "data.h"
+#include "linearizer.h"
+#include "pgf.h"
+#include <gu/utf8.h>
+
+// State of one word-alignment run. The linearizer is given a pointer to
+// the `funcs` field and every callback recovers the enclosing structure
+// from that pointer with gu_container().
+typedef struct {
+ // callback table passed to the linearizer
+ PgfLinFuncs* funcs;
+ // fids of the phrases that are currently open; begin_phrase pushes,
+ // end_phrase pops, and the top entry is the node producing the next token
+ GuBuf* parent_stack;
+ // fids of the nodes that contributed to the word being collected
+ GuBuf* parent_current;
+ // output buffer of PgfAlignmentPhrase* values
+ GuBuf* phrases;
+ // the phrase pushed most recently, NULL until the first one is pushed
+ PgfAlignmentPhrase* last_phrase;
+ // text of the word being collected
+ GuStringBuf* sbuf;
+ // number of fids pushed to parent_current that also occur in last_phrase;
+ // reset to zero by every flush
+ size_t n_matches;
+ // exception state; the token callback does nothing once it is raised
+ GuExn* err;
+ // when set, the next token is glued to the current word
+ bool bind;
+ // when set, the first character of the next token is upper-cased
+ bool capit;
+ // pool for the data that is returned to the caller
+ GuPool* out_pool;
+ // scratch pool, released by pgf_align_words before it returns
+ GuPool* tmp_pool;
+} PgfAlignerLin;
+
+/*
+ * Emit the word collected in alin->sbuf as an alignment phrase.
+ *
+ * If every node that contributed to the word also contributed to the
+ * previously emitted phrase, and that phrase has no other nodes, the word
+ * is appended to the previous phrase (separated by a space). Otherwise a
+ * new PgfAlignmentPhrase is allocated in out_pool, pushed to alin->phrases
+ * and remembered as last_phrase.
+ *
+ * When no token has been collected at all the function emits nothing.
+ * alin->n_matches is always reset to zero.
+ */
+static void
+pgf_aligner_flush_phrase(PgfAlignerLin* alin)
+{
+    size_t n_fids = gu_buf_length(alin->parent_current);
+
+    if (n_fids == 0) {
+        // Nothing was linearized since the start, so there is no word to
+        // emit. Without this guard the test below would dereference
+        // last_phrase, which is still NULL at this point.
+        alin->n_matches = 0;
+        return;
+    }
+
+    if (alin->last_phrase != NULL &&
+        alin->n_matches == n_fids &&
+        alin->n_matches == alin->last_phrase->n_fids) {
+        // if the current compound word has the same parents
+        // as the last one then we just combine them with a space
+
+        alin->last_phrase->phrase =
+            gu_format_string(alin->out_pool, "%s %s",
+                             alin->last_phrase->phrase,
+                             gu_string_buf_freeze(alin->sbuf, alin->tmp_pool));
+    } else {
+        // push the current word to the buffer of words
+
+        PgfAlignmentPhrase* phrase =
+            gu_new_flex(alin->out_pool, PgfAlignmentPhrase, fids, n_fids);
+        phrase->phrase = gu_string_buf_freeze(alin->sbuf, alin->out_pool);
+        phrase->n_fids = n_fids;
+        for (size_t i = 0; i < n_fids; i++) {
+            phrase->fids[i] = gu_buf_get(alin->parent_current, int, i);
+        }
+        gu_buf_push(alin->phrases, PgfAlignmentPhrase*, phrase);
+
+        alin->last_phrase = phrase;
+    }
+
+    alin->n_matches = 0;
+}
+
+/*
+ * Record `fid` as a contributor to the word being collected and, if the
+ * same node also contributed to the previously emitted phrase, count it
+ * in alin->n_matches.
+ */
+static void
+pgf_aligner_push_parent(PgfAlignerLin* alin, int fid)
+{
+    gu_buf_push(alin->parent_current, int, fid);
+
+    PgfAlignmentPhrase* prev = alin->last_phrase;
+    if (prev == NULL) {
+        return;
+    }
+
+    // linear search for fid among the parents of the previous phrase
+    size_t pos = 0;
+    while (pos < prev->n_fids && prev->fids[pos] != fid) {
+        pos++;
+    }
+
+    if (pos < prev->n_fids) {
+        alin->n_matches++;
+    }
+}
+
+/*
+ * Linearizer callback for a single token.
+ *
+ * The token is attributed to the node on top of parent_stack. With a
+ * pending bind the token is glued to the word being collected; otherwise
+ * the previous word is flushed and a new one is started. Does nothing
+ * once alin->err has been raised.
+ */
+static void
+pgf_aligner_lzn_symbol_token(PgfLinFuncs** funcs, PgfToken tok)
+{
+    PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+    if (!gu_ok(alin->err)) {
+        return;
+    }
+
+    // the innermost open phrase is the tree node that generates this token
+    size_t depth = gu_buf_length(alin->parent_stack);
+    int fid = gu_buf_get(alin->parent_stack, int, depth-1);
+
+    if (!alin->bind) {
+        // no pending bind: close the previous word and start a new one
+        pgf_aligner_flush_phrase(alin);
+        gu_string_buf_flush(alin->sbuf);
+        gu_buf_flush(alin->parent_current);
+
+        pgf_aligner_push_parent(alin, fid);
+    } else {
+        // pending bind: the token continues the current compound word
+        alin->bind = false;
+
+        // register the node only if it is not yet among the parents
+        size_t n_seen = gu_buf_length(alin->parent_current);
+        bool is_new = true;
+        for (size_t i = 0; is_new && i < n_seen; i++) {
+            int seen_fid = gu_buf_get(alin->parent_current, int, i);
+            is_new = (seen_fid != fid);
+        }
+
+        if (is_new) {
+            pgf_aligner_push_parent(alin, fid);
+        }
+    }
+
+    GuOut* out = gu_string_buf_out(alin->sbuf);
+
+    if (alin->capit) {
+        // write the first code point upper-cased; gu_utf8_decode is given
+        // &tok, presumably advancing it so only the rest is written below
+        GuUCS first = gu_utf8_decode((const uint8_t**) &tok);
+        first = gu_ucs_to_upper(first);
+        gu_out_utf8(first, out, alin->err);
+        alin->capit = false;
+    }
+
+    gu_string_write(tok, out, alin->err);
+}
+
+// Linearizer callback for the start of a phrase: pushes `fid` on
+// parent_stack so that tokens seen from now on are attributed to it.
+// The cat, lindex and fun arguments are not used.
+static void
+pgf_aligner_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, int lindex, PgfCId fun)
+{
+ PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+ gu_buf_push(alin->parent_stack, int, fid);
+}
+
+// Linearizer callback for the end of a phrase: pops the entry that the
+// matching begin_phrase callback pushed on parent_stack.
+// None of the cat, fid, lindex and fun arguments is used.
+static void
+pgf_aligner_lzn_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, int lindex, PgfCId fun)
+{
+ PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+ gu_buf_pop(alin->parent_stack, int);
+}
+
+/*
+ * Linearizer callback for a non-existent form: raises PgfLinNonExist on
+ * alin->err. Once the exception is raised the token callback ignores
+ * further tokens and pgf_align_words returns NULL.
+ *
+ * The exception must be PgfLinNonExist because that is the type the
+ * Haskell binding tests for when it maps this case to an empty result.
+ */
+static void
+pgf_aligner_lzn_symbol_ne(PgfLinFuncs** funcs)
+{
+    PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+    gu_raise(alin->err, PgfLinNonExist);
+}
+
+// Linearizer callback for a bind symbol: sets the bind flag so that the
+// token callback glues the next token to the word being collected
+// instead of starting a new word.
+static void
+pgf_aligner_lzn_symbol_bind(PgfLinFuncs** funcs)
+{
+ PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+ alin->bind = true;
+}
+
+// Linearizer callback for a capitalization symbol: sets the capit flag so
+// that the token callback upper-cases the first character of the next token.
+static void
+pgf_aligner_lzn_symbol_capit(PgfLinFuncs** funcs)
+{
+ PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+ alin->capit = true;
+}
+
+// Callback table that pgf_align_words hands to the linearizer.
+// NOTE(review): the name says "file" although every entry is an aligner
+// callback; consider renaming it together with its use in pgf_align_words.
+static PgfLinFuncs pgf_file_lin_funcs = {
+ .symbol_token = pgf_aligner_lzn_symbol_token,
+ .begin_phrase = pgf_aligner_lzn_begin_phrase,
+ .end_phrase = pgf_aligner_lzn_end_phrase,
+ .symbol_ne = pgf_aligner_lzn_symbol_ne,
+ .symbol_bind = pgf_aligner_lzn_symbol_bind,
+ .symbol_capit = pgf_aligner_lzn_symbol_capit
+};
+
+/*
+ * Linearize `expr` in `concr` and return the resulting words together
+ * with the tree nodes that produced them.
+ *
+ * Only the first concrete tree delivered by pgf_lzr_concretize is used.
+ * The result is a sequence of PgfAlignmentPhrase* allocated in `pool`.
+ * NULL is returned when `err` is raised during concretization or
+ * linearization. All scratch data lives in a local pool that is freed
+ * before returning.
+ */
+GuSeq*
+pgf_align_words(PgfConcr* concr, PgfExpr expr,
+                GuExn* err, GuPool* pool)
+{
+    GuPool* tmp_pool = gu_local_pool();
+    GuSeq* result = NULL;
+
+    GuEnum* cts =
+        pgf_lzr_concretize(concr, expr, err, tmp_pool);
+    if (gu_ok(err)) {
+        GuBuf* phrases = gu_new_buf(PgfAlignmentPhrase*, pool);
+        bool failed = false;
+
+        PgfCncTree ctree = gu_next(cts, PgfCncTree, tmp_pool);
+        if (!gu_variant_is_null(ctree)) {
+            ctree = pgf_lzr_wrap_linref(ctree, tmp_pool);
+
+            PgfAlignerLin alin = {
+                .funcs = &pgf_file_lin_funcs,
+                .parent_stack = gu_new_buf(int, tmp_pool),
+                .parent_current = gu_new_buf(int, tmp_pool),
+                .phrases = phrases,
+                .last_phrase = NULL,
+                .sbuf = gu_string_buf(tmp_pool),
+                .n_matches = 0,
+                .err = err,
+                // the first token continues the (still empty) first word
+                .bind = true,
+                .capit = false,
+                .out_pool = pool,
+                .tmp_pool = tmp_pool
+            };
+            // sentinel parent for tokens that occur outside of any phrase
+            gu_buf_push(alin.parent_stack, int, -1);
+
+            pgf_lzr_linearize(concr, ctree, 0, &alin.funcs, tmp_pool);
+            if (gu_ok(err)) {
+                // emit the word that is still pending after the last token
+                pgf_aligner_flush_phrase(&alin);
+            } else {
+                failed = true;
+            }
+        }
+
+        if (!failed) {
+            result = gu_buf_data_seq(phrases);
+        }
+    }
+
+    gu_pool_free(tmp_pool);
+    return result;
+}
diff --git a/src/runtime/c/pgf/linearizer.c b/src/runtime/c/pgf/linearizer.c
index bbec2f3c2..409d60a2c 100644
--- a/src/runtime/c/pgf/linearizer.c
+++ b/src/runtime/c/pgf/linearizer.c
@@ -1160,8 +1160,10 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuOut* out, GuExn* err)
GuEnum* cts =
pgf_lzr_concretize(concr, expr, err, tmp_pool);
- if (!gu_ok(err))
+ if (!gu_ok(err)) {
+ gu_pool_free(tmp_pool);
return;
+ }
PgfCncTree ctree = gu_next(cts, PgfCncTree, tmp_pool);
if (!gu_variant_is_null(ctree)) {
diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h
index e542e4213..e2fc6f74d 100644
--- a/src/runtime/c/pgf/pgf.h
+++ b/src/runtime/c/pgf/pgf.h
@@ -75,6 +75,16 @@ pgf_has_linearization(PgfConcr* concr, PgfCId id);
void
pgf_linearize(PgfConcr* concr, PgfExpr expr, GuOut* out, GuExn* err);
+// One entry of the result of pgf_align_words: a piece of the
+// linearization together with the ids of the tree nodes that produced it.
+typedef struct {
+ // the text of the phrase
+ GuString phrase;
+ // number of entries in fids
+ size_t n_fids;
+ // ids of the contributing tree nodes (flexible array member)
+ int fids[];
+} PgfAlignmentPhrase;
+
+// Linearizes expr in concr and returns a sequence of PgfAlignmentPhrase*
+// allocated in pool. Returns NULL if err is raised while the expression
+// is being concretized or linearized.
+GuSeq*
+pgf_align_words(PgfConcr* concr, PgfExpr expr,
+ GuExn* err, GuPool* pool);
+
bool
pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat,
double *precision, double *recall, double *exact);
diff --git a/src/runtime/haskell-bind/PGF2.hsc b/src/runtime/haskell-bind/PGF2.hsc
index 44f9d2b1c..02f74dd7a 100644
--- a/src/runtime/haskell-bind/PGF2.hsc
+++ b/src/runtime/haskell-bind/PGF2.hsc
@@ -15,7 +15,7 @@
module PGF2 (-- * PGF
PGF,readPGF,abstractName,startCat,
-- * Concrete syntax
- Concr,languages,parse,parseWithHeuristics,linearize,
+ Concr,languages,parse,parseWithHeuristics,linearize,alignWords,
-- * Trees
Expr,readExpr,showExpr,mkApp,unApp,mkStr,
-- * Morphology
@@ -362,6 +362,33 @@ linearize lang e = unsafePerformIO $
else do lin <- gu_string_buf_freeze sb pl
peekCString lin
+-- | Linearizes the expression and returns the phrases of the
+-- linearization, each paired with the ids of the tree nodes that
+-- produced it. The result is empty when the linearization does not
+-- exist; any other failure is thrown as a 'PGFError'.
+alignWords :: Concr -> Expr -> [(String, [Int])]
+alignWords lang e = unsafePerformIO $
+  withGuPool $ \pl ->
+    do exn <- gu_new_exn pl
+       seq <- pgf_align_words (concr lang) (expr e) exn pl
+       failed <- gu_exn_is_raised exn
+       if failed
+         then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist
+                 if is_nonexist
+                   then return []
+                   else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
+                           if is_exn
+                             then do c_msg <- (#peek GuExn, data.data) exn
+                                     msg <- peekCString c_msg
+                                     throwIO (PGFError msg)
+                             else throwIO (PGFError "The abstract tree cannot be linearized")
+         else do len <- (#peek GuSeq, len) seq
+                 arr <- peekArray (fromIntegral (len :: CInt)) (seq `plusPtr` (#offset GuSeq, data))
+                 mapM peekAlignmentPhrase arr
+  where
+    -- Everything is copied into Haskell values here, before withGuPool
+    -- releases the pool that owns the C data.
+    peekAlignmentPhrase :: Ptr () -> IO (String, [Int])
+    peekAlignmentPhrase ptr = do
+      c_phrase <- (#peek PgfAlignmentPhrase, phrase) ptr
+      phrase <- peekCString c_phrase
+      -- NOTE(review): n_fids is declared size_t in pgf.h but is read as
+      -- CInt here, which only yields the right value on little-endian
+      -- hosts; consider CSizeT once its import is confirmed.
+      n_fids <- (#peek PgfAlignmentPhrase, n_fids) ptr
+      -- The C array holds `int` elements, so it must be read as CInt.
+      -- Reading it directly as Haskell Int would use the word size as the
+      -- stride and run past the end of the array on 64-bit platforms.
+      fids <- (peekArray (fromIntegral (n_fids :: CInt)) (ptr `plusPtr` (#offset PgfAlignmentPhrase, fids)) :: IO [CInt])
+      return (phrase, map fromIntegral fids)
-----------------------------------------------------------------------------
-- Helper functions
diff --git a/src/runtime/haskell-bind/PGF2/FFI.hs b/src/runtime/haskell-bind/PGF2/FFI.hs
index b96c93e17..f36fa1368 100644
--- a/src/runtime/haskell-bind/PGF2/FFI.hs
+++ b/src/runtime/haskell-bind/PGF2/FFI.hs
@@ -21,6 +21,7 @@ data GuString
data GuStringBuf
data GuMapItor
data GuOut
+data GuSeq
data GuPool
foreign import ccall fopen :: CString -> CString -> IO (Ptr ())
@@ -135,6 +136,9 @@ foreign import ccall "pgf/pgf.h pgf_print_name"
foreign import ccall "pgf/pgf.h pgf_linearize"
pgf_linearize :: Ptr PgfConcr -> PgfExpr -> Ptr GuOut -> Ptr GuExn -> IO ()
+-- Word alignment: returns a GuSeq of pointers to PgfAlignmentPhrase
+-- structures allocated in the given pool (see pgf/pgf.h).
+foreign import ccall "pgf/pgf.h pgf_align_words"
+ pgf_align_words :: Ptr PgfConcr -> PgfExpr -> Ptr GuExn -> Ptr GuPool -> IO (Ptr GuSeq)
+
foreign import ccall "pgf/pgf.h pgf_parse_with_heuristics"
pgf_parse_with_heuristics :: Ptr PgfConcr -> CString -> CString -> Double -> Ptr PgfCallbacksMap -> Ptr GuExn -> Ptr GuPool -> Ptr GuPool -> IO (Ptr GuEnum)