summaryrefslogtreecommitdiff
path: root/contrib/c-bindings/gf_lexing.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/c-bindings/gf_lexing.c')
-rw-r--r--contrib/c-bindings/gf_lexing.c286
1 files changed, 286 insertions, 0 deletions
diff --git a/contrib/c-bindings/gf_lexing.c b/contrib/c-bindings/gf_lexing.c
new file mode 100644
index 000000000..21353ec8b
--- /dev/null
+++ b/contrib/c-bindings/gf_lexing.c
@@ -0,0 +1,286 @@
+/* GF C Bindings
+ Copyright (C) 2010 Kevin Kofler
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "gf_lexing.h"
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+typedef char **(*GF_Lexer)(const char *str);
+typedef char *(*GF_Unlexer)(char **arr);
+
+static inline void freev(char **p)
+{
+ char **q = p;
+ while (*q)
+ free(*(q++));
+ free(p);
+}
+
+static char **words(const char *str)
+{
+ unsigned char *buf = (unsigned char *) strdup(str);
+ unsigned char *p = buf, *q;
+ char **result, **r;
+ size_t count = 0u;
+ while (isspace(*p)) p++;
+ q = p;
+ if (*p) count++;
+ while (*p) {
+ if (isspace(*p)) {
+ *(p++) = 0;
+ while (isspace(*p)) *(p++) = 0;
+ if (*p) count++;
+ } else p++;
+ }
+ r = result = malloc((count+1)*sizeof(char *));
+ if (count) while (1) {
+ *(r++) = strdup((char *) q);
+ if (!--count) break;
+ while (*q) q++;
+ while (!*q) q++;
+ }
+ *r = NULL;
+ return result;
+}
+
+static char *unwords(char **arr)
+{
+ size_t len = 0u;
+ char **p = arr, *result, *r;
+ while (*p)
+ len += strlen(*(p++)) + 1u;
+ if (!len) return calloc(1, 1);
+ r = result = malloc(len);
+ p = arr;
+ while (1) {
+ size_t l = strlen(*p);
+ strcpy(r, *(p++));
+ if (!*p) break;
+ r += l;
+ *(r++) = ' ';
+ }
+ return result;
+}
+
+static char **lines(const char *str)
+{
+ unsigned char *buf = (unsigned char *) strdup(str);
+ unsigned char *p = buf, *q;
+ char **result, **r;
+ size_t count = 0u;
+ while (*p == '\n') p++;
+ q = p;
+ if (*p) count++;
+ while (*p) {
+ if (*p == '\n') {
+ *(p++) = 0;
+ while (*p == '\n') *(p++) = 0;
+ if (*p) count++;
+ } else p++;
+ }
+ r = result = malloc((count+1)*sizeof(char *));
+ if (count) while (1) {
+ *(r++) = strdup((char *) q);
+ if (!--count) break;
+ while (*q) q++;
+ while (!*q) q++;
+ }
+ *r = NULL;
+ return result;
+}
+
+static char *unlines(char **arr)
+{
+ size_t len = 0u;
+ char **p = arr, *result, *r;
+ while (*p)
+ len += strlen(*(p++)) + 1u;
+ if (!len) return calloc(1, 1);
+ r = result = malloc(len);
+ p = arr;
+ while (1) {
+ size_t l = strlen(*p);
+ strcpy(r, *(p++));
+ if (!*p) break;
+ r += l;
+ *(r++) = '\n';
+ }
+ return result;
+}
+
+static char *appLexer(GF_Lexer f, const char *str)
+{
+ char **arr = f(str), **p = arr, *result;
+ int ofs = 0;
+ while (*p && **p) p++;
+ while (*p) {
+ if (**p) p[-ofs] = *p; else ofs++;
+ p++;
+ }
+ p[-ofs] = NULL;
+ result = unwords(arr);
+ freev(arr);
+ return result;
+}
+
+static char *appUnlexer(GF_Unlexer f, const char *str)
+{
+ char **arr = lines(str), **p = arr, *result;
+ while (*p) {
+ char **warr = words(*p);
+ free(*p);
+ *(p++) = f(warr);
+ freev(warr);
+ }
+ result = unlines(arr);
+ freev(arr);
+ return result;
+}
+
+static inline int isPunct(char c)
+{
+ return c && strchr(".?!,:;", c);
+}
+
+static inline int isMajorPunct(char c)
+{
+ return c && strchr(".?!", c);
+}
+
+static inline int isMinorPunct(char c)
+{
+ return c && strchr(",:;", c);
+}
+
+static char *charToStr(char c)
+{
+ char *result = malloc(2), *p = result;
+ *(p++) = c;
+ *p = 0;
+ return result;
+}
+
+static char **lexChars(const char *str)
+{
+ char **result = malloc((strlen(str)+1)*sizeof(char *)), **r = result;
+ const char *p = str;
+ while (*p) {
+ if (!isspace(*p)) *(r++) = charToStr(*p);
+ p++;
+ }
+ *r = NULL;
+ return result;
+}
+
+static char **lexText(const char *str)
+{
+ char **result = malloc((strlen(str)+1)*sizeof(char *)), **r = result;
+ const char *p = str;
+ int uncap = 1;
+ while (*p) {
+ if (isMajorPunct(*p)) {
+ *(r++) = charToStr(*(p++));
+ uncap = 1;
+ } else if (isMinorPunct(*p)) {
+ *(r++) = charToStr(*(p++));
+ uncap = 0;
+ } else if (isspace(*p)) {
+ p++;
+ uncap = 0;
+ } else {
+ const char *q = p;
+ char *word;
+ size_t l;
+ while (*p && !isspace(*p) && !isPunct(*p)) p++;
+ l = p - q;
+ word = malloc(l + 1);
+ strncpy(word, q, l);
+ word[l] = 0;
+ if (uncap) *word = tolower(*word);
+ *(r++) = word;
+ uncap = 0;
+ }
+ }
+ *r = NULL;
+ return result;
+}
+
+static char *unlexText(char **arr)
+{
+ size_t len = 0u;
+ char **p = arr, *result, *r;
+ int cap = 1;
+ while (*p)
+ len += strlen(*(p++)) + 1u;
+ if (!len) return calloc(1, 1);
+ r = result = malloc(len);
+ p = arr;
+ while (1) {
+ size_t l = strlen(*p);
+ char *word = *(p++);
+ if (*word == '"' && word[l-1] == '"') word++, l--;
+ strncpy(r, word, l);
+ if (cap) *r = toupper(*r);
+ if (!*p) break;
+ r += l;
+ if (isPunct(**p) && !(*p)[1]) {
+ *(r++) = **p;
+ if (!p[1]) break;
+ cap = isMajorPunct(**(p++));
+ } else cap = 0;
+ *(r++) = ' ';
+ }
+ return result;
+
+}
+
+static char *stringop_chars(const char *str)
+{
+ return appLexer(lexChars, str);
+}
+
+static char *stringop_lextext(const char *str)
+{
+ return appLexer(lexText, str);
+}
+
+static char *stringop_words(const char *str)
+{
+ return appLexer(words, str);
+}
+
+static char *stringop_unlextext(const char *str)
+{
+ return appUnlexer(unlexText, str);
+}
+
+static char *stringop_unwords(const char *str)
+{
+ return appUnlexer(unwords, str);
+}
+
+GF_StringOp gf_stringOp(const char *op)
+{
+ if (!strcmp(op, "chars")) return stringop_chars;
+ if (!strcmp(op, "lextext")) return stringop_lextext;
+ if (!strcmp(op, "words")) return stringop_words;
+ if (!strcmp(op, "unlextext")) return stringop_unlextext;
+ if (!strcmp(op, "unwords")) return stringop_unwords;
+ return NULL;
+}