summary | refs | log | tree | commit | diff
path: root/src/runtime/python/examples
diff options
context:
space:
mode:
author prasanth.kolachina <prasanth.kolachina@cse.gu.se> 2016-09-19 08:32:08 +0000
committer prasanth.kolachina <prasanth.kolachina@cse.gu.se> 2016-09-19 08:32:08 +0000
commit ef33f1ab35443af8b182afc3524e76c8a50136a8 (patch)
tree 2dde0302d6f42b29137e5c75e0f20bdd8f99b716 /src/runtime/python/examples
parent 8729339d2603a7e18ef05a31bfd47f299084cb8f (diff)
python examples compatible with both Python 2 and 3
Diffstat (limited to 'src/runtime/python/examples')
-rw-r--r-- src/runtime/python/examples/gf_utils.py | 576
-rw-r--r-- src/runtime/python/examples/translation_pipeline.py | 718
2 files changed, 701 insertions, 593 deletions
diff --git a/src/runtime/python/examples/gf_utils.py b/src/runtime/python/examples/gf_utils.py
index bb637cf04..2be326e0e 100644
--- a/src/runtime/python/examples/gf_utils.py
+++ b/src/runtime/python/examples/gf_utils.py
@@ -1,283 +1,369 @@
#!/usr/bin/env python
+# Python 2 and 3 compatible
+from __future__ import print_function
+
"""
"""
-import argparse, re, string, sys, time;
-from itertools import imap, count;
+import argparse, codecs, re, string, sys, time;
+try:
+ from itertools import imap as map;
+ from itertools import count;
+except ImportError:
+ from itertools import count;
+ pass;
from operator import itemgetter;
import pgf;
-def lexerI(sentence):
- return sentence.rstrip(string.whitespace+string.punctuation);
+class Lexer(object):
+ def __init__(self, lang='None', grammar=None, gflang=None):
+ import translation_pipeline;
+ lexers = {'None': self.lexerI, \
+ 'Eng': self.lexerI, \
+ 'Chi': self.lexerChi, \
+ 'Translator': translation_pipeline.pipeline_lexer, \
+ 'Web': self.lexerWeb
+ };
+ if grammar:
+ self._pgf = grammar;
+ self._lang = gflang;
-def lexerChi(sentence):
- sentence = sentence.decode('utf-8');
+ self.tokenize = lexers[lang];
+ return;
+
+ def lexerI(self, sentence):
+ #return sentence.decode('utf-8').rstrip(string.whitespace+string.punctuation).encode('utf-8');
+ return sentence.rstrip(string.whitespace+string.punctuation);
+
+ def lexerChi(self, sentence):
+ #sentence = sentence.decode('utf-8');
tokens, idx, n = [], 0, len(sentence);
prev = True;
while idx < n:
- if sentence[idx] in string.whitespace:
- prev = True;
- idx += 1;
- continue;
- if 0 < ord(sentence[idx]) < 128:
- if sentence[idx] in string.punctuation:
- prev = True;
- if prev:
- tokens.append( sentence[idx] );
- prev = False;
- else:
- tokens[-1] = tokens[-1]+sentence[idx];
- else:
- prev = True;
- tokens.append( sentence[idx] );
- idx += 1;
- return ' '.join(tokens).encode('utf-8');
+ if sentence[idx] in string.whitespace:
+ prev = True;
+ idx += 1;
+ continue;
+ if 0 < ord(sentence[idx]) < 128:
+ if sentence[idx] in string.punctuation:
+ prev = True;
+ if prev:
+ tokens.append( sentence[idx] );
+ prev = False;
+ else:
+ tokens[-1] = tokens[-1]+sentence[idx];
+ else:
+ prev = True;
+ tokens.append( sentence[idx] );
+ idx += 1;
+ return ' '.join(tokens);#.encode('utf-8');
-def lexer(lang='translator'):
- if lang[-3:] == 'Eng':
- return lexerI;
- elif lang[-3:] == 'Chi':
- return lexerChi;
- elif lang == 'translator':
- import translation_pipeline;
- return translation_pipeline.pipeline_lexer;
- else:
- return lexerI;
+    def lexerWeb(self, sentence):
+        """Normalise web-style text against the grammar's lexicon.
+
+        Two passes are made over the whitespace-separated tokens.
+        First, a token starting with an uppercase letter is lowercased
+        when lookupMorpho returns at least one analysis for the
+        lowercase form. Second, a hyphenated token with no analysis is
+        replaced by its hyphen-less form if that form has an analysis,
+        and otherwise by the form with each hyphen turned into a space.
+
+        Reads self._pgf and self._lang, so the Lexer must have been
+        created with a grammar and a concrete language name.
+        Returns the tokens joined by single spaces.
+        """
+        concrete = self._pgf.languages[self._lang];
+        def known(word):
+            # True as soon as the lexicon yields one analysis.
+            for _ in concrete.lookupMorpho(word):
+                return True;
+            return False;
+
+        # str.split() never produces empty tokens, so repeated whitespace
+        # or blank input can no longer fail on token[0].
+        tokensList = sentence.split();
+        for idx, token in enumerate(tokensList):
+            if token[0].isupper() and known(token.lower()):
+                tokensList[idx] = token.lower();
+        for idx, token in enumerate(tokensList):
+            if '-' not in token or known(token):
+                continue;
+            joined = token.replace('-', '');
+            # The spaced fallback used to be computed and then dropped;
+            # it is now stored back into the token list.
+            tokensList[idx] = joined if known(joined) else token.replace('-', ' ');
+        return ' '.join(tokensList);
def postprocessor(sentence):
- if sentence == None:
- return '';
- if sentence.startswith('* ') or sentence.startswith('% '):
- sentence = sentence[2:];
- sentence = sentence.replace(' &+ ', '');
- sentence = sentence.replace('<+>', ' ');
- return sentence;
+ if sentence == None:
+ return '';
+ if sentence.startswith('* ') or sentence.startswith('% '):
+ sentence = sentence[2:];
+ sentence = sentence.replace(' &+ ', '');
+ sentence = sentence.replace('<+>', ' ');
+ return sentence;
def readJohnsonRerankerTrees(inputStream):
- endOfParse = False;
- while True:
- sentheader = inputStream.next();
- if sentheader == '':
- break;
- parsescount, sentidx = map(int, sentheader.strip().split());
- parsesBlock = [];
- for i in xrange(parsescount):
- parseprob = inputStream.next();
- if parseprob.strip() == '':
- endOfParse = True;
- break;
- parse = inputStream.next();
- parsesBlock.append( (float(parseprob.strip()), pgf.readExpr(parse.strip())) );
- yield sentidx, parsesBlock;
- if not endOfParse:
- _ = inputStream.next();
- endOfParse = False;
+    """Yield (sentidx, parsesBlock) pairs read from an iterator of lines.
+
+    Each record starts with a header line holding two integers, the
+    number of parses and the sentence id, followed by alternating
+    probability and tree lines. parsesBlock is a list of
+    (float probability, pgf.readExpr(tree)) tuples. After a complete
+    record one further line is consumed; a blank probability line ends
+    the record early. Reading stops at end of input or at an empty
+    header string.
+    """
+    while True:
+        # next(it, '') works on Python 2 and 3 and maps exhaustion onto
+        # the '' sentinel already tested below, so StopIteration never
+        # escapes inside the generator.
+        sentheader = next(inputStream, '');
+        if sentheader == '':
+            break;
+        parsescount, sentidx = [int(field) for field in sentheader.split()];
+        parsesBlock = [];
+        endOfParse = False;
+        for _ in range(parsescount):
+            parseprob = next(inputStream, '');
+            if parseprob.strip() == '':
+                # Record ended early: the blank line was the separator.
+                endOfParse = True;
+                break;
+            parse = next(inputStream, '');
+            parsesBlock.append((float(parseprob.strip()), pgf.readExpr(parse.strip())));
+        yield sentidx, parsesBlock;
+        if not endOfParse:
+            # Skip the line that follows a complete record.
+            next(inputStream, '');
def readMosesNbestFormat(inputStream):
- transBlock = [];
- currentHypothesisId = 0;
- while True:
- line = inputStream.next();
- if line == '':
- break;
- fields = line.strip().split('|||');
- if str(fields[0].strip()) != str(currentHypothesisId):
- yield currentHypothesisId, transBlock;
- transBlock = [];
- currentHypothesisId = int(fields[0]);
- transBlock.append( (map(float, tuple([val.strip() for val in fields[3].split()])), fields[1].strip()) );
+    """Yield (hypothesis id, transBlock) groups from '|||'-separated lines.
+
+    Field 0 of each line is the hypothesis id, field 1 the hypothesis
+    string and field 3 the whitespace-separated scores. transBlock is a
+    list of (tuple of float scores, hypothesis string). A group is
+    emitted whenever the id changes, and once more at end of input so
+    the last group is not lost. The running id starts at 0.
+    """
+    transBlock = [];
+    currentHypothesisId = 0;
+    while True:
+        # next(it, '') works on Python 2 and 3 and turns exhaustion into
+        # the '' sentinel tested below.
+        line = next(inputStream, '');
+        if line == '':
+            break;
+        fields = line.strip().split('|||');
+        if str(fields[0].strip()) != str(currentHypothesisId):
+            yield currentHypothesisId, transBlock;
+            transBlock = [];
+            currentHypothesisId = int(fields[0]);
+        # A real tuple, not a lazy map object, so scores can be read
+        # more than once.
+        scores = tuple(float(val) for val in fields[3].split());
+        transBlock.append((scores, fields[1].strip()));
+    if transBlock:
+        # Flush the final group, which has no following id change.
+        yield currentHypothesisId, transBlock;
-def printJohnsonRerankerFormat(gfparsesList, sentid=count(1)):
- johnsonRepr = [];
- parseHash = {};
- for parse in sorted(gfparsesList, key=itemgetter(0)):
- if not parseHash.has_key(parse[1]):
- johnsonRepr.append( str(-1*parse[0]) );
- johnsonRepr.append( str(parse[1]) );
- parseHash.setdefault(parse[1], []).append(parse[0]);
- curid = sentid.next();
- if len(gfparsesList):
- johnsonRepr.insert(0, '%d %d' %(len(parseHash.values()), curid));
- duplicateInstances = len(filter(lambda X: len(parseHash[X]) > 1, parseHash.keys()));
- #if duplicateInstances: print >>sys.stderr, "%d duplicate parses found in K-best parsing" %(duplicateInstances);
- return '\n'.join(johnsonRepr)+'\n';
+def printJohnsonRerankerFormat(gfparsesList, sentids=count(1)):
+ johnsonRepr = [];
+ parseHash = {};
+ for parse in sorted(gfparsesList, key=itemgetter(0)):
+ if parse[1] not in parseHash:
+ johnsonRepr.append( str(-1*parse[0]) );
+ johnsonRepr.append( str(parse[1]) );
+ parseHash.setdefault(parse[1], []).append(parse[0]);
+ curid = next(sentids);
+ if len(gfparsesList):
+ johnsonRepr.insert(0, '%d %d' %(len(parseHash.values()), curid));
+ duplicateInstances = len(list(filter(lambda X: len(parseHash[X]) > 1, \
+ parseHash.keys())));
+ return '\n'.join(johnsonRepr)+'\n';
-def printMosesNbestFormat(hypothesisList, sentid=count(1)):
- mosesRepr = [];
- sid = sentid.next();
- for hypScores, hypStr in hypothesisList:
- if not hasattr(hypScores, '__iter__'):
- hypScores = (hypScores, );
- mosesRepr.append("%d ||| %s ||| NULL ||| %s" %(sid, hypStr, ' '.join(['%.6f'%score for score in hypScores])));
- return '\n'.join(mosesRepr);
+def printMosesNbestFormat(hypothesisList, sentids=count(1)):
+ mosesRepr = [];
+ sid = next(sentids);
+ for hypScores, hypStr in hypothesisList:
+ if not hasattr(hypScores, '__iter__'):
+ hypScores = (hypScores, );
+ mosesRepr.append("%d ||| %s ||| NULL ||| %s" \
+ %(sid, hypStr, ' '.join('%.6f'%score for score in hypScores)) );
+ return '\n'.join(mosesRepr);
def getKLinearizations(grammar, tgtlanguage, abstractParsesList, K=10):
- generator = grammar.languages[tgtlanguage].linearizeAll;
- for parsesBlock in abstractParsesList:
- kBestTrans = [];
- for parseprob, parse in parsesBlock:
- for linstring in generator(parse, n=K):
- kBestTrans.append( ((parseprob,), postprocessor(linstring)) );
- yield kBestTrans;
+ generator = grammar.languages[tgtlanguage].linearizeAll;
+ for parsesBlock in abstractParsesList:
+ kBestTrans = [];
+ for parseprob, parse in parsesBlock:
+ for linstring in generator(parse, n=K):
+ kBestTrans.append( ((parseprob,), postprocessor(linstring)) );
+ yield kBestTrans;
-def getKBestParses(grammar, language, K, serializable=False, sentid=count(1), max_length=50):
- parser = grammar.languages[language].parse;
- import translation_pipeline
- def worker(sentence):
- sentence = sentence.strip();
- curid = sentid.next();
- tstart = time.time();
- kBestParses = [];
- parseScores = {};
- if len(sentence.split()) > max_length:
- tend, err = time.time(), "Sentence too long (%d tokens). Might potentially run out of memory" %(len(sentence.split()));
- print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err);
- return tend-tstart, kBestParses; # temporary hack to make sure parser does not get killed for very long sentences;
- try:
- callbacks = [('PN', translation_pipeline.parseNames(grammar, args.srclang, sentence)), ('Symb', translation_pipeline.parseUnknown(grammar, args.srclang, sentence))]
- for parseidx, parse in enumerate( parser(sentence, heuristics=0, callbacks=callbacks) ):
- parseScores[parse[0]] = True;
- kBestParses.append( (parse[0], str(parse[1]) if serializable else parse[1]) );
- if parseidx == K-1: break;
- #if len(parseScores) >= K: break;
- tend = time.time();
- print >>sys.stderr, '%d\t%.4f' %(curid, tend-tstart);
- return tend-tstart, kBestParses;
- except pgf.ParseError, err:
- tend = time.time();
- print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err);
- return tend-tstart, kBestParses;
- except UnicodeEncodeError, err:
- tend = time.time();
- print >>sys.stderr, '%d\t%.4f\t%s' %(curid, tend-tstart, err);
- return tend-tstart, kBestParses;
- return worker;
+def getKBestParses(grammar, language, K, callbacks=None, \
+        serializable=False, sentids=count(1), max_length=50):
+    """Build a worker that parses one sentence and keeps at most K parses.
+
+    The returned callable takes a sentence string and returns
+    (elapsed seconds, kBestParses). kBestParses is a list of
+    (parse[0], parse[1]) pairs taken from the concrete syntax's parse
+    method; parse[1] is passed through str() when serializable is true.
+    One tab-separated timing line per sentence is written to stderr,
+    with the error text appended when parsing fails. Sentences with
+    more than max_length whitespace tokens are not parsed and give an
+    empty list.
+
+    The callbacks argument is accepted but not used: the 'PN' and
+    'Symb' callbacks are rebuilt for every sentence, because with the
+    modified callback API they do not work otherwise.
+    """
+    import translation_pipeline;
+    parser = grammar.languages[language].parse;
+    def worker(sentence):
+        sentence = sentence.strip();
+        curid = next(sentids);
+        tstart = time.time();
+        kBestParses = [];
+        ntokens = len(sentence.split());
+        if ntokens > max_length:
+            # temporary hack to make sure parser does not get
+            # killed for very long sentences;
+            elapsed = time.time()-tstart;
+            err = "Sentence too long (%d tokens). Might potentially run out of memory" \
+                %(ntokens);
+            print('%d\t%.4f\t%s' %(curid, elapsed, err), file=sys.stderr);
+            return elapsed, kBestParses;
+        try:
+            fresh_callbacks = [ \
+                ('PN', translation_pipeline.parseNames(grammar, language, sentence)), \
+                ('Symb', translation_pipeline.parseUnknown(grammar, language, sentence))];
+            for parseidx, parse in enumerate(parser(sentence, \
+                    heuristics=0, callbacks=fresh_callbacks)):
+                kBestParses.append((parse[0], str(parse[1]) if serializable \
+                    else parse[1]));
+                if parseidx == K-1:
+                    break;
+        except (pgf.ParseError, UnicodeEncodeError) as err:
+            # Both failures are reported the same way; parses collected
+            # before the error are still returned.
+            elapsed = time.time()-tstart;
+            print('%d\t%.4f\t%s' %(curid, elapsed, err), file=sys.stderr);
+            return elapsed, kBestParses;
+        elapsed = time.time()-tstart;
+        print('%d\t%.4f' %(curid, elapsed), file=sys.stderr);
+        return elapsed, kBestParses;
+    return worker;
def pgf_parse(args):
- grammar = pgf.readPGF(args.pgfgrammar);
- import translation_pipeline;
-
- preprocessor = lexer();
- inputSet = translation_pipeline.web_lexer(grammar, args.srclang, imap(preprocessor, args.inputstream) );
- outputPrinter = lambda X: "%f\t%s" %(X[0], str(X[1])); #operator.itemgetter(1);
- parser = getKBestParses(grammar, args.srclang, 1);
-
- sentidx = 0;
- for time, parsesBlock in imap(parser, inputSet):
- sentidx += 1;
- print >>args.outputstream, "%d\t%f\t%s" %(sentidx, time, str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else '');
- return;
+ grammar = pgf.readPGF(args.pgfgrammar);
+ preprocessor = Lexer().tokenize;
+ #if sys.version_info < (3, 0):
+ # args.inputstream = codecs.getreader('utf-8')(args.inputstream);
+ inputSet = map(preprocessor, args.inputstream);
+ web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize;
+ inputSet = map(web_preprocessor, inputSet);
+ outputPrinter = lambda X: "%f\t%s" %(X[0], str(X[1]));
+ parser = getKBestParses(grammar, args.srclang, 1);
+
+ sentidx = 0;
+ for time, parsesBlock in map(parser, inputSet):
+ sentidx += 1;
+ print("%d\t%f\t%s" %(sentidx, time, \
+ str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else ''), \
+ file=args.outputstream);
+ return;
def pgf_kparse(args):
- grammar = pgf.readPGF(args.pgfgrammar);
- import translation_pipeline;
-
- preprocessor = lexer();
- inputSet = translation_pipeline.web_lexer(grammar, args.srclang, imap(preprocessor, args.inputstream) );
- outputPrinter = printJohnsonRerankerFormat;
- parser = getKBestParses(grammar, args.srclang, args.K);
-
- sentidx = 0;
- for time, parsesBlock in imap(parser, inputSet):
- sentidx += 1;
- strParses = str(outputPrinter(parsesBlock));
- if not (strParses == '\n'):
- print >>args.outputstream, strParses;
- return;
+ grammar = pgf.readPGF(args.pgfgrammar);
+ preprocessor = Lexer().tokenize;
+ #if sys.version_info < (3, 0):
+ # args.inputstream = codecs.getreader('utf-8')(args.inputstream);
+ inputSet = map(preprocessor, args.inputstream);
+ web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize;
+ inputSet = map(web_preprocessor, inputSet);
+ outputPrinter = printJohnsonRerankerFormat;
+ parser = getKBestParses(grammar, args.srclang, args.K);
+
+ sentidx = 0;
+ for time, parsesBlock in map(parser, inputSet):
+ sentidx += 1;
+ strParses = str(outputPrinter(parsesBlock));
+ if not (strParses == '\n'):
+ print(strParses, file=args.outputstream);
+ return;
def pgf_linearize(args):
- grammar = pgf.readPGF(args.pgfgrammar);
- outputPrinter = postprocessor;
- inputSet = [];
- for line in args.inputstream:
- try:
- sentid, parsetime, parserepr = line.strip('\n').split('\t', 2);
- except ValueError:
- print line.strip();
- parseprob, abstree = parserepr.split('\t') if parserepr.strip() else (0, '');
- inputSet.append( (int(sentid), float(parsetime), float(parseprob), pgf.readExpr(abstree) if abstree else None) );
- linearizer = grammar.languages[args.tgtlang].linearize;
- for sentid, _, _, abstree in inputSet:
- if abstree:
- print >>args.outputstream, str(outputPrinter(linearizer(abstree)));
- else:
- print >>args.outputstream, "";
- return;
+    """Linearize one abstract tree per input line into args.tgtlang.
+
+    Each line of args.inputstream is split on tabs into sentence id,
+    parse time and a remainder holding probability and tree. One line
+    is written to args.outputstream per input line: the post-processed
+    linearization, or an empty line when there is no tree or the line
+    cannot be split into three fields.
+    """
+    grammar = pgf.readPGF(args.pgfgrammar);
+    def parse_line(line):
+        try:
+            sentid, parsetime, parserepr = line.strip('\n').split('\t', 2);
+        except ValueError:
+            # Report on sys.stderr and return a placeholder, so output
+            # lines stay aligned with input lines and the code does not
+            # fall through to the unbound parserepr.
+            print("Line not in proper format: %s" %(line), file=sys.stderr);
+            return (None, None, None, None);
+        parseprob, abstree = parserepr.split('\t') if parserepr.strip() \
+            else (0, '');
+        return (int(sentid), float(parsetime), float(parseprob), \
+            pgf.readExpr(abstree) if abstree else None);
-def pgf_klinearize(args):
- grammar = pgf.readPGF(args.pgfgrammar);
- outputPrinter = printMosesNbestFormat;
- inputSet = [(sentid, parsesBlock) for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)];
- sentIdsList = imap(itemgetter(0), inputSet);
- parsesBlocks = map(itemgetter(1), inputSet);
+    #if sys.version_info < (3, 0):
+    #    args.inputstream = codecs.getreader('utf-8')(args.inputstream);
+    inputSet = map(parse_line, args.inputstream);
+    outputPrinter = postprocessor;
+    linearizer = grammar.languages[args.tgtlang].linearize;
+    for sentid, _, _, abstree in inputSet:
+        if abstree:
+            print(str(outputPrinter(linearizer(abstree))), \
+                file=args.outputstream);
+        else:
+            print("", file=args.outputstream);
+    return;
- for transBlock in getKLinearizations(grammar, args.tgtlang, parsesBlocks, args.K):
- strTrans = str(outputPrinter(transBlock, sentIdsList));
- if strTrans:
- print >>args.outputstream, strTrans;
- return;
+def pgf_klinearize(args):
+ grammar = pgf.readPGF(args.pgfgrammar);
+ #if sys.version_info < (3, 0):
+ # args.inputstream = codecs.getreader('utf-8')(args.inputstream);
+ inputSet = [(sentid, parsesBlock) \
+ for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)];
+ outputPrinter = printMosesNbestFormat;
+ sentIdsList = map(itemgetter(0), inputSet);
+ parsesBlocks = map(itemgetter(1), inputSet);
+
+ for transBlock in getKLinearizations(grammar, args.tgtlang, parsesBlocks, args.K):
+ strTrans = str(outputPrinter(transBlock, sentIdsList));
+ if strTrans:
+ print(strTrans, file=args.outputstream);
+ return;
def cmdLineParser():
- argparser = argparse.ArgumentParser(prog='gf_utils.py', description='Examples for carrying out (K-best) parsing, translation and linearization using GF C runtime.');
-
- subparsers = argparser.add_subparsers();
- parser = subparsers.add_parser('parse', help='GF parsing of sentences');
- kparser = subparsers.add_parser('kparse', help='K-best GF parsing of sentences');
- linearizer = subparsers.add_parser('linearize', help='Linearize GF abstract syntax treess');
- klinearizer = subparsers.add_parser('klinearize', help='Linearize K-variants of GF abstract syntax trees');
-
- parser.set_defaults(func=pgf_parse);
- parser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
- help='PGF Grammar file');
- parser.add_argument('-p', '--start-sym', dest='startcat', required=False, \
- help='Start symbol in the grammar');
- parser.add_argument('-s', '--src-lang', dest='srclang', required=True, \
- help='Source language');
- parser.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \
- help='Input file') ;
- parser.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \
- help='Output file');
-
- kparser.set_defaults(func=pgf_kparse);
- kparser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
- help='PGF Grammar file');
- kparser.add_argument('-p', '--start-sym', dest='startcat', required=False, \
- help='Start symbol in the grammar');
- kparser.add_argument('-s', '--src-lang', dest='srclang', required=True, \
- help='Source language');
- kparser.add_argument('-K', dest='K', required=True, type=int, \
- help='K value for multiple parses');
- kparser.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \
- help='Input file');
- kparser.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \
- help='Output file');
-
- linearizer.set_defaults(func=pgf_linearize);
- linearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
- help='PGF Grammar file');
- linearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \
- help='Target language');
- linearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \
- help='Input file');
- linearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \
- help='Output file');
-
- klinearizer.set_defaults(func=pgf_klinearize);
- klinearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
- help='PGF Grammar file');
- klinearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \
- help='Target language');
- klinearizer.add_argument('-K', dest='K', required=True, type=int, \
- help='K value for multiple linearizations');
- klinearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', type=argparse.FileType(mode='r'), default=sys.stdin, \
- help='Input file');
- klinearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', type=argparse.FileType(mode='w'), default=sys.stdout, \
- help='Output file');
+ argparser = argparse.ArgumentParser(prog='gf_utils.py', \
+ description='Examples for carrying out (K-best) parsing, \
+ translation and linearization using GF C runtime.');
+
+ subparsers = argparser.add_subparsers();
+ parser = subparsers.add_parser('parse', help='GF parsing of sentences');
+ kparser = subparsers.add_parser('kparse', help='K-best GF parsing of sentences');
+ linearizer = subparsers.add_parser('linearize', help='Linearize GF abstract syntax treess');
+ klinearizer = subparsers.add_parser('klinearize', help='Linearize K-variants of GF abstract syntax trees');
+
+ parser.set_defaults(func=pgf_parse);
+ parser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
+ help='PGF Grammar file');
+ parser.add_argument('-p', '--start-sym', dest='startcat', required=False, \
+ help='Start symbol in the grammar');
+ parser.add_argument('-s', '--src-lang', dest='srclang', required=True, \
+ help='Source language');
+ parser.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+ type=argparse.FileType(mode='r'), default=sys.stdin, \
+ help='Input file') ;
+ parser.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+ type=argparse.FileType(mode='w'), default=sys.stdout, \
+ help='Output file');
+
+ kparser.set_defaults(func=pgf_kparse);
+ kparser.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
+ help='PGF Grammar file');
+ kparser.add_argument('-p', '--start-sym', dest='startcat', required=False, \
+ help='Start symbol in the grammar');
+ kparser.add_argument('-s', '--src-lang', dest='srclang', required=True, \
+ help='Source language');
+ kparser.add_argument('-K', dest='K', required=True, \
+ type=int, \
+ help='K value for multiple parses');
+ kparser.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+ type=argparse.FileType(mode='r'), default=sys.stdin, \
+ help='Input file');
+ kparser.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+ type=argparse.FileType(mode='w'), default=sys.stdout, \
+ help='Output file');
+
+ linearizer.set_defaults(func=pgf_linearize);
+ linearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
+ help='PGF Grammar file');
+ linearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \
+ help='Target language');
+ linearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+ type=argparse.FileType(mode='r'), default=sys.stdin, \
+ help='Input file');
+ linearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+ type=argparse.FileType(mode='w'), default=sys.stdout, \
+ help='Output file');
+
+ klinearizer.set_defaults(func=pgf_klinearize);
+ klinearizer.add_argument('-g', '--pgf', dest='pgfgrammar', required=True, \
+ help='PGF Grammar file');
+ klinearizer.add_argument('-t', '--tgt-lang', dest='tgtlang', required=True, \
+ help='Target language');
+ klinearizer.add_argument('-K', '--kbest', dest='K', required=True, \
+ type=int, \
+ help='K value for multiple linearizations');
+ klinearizer.add_argument('-i', '--input', dest='inputstream', nargs='?', \
+ type=argparse.FileType(mode='r'), default=sys.stdin, \
+ help='Input file');
+ klinearizer.add_argument('-o', '--output', dest='outputstream', nargs='?', \
+ type=argparse.FileType(mode='w'), default=sys.stdout, \
+ help='Output file');
+
+ return argparser;
- return argparser;
if __name__ == '__main__':
- args = cmdLineParser().parse_args(sys.argv[1:]);
- args.func(args);
+ args = cmdLineParser().parse_args(sys.argv[1:]);
+ args.func(args);
diff --git a/src/runtime/python/examples/translation_pipeline.py b/src/runtime/python/examples/translation_pipeline.py
index bfd8b5c94..e8dc92583 100644
--- a/src/runtime/python/examples/translation_pipeline.py
+++ b/src/runtime/python/examples/translation_pipeline.py
@@ -1,6 +1,18 @@
#!/usr/bin/env python
+# Python 2 and 3 compatible
+from __future__ import print_function
+
+"""
+"""
+
import argparse, codecs, copy, itertools, logging, math, operator, os, os.path, re, string, sys, time;
+try:
+ from itertools import imap as map;
+ from itertools import ifilter as filter;
+except ImportError:
+ pass;
+
import xml.etree.ElementTree as etree;
import pgf;
@@ -8,392 +20,402 @@ import gf_utils;
# http://snipplr.com/view/25657/indent-xml-using-elementtree/
def indentXMLNodes(elem, level=0):
- i = "\n" + level*" "
- if len(elem):
- if not elem.text or not elem.text.strip():
- elem.text = i + " "
- if not elem.tail or not elem.tail.strip():
- elem.tail = i
- for elem in elem:
- indentXMLNodes(elem, level+1)
- if not elem.tail or not elem.tail.strip():
- elem.tail = i
+ i = "\n" + level*" "
+ if len(elem):
+ if not elem.text or not elem.text.strip():
+ elem.text = i + " "
+ if not elem.tail or not elem.tail.strip():
+ elem.tail = i
+ for elem in elem:
+ indentXMLNodes(elem, level+1)
+ if not elem.tail or not elem.tail.strip():
+ elem.tail = i
else:
- if level and (not elem.tail or not elem.tail.strip()):
- elem.tail = i
+ if level and (not elem.tail or not elem.tail.strip()):
+ elem.tail = i
def readTranslationPipelineOptions(propsfile, default_namespace):
- with codecs.open(propsfile, 'r', 'utf-8') as infile:
- for line in infile:
- if not line.strip():
- continue;
- key, value = line.strip().split('=', 1);
- key, value = key.strip(), value.strip();
- if key == 'srclang': default_namespace.srclang = value;
- elif key == 'tgtlangs': default_namespace.tgtlangs = [val.strip() for val in ','.split(value)];
- elif key == 'input': default_namespace.input = value;
- elif key == 'format': default_namespace.format = value;
- elif key == 'exp_directory': default_namespace.exp_directory = value;
- else:
- #print >>sys.stderr, "Unknown option-%s found in props file. Ignoring and proceeding." %(key);
- logging.warning("Unknown option-%s found in props file. Ignoring and proceeding." %(key));
- continue;
- return default_namespace;
+    """Fill default_namespace from a UTF-8 'key=value' properties file.
+
+    Recognised keys are srclang, tgtlangs, input, format and
+    exp_directory; each is stored as the attribute of the same name.
+    tgtlangs is split on commas into a list of stripped names, and the
+    others are stored as stripped strings. Blank lines are skipped.
+    Unknown keys and lines without '=' are logged as warnings and
+    ignored. Returns default_namespace.
+    """
+    with codecs.open(propsfile, 'r', 'utf-8') as infile:
+        for line in infile:
+            line = line.strip();
+            if not line:
+                continue;
+            if '=' not in line:
+                # Previously this raised ValueError on tuple unpacking.
+                logging.warning("Ignoring props line without '=': %s" %(line));
+                continue;
+            key, value = [part.strip() for part in line.split('=', 1)];
+            if key == 'tgtlangs':
+                # Split the value on commas; ','.split(value) split the
+                # literal comma string and never produced the languages.
+                default_namespace.tgtlangs = [val.strip() for val in value.split(',')];
+            elif key in ('srclang', 'input', 'format', 'exp_directory'):
+                setattr(default_namespace, key, value);
+            else:
+                logging.warning("Unknown option-%s found in props file. Ignoring and proceeding." %(key));
+    return default_namespace;
def sgmReader(sgmDoc):
- root = sgmDoc.getroot();
- for element in root.iter():
- if element.text is not None and element.text.strip():
- yield element.text.strip().encode('utf-8');
+ root = sgmDoc.getroot();
+ for element in root.iter():
+ if element.text is not None and element.text.strip():
+ yield element.text.strip().encode('utf-8');
def addToSgm(sgmDoc, strItem):
- for node in sgmDoc.findall('.//seg'):
- if not node.text.strip():
- strItem = strItem.decode('utf-8');
- node.text = ' %s ' %(strItem if strItem.strip() else 'EMPTY');
- return;
- logging.error("No more nodes available for adding content");
- return;
+    """Store strItem in the first 'seg' node whose text is still blank.
+
+    The text is written padded with one space on each side; a blank
+    strItem is stored as 'EMPTY'. Byte strings are decoded as UTF-8
+    first. When every 'seg' node already has text an error is logged
+    and nothing is changed. Returns None.
+    """
+    for node in sgmDoc.findall('.//seg'):
+        # node.text may be None for an element with no text at all.
+        if not (node.text or '').strip():
+            # Only byte strings need decoding; Python 3 str has no
+            # decode() method.
+            if isinstance(strItem, bytes):
+                strItem = strItem.decode('utf-8');
+            node.text = ' %s ' %(strItem if strItem.strip() else 'EMPTY');
+            return;
+    logging.error("No more nodes available for adding content");
+    return;
def sgmWriter(sgmDoc):
- indentXMLNodes( sgmDoc.getroot() );
- return etree.tostring(sgmDoc.getroot(), encoding='utf-8', method='xml');
+ indentXMLNodes( sgmDoc.getroot() );
+ return etree.tostring(sgmDoc.getroot(), encoding='utf-8', method='xml');
def getXMLSkeleton(sgmDoc, tgtlang):
- skeletonDoc = copy.deepcopy(sgmDoc);
- root = skeletonDoc.getroot();
- root.tag = 'tstset';
- root.attrib['trlang'] = tgtlang[-3:];
- root.find('doc').attrib['sysid'] = tgtlang[:-3];
- for node in root.findall('.//seg'):
- node.text = '';
- return skeletonDoc;
+ skeletonDoc = copy.deepcopy(sgmDoc);
+ root = skeletonDoc.getroot();
+ root.tag = 'tstset';
+ root.attrib['trlang'] = tgtlang[-3:];
+ root.find('doc').attrib['sysid'] = tgtlang[:-3];
+ for node in root.findall('.//seg'):
+ node.text = '';
+ return skeletonDoc;
def pipeline_lexer(sentence):
- tokens = sentence.strip().split();
- #tokens = filter(None, re.split('(\W+)', sentence.strip()));
- n = len(tokens);
- idx = len(tokens)-1;
- while idx >= 0:
- if tokens[idx] in ".?!)":
- idx -= 1;
- else:
- break;
- tokens = tokens[:idx+1];
- idx = 0;
- while idx < len(tokens):
- if tokens[idx] in "'\"(":
- idx += 1;
- else:
- break;
- tokens = tokens[idx:];
- return ' '.join(tokens);
-
-def web_lexer(grammar, lang, sentences):
- for instance in sentences:
- tokensList = re.split('\s+?', instance.strip());
- for idx, token in enumerate(tokensList):
- if not token[0].isupper():
- continue;
- lowertoken = tokensList[idx].lower();
- count = 0;
- for analysis in grammar.languages[lang].lookupMorpho(lowertoken):
- count += 1;
- tokensList[idx] = lowertoken if count else token;
- for idx, token in enumerate(tokensList):
- if token.find('-') == -1:
- continue;
- count = 0;
- for analysis in grammar.languages[lang].lookupMorpho(token):
- count += 1;
- if count:
- continue;
- token = tokensList[idx].replace('-', '');
- for analysis in grammar.languages[lang].lookupMorpho(token):
- count += 1;
- if count:
- tokensList[idx] = token;
- continue;
- token = tokensList[idx].replace('-', ' ');
- yield ' '.join(tokensList);
+ tokens = sentence.strip().split();
+ #tokens = filter(None, re.split('(\W+)', sentence.strip()));
+ n = len(tokens);
+ idx = len(tokens)-1;
+ while idx >= 0:
+ if tokens[idx] in ".?!)":
+ idx -= 1;
+ else:
+ break;
+ tokens = tokens[:idx+1];
+ idx = 0;
+ while idx < len(tokens):
+ if tokens[idx] in "'\"(":
+ idx += 1;
+ else:
+ break;
+ tokens = tokens[idx:];
+ return ' '.join(tokens);
def clean_gfstrings(sentence):
- absFuncName = re.compile('\[[^]]+?\]');
- untranslatedEntries = {};
- for entry in re.findall(absFuncName, sentence):
- untranslatedEntries[entry] = untranslatedEntries.setdefault(entry, 0)+1;
- for entry in untranslatedEntries:
- while untranslatedEntries[entry] > 1:
- sentence = sentence.replace(entry, '', 1);
- untranslatedEntries[entry] -= 1;
- sentence = sentence.replace(entry, ' '.join(entry[1:-1].split('_')[:-1]) if entry.find('_') != -1 else '');
- return ' '.join( sentence.split() );
+ absFuncName = re.compile('\[[^]]+?\]');
+ untranslatedEntries = {};
+ for entry in re.findall(absFuncName, sentence):
+ untranslatedEntries[entry] = untranslatedEntries.setdefault(entry, 0)+1;
+ for entry in untranslatedEntries:
+ while untranslatedEntries[entry] > 1:
+ sentence = sentence.replace(entry, '', 1);
+ untranslatedEntries[entry] -= 1;
+ sentence = sentence.replace(entry, \
+ ' '.join(entry[1:-1].split('_')[:-1]) if entry.find('_') != -1 \
+ else '');
+ return ' '.join( sentence.split() );
def parseNames(grammar, language, sentence):
    """Build a parser callback that recognizes capitalized names.

    The returned ``callback(lin_idx, start)`` inspects ``sentence`` from
    offset ``start``.  It collects the run of whitespace-separated tokens that
    each begin with an upper-case character and returns ``(expr, 0, end)``,
    where ``end`` is the offset at which the recognized name stops, or
    ``None`` when no name starts at ``start``.

    For languages whose name ends in ``Eng`` the pronoun forms ``I`` and
    ``I'm`` are rejected and a trailing ``'s`` is cut off the candidate.
    If the grammar's morphological lexicon knows the candidate as a ``PN``,
    ``Weekday`` or ``Month`` that entry is used (``Language`` entries make the
    callback return ``None``); otherwise the candidate is wrapped as
    ``SymbPN (MkSymb "<name>")``.
    """
    def callback(lin_idx, start):
        cursor, end, at_token_start = start, len(sentence), True
        if cursor < end and not sentence[cursor].isupper():
            return None

        # Extend the span over consecutive capitalized tokens.
        while cursor < end:
            char = sentence[cursor]
            if char in string.whitespace:
                at_token_start = True
            elif at_token_start:
                if char.isupper():
                    at_token_start = False
                else:
                    # A token that is not capitalized ends the name; step
                    # back over the separator in front of it.
                    end = cursor - 1
                    break
            cursor += 1

        possible_name = sentence[start:end].strip()
        if not possible_name:
            return None

        if language.endswith('Eng'):
            if possible_name in ("I", "I'm"):
                return None
            if possible_name.endswith("'s"):
                # Strip the possessive marker and shrink the span with it.
                possible_name = possible_name[:-2].strip()
                end -= 2
                if not possible_name:
                    return None

        expr, prob = None, None
        for analysis in grammar.languages[language].lookupMorpho(possible_name):
            category = grammar.functionType(analysis[0]).cat
            # `None < number` raises TypeError on Python 3, so the first
            # analysis has to be accepted explicitly.
            if prob is None or prob < analysis[-1]:
                if category == "PN":
                    expr, prob = pgf.Expr(analysis[0], []), analysis[-1]
                elif category == "Weekday":
                    expr, prob = pgf.Expr("weekdayPN",
                                          [pgf.Expr(analysis[0], [])]), analysis[-1]
                elif category == "Month":
                    expr, prob = pgf.Expr("monthPN",
                                          [pgf.Expr(analysis[0], [])]), analysis[-1]
                elif category == "Language":
                    return None

        # generic named entity
        if expr is None:
            expr = pgf.Expr(possible_name)
            expr = pgf.Expr("MkSymb", [expr])
            expr = pgf.Expr("SymbPN", [expr])
        return (expr, 0, end)

    return callback
def parseUnknown(grammar, language, sentence):
    """Build a parser callback that accepts words missing from the lexicon.

    The returned ``callback(lin_idx, start)`` looks at the whitespace-delimited
    token of ``sentence`` that begins at offset ``start``.  If that token does
    not start with an upper-case character, begins at a token boundary and has
    no morphological analysis in ``grammar.languages[language]``, the callback
    returns ``(MkSymb "<token>", 0, end)`` with ``end`` the offset just past
    the token.  In every other case it returns ``None``.
    """
    def callback(lin_idx, start):
        end = len(sentence)
        if start >= end or sentence[start].isupper():
            return None

        # Only fire at the beginning of a token; this avoids segmentation
        # errors like may => ma_N + Symb y.  (`start > 0`, not `> 1`: a token
        # at offset 1 preceded by whitespace is a new token as well.)
        if start > 0 and not sentence[start - 1].isspace():
            return None

        for position in range(start, end):
            if sentence[position] in string.whitespace:
                end = position
                break

        unknown_word = sentence[start:end].strip()
        if not unknown_word:
            return None

        lexicon = grammar.languages[language]
        if any(True for _ in lexicon.lookupMorpho(unknown_word)):
            # The word is known to the grammar; leave it to the parser.
            return None

        expr = pgf.Expr("MkSymb", [pgf.Expr(unknown_word)])
        return (expr, 0, end)

    return callback
def parseTester(grammar, language, sentence):
    """Build a trivial parser callback that consumes one character.

    The returned ``callback(lin_idx, start)`` wraps the single character of
    ``sentence`` at offset ``start`` in a ``pgf.Expr`` and returns
    ``(expr, 0, start + 1)``; it returns ``None`` when ``start`` does not
    address a character of the sentence.  ``grammar`` and ``language`` are
    accepted for signature parity with the other callback factories and are
    not used.
    """
    def callback(lin_idx, start):
        # Reject negative offsets too: they would index from the end.
        if 0 <= start < len(sentence):
            return (pgf.Expr(sentence[start]), 0, start + 1)
        return None

    return callback
def translateWordsAsChunks(grammar, language, tgtlanguages, word):
    """Translate ``word`` through the first parse the grammar offers.

    ``word`` is parsed with the concrete syntax ``language``; the tree of the
    first parse is linearized into every language of ``tgtlanguages`` and the
    result is passed through ``gf_utils.postprocessor`` (an empty string is
    used when a language yields no linearization).

    Returns a list of ``(language, translation)`` pairs, or an empty list when
    parsing raises ``pgf.ParseError`` or produces no parse.
    """
    parser = grammar.languages[language].parse
    linearizers = dict((lang, grammar.languages[lang].linearize)
                       for lang in tgtlanguages)

    try:
        # Only the first parse is of interest.
        first_parse = next(iter(parser(word)), None)
    except pgf.ParseError:
        return []
    if first_parse is None:
        return []

    tree = first_parse[1]
    translations = []
    for lang in tgtlanguages:
        trans = linearizers[lang](tree)
        translations.append(
            (lang, gf_utils.postprocessor(trans.strip() if trans else '')))
    return translations
def translateWord(grammar, language, tgtlanguages, word):
    """Translate a single word, degrading gracefully when it is unknown.

    The strategies are tried in this order:

    1. ``translateWordsAsChunks`` (a regular parse of the word);
    2. a parse of the word in category ``Chunk``;
    3. if that parse raises ``pgf.ParseError``, the morphological analyses of
       the word and of its lower-cased form: the first analysed function that
       has a linearization in more than half of the target languages is used;
    4. the word itself, unchanged, for every target language.

    Returns a list of ``(language, translation)`` pairs.
    """
    chunk_translations = translateWordsAsChunks(grammar, language,
                                                tgtlanguages, word)
    if chunk_translations:
        return chunk_translations

    source = grammar.languages[language]
    try:
        first_parse = next(iter(source.parse(word, cat='Chunk')), None)
        if first_parse is not None:
            tree = first_parse[1]
            # NOTE(review): this function calls gf_utils.gf_postprocessor
            # while its siblings call gf_utils.postprocessor -- confirm that
            # both names exist in gf_utils.
            return [(lang, gf_utils.gf_postprocessor(
                         grammar.languages[lang].linearize(tree)))
                    for lang in tgtlanguages]
    except pgf.ParseError:
        lowerword = word.lower()
        analyses = list(source.lookupMorpho(word))
        if lowerword != word:
            # Looking up an already lower-case word twice would only
            # duplicate the analyses.
            analyses += source.lookupMorpho(lowerword)

        for morph in analyses:
            function = morph[0]
            supported = sum(
                1 for lang in tgtlanguages
                if grammar.languages[lang].hasLinearization(function))
            if supported > 0.5 * len(tgtlanguages):
                return [(lang, gf_utils.gf_postprocessor(
                             grammar.languages[lang].linearize(
                                 pgf.readExpr(function))))
                        for lang in tgtlanguages]

    return [(lang, word) for lang in tgtlanguages]
def translationByLookup(grammar, language, tgtlanguages, sentence):
    """Translate ``sentence`` piecewise when it cannot be parsed as a whole.

    The sentence is split on whitespace and kept in a work queue of token
    lists.  A list with several tokens is parsed as a phrase; when the parser
    raises ``pgf.ParseError`` the list is split around the token named in the
    error message and the three parts are queued in order.  A list with a
    single token is handed to ``translateWord``.

    This is a generator: once the queue is empty it yields one
    ``(language, translation)`` pair per entry of ``tgtlanguages``, the
    translation being the space-joined chunks collected for that language
    (an empty string when nothing was collected).
    """
    parser = grammar.languages[language].parse
    linearizers = dict((lang, grammar.languages[lang].linearize)
                       for lang in tgtlanguages)
    # Pre-seed every target so that empty input cannot raise KeyError below.
    transChunks = dict((lang, []) for lang in tgtlanguages)

    queue = [sentence.strip().split()]
    while queue:
        head = queue.pop(0)
        if not head:
            continue

        if len(head) == 1 and head[0].strip():
            for lang, wordchoice in translateWord(grammar, language,
                                                  tgtlanguages, head[0]):
                transChunks.setdefault(lang, []).append(
                    gf_utils.postprocessor(wordchoice))
            continue

        try:
            first_parse = next(iter(parser(' '.join(head))), None)
        except pgf.ParseError as err:
            # Exceptions have no `.message` attribute on Python 3.
            message = getattr(err, 'message', None) or str(err)
            words = message.strip().split()
            # The message is expected to end with the quoted offending token.
            unseenToken = words[-1][1:-1] if words else ''
            if unseenToken in head:
                idx = head.index(unseenToken)
                pieces = [head[:idx], [head[idx]], head[idx + 1:]]
            else:
                # The token cannot be located; go word by word so that the
                # queue is still guaranteed to shrink.
                pieces = [[token] for token in head]
            queue[0:0] = pieces
            continue

        if first_parse is None:
            continue
        tree = first_parse[1]
        for lang in tgtlanguages:
            linearization = linearizers[lang](tree)
            if linearization is None:
                transChunks.setdefault(lang, []).append(' ')
            else:
                transChunks.setdefault(lang, []).append(
                    gf_utils.postprocessor(linearization.strip()))

    for lang in tgtlanguages:
        yield (lang, ' '.join(transChunks[lang]))
def pipelineParsing(grammar, language, sentences, K=20):
    """Lazily parse every sentence and pair it with its parses.

    A parser is obtained from ``gf_utils.getKBestParses(grammar, language, K)``
    and applied to each item of ``sentences`` in turn.  The parser result is
    unpacked as a two-element sequence whose second element is the block of
    parses; the first element is discarded.

    This is a generator yielding ``(sentence, parsesBlock)`` pairs in input
    order.
    """
    parser = gf_utils.getKBestParses(grammar, language, K)
    # A plain loop keeps the pipeline lazy on Python 2 as well (where the
    # built-in zip/map are eager) and needs no tee'd copy of the input.
    for sent in sentences:
        _elapsed, parsesBlock = parser(sent)
        yield (sent, parsesBlock)
def translation_pipeline(props):
    """Translate an input document into the requested target languages.

    ``props`` is the option namespace built by ``cmdLineParser``; when
    ``props.propsfile`` is set it is first passed through
    ``readTranslationPipelineOptions``.  The function

    1. loads the grammar ``props.pgffile`` and resolves the source and target
       concrete syntaxes from their three-letter codes;
    2. lexes and parses every input sentence (``props.input`` or stdin);
    3. linearizes the parses into every target language, using
       ``translationByLookup`` for sentences that have no parse;
    4. writes one file per target language, plus one with the abstract
       trees, into ``props.exp_directory``.

    The process exits with status 1 when the source language is missing or
    unknown to the grammar, or when the format is neither ``sgm`` nor ``txt``.
    """
    if props.propsfile:
        props = readTranslationPipelineOptions(props.propsfile, props)

    # UGLY HACK FOR K-best translation: if K-best translation output format is only txt
    if props.bestK != 1:
        props.format = 'txt'

    if not os.path.isdir(props.exp_directory):
        logging.info("Creating output directory: %s", props.exp_directory)
        os.makedirs(props.exp_directory)

    if not props.srclang:
        logging.critical("Mandatory option source-lang missing. Can not determine source language.")
        sys.exit(1)

    grammar = pgf.readPGF(props.pgffile)

    sourceLanguage, targetLanguages = _selectLanguages(
        grammar, props.srclang, props.tgtlangs)
    if sourceLanguage is None:
        logging.critical("Source language %s is not available in the grammar.",
                         props.srclang)
        sys.exit(1)
    logging.info("Translating from %s", sourceLanguage)
    logging.info("Translating into the following languages: %s",
                 ','.join(targetLanguages))

    K = props.bestK if props.bestK != 1 else 20  # by default we look for 20 best parses
    bestK = props.bestK

    if not props.input:
        logging.info("Input file name missing. Reading input from stdin.")
        inputStream = sys.stdin
        outputPrefix = os.getpid()
    else:
        inputStream = codecs.open(props.input, 'r')
        outputPrefix = os.path.splitext(os.path.split(props.input)[1])[0]

    try:
        if props.format == 'sgm':
            inputDoc = etree.parse(inputStream)
            reader = sgmReader
            skeletonDoc = getXMLSkeleton
            addItem = addToSgm
            writer = sgmWriter
        elif props.format == 'txt':
            logging.info("Input format is txt. Assuming one-sentence-per-line format.")
            inputDoc = inputStream
            reader = lambda X: X
            skeletonDoc = lambda X, lang: list()
            addItem = lambda X, y: list.append(X, y)
            writer = lambda X: ('\n'.join(X) if bestK == 1 else
                                '\n'.join(map(gf_utils.printMosesNbestFormat, X)))
        else:
            logging.critical("Unsupported format: %s", props.format)
            sys.exit(1)

        translationBlocks = {}
        for tgtlang in targetLanguages + ['abstract']:
            translationBlocks[tgtlang] = skeletonDoc(inputDoc, tgtlang)

        preprocessor = pipeline_lexer
        postprocessor = clean_gfstrings

        logging.info("Parsing text in %s", sourceLanguage)
        # 1. Get Abstract Trees for sentences in source language.
        tokenized_sentences = map(preprocessor, reader(inputDoc))
        web_lexer = gf_utils.Lexer('Web', grammar, sourceLanguage).tokenize
        # Materialize the parses: the input is fully consumed here, so the
        # stream can be closed afterwards.
        absParses = list(pipelineParsing(grammar, sourceLanguage,
                                         map(web_lexer, tokenized_sentences), K))
    finally:
        if inputStream is not sys.stdin:
            inputStream.close()

    logging.info("Linearizing into %s", ','.join(targetLanguages))
    # 2. Linearize in all target Languages
    for sentence, parsesBlock in absParses:
        if not len(parsesBlock):
            # failed to parse;
            # translate using lookup
            for tgtlang, translation in translationByLookup(
                    grammar, sourceLanguage, targetLanguages, sentence):
                if bestK == 1:
                    addItem(translationBlocks[tgtlang], postprocessor(translation))
                else:
                    addItem(translationBlocks[tgtlang],
                            [((0,), postprocessor(translation))])
            addItem(translationBlocks['abstract'], '')
            continue

        translationBuffer = {}
        bestTranslationIdx = 0
        for tgtlang in targetLanguages:
            translationBuffer[tgtlang] = next(gf_utils.getKLinearizations(
                grammar, tgtlang, [parsesBlock], K=bestK))
            if bestK == 1:
                # Remember the deepest index any language needs to reach its
                # first non-empty translation.
                for tidx, translation in enumerate(translationBuffer[tgtlang]):
                    if postprocessor(translation[1]).strip():
                        if tidx > bestTranslationIdx:
                            bestTranslationIdx = tidx
                        break

        # The abstract entry does not depend on the target language, so it is
        # computed once (and is defined even without target languages).
        if bestK == 1:
            abstract = str(parsesBlock[bestTranslationIdx][1])
        else:
            abstract = parsesBlock

        for tgtlang in targetLanguages:
            candidates = translationBuffer[tgtlang]
            if bestK == 1:
                if len(candidates) > bestTranslationIdx:
                    translation = postprocessor(candidates[bestTranslationIdx][1])
                else:
                    # Keep the item a string, like every other 1-best entry;
                    # the txt writer joins the items with '\n'.
                    translation = ''
            else:
                translation = candidates if len(candidates) else []
            addItem(translationBlocks[tgtlang], translation)
        addItem(translationBlocks['abstract'], abstract)

    for tgtlang in targetLanguages + ['abstract']:
        suffix = tgtlang[-3:] if tgtlang != 'abstract' else 'abstract'
        outputFile = os.path.join(
            props.exp_directory,
            '%s-%s.%s' % (outputPrefix, suffix, props.format))
        logging.info("Writing translations for %s to %s", tgtlang, outputFile)
        with codecs.open(outputFile, 'w', encoding='utf-8') as outputStream:
            print(writer(translationBlocks[tgtlang]), file=outputStream)
    return


def _selectLanguages(grammar, srclang, tgtlangs):
    """Map three-letter language codes to concrete syntax names.

    Returns ``(sourceLanguage, targetLanguages)``.  ``sourceLanguage`` is the
    first concrete syntax whose name ends in ``srclang`` (``None`` if there is
    none).  ``targetLanguages`` lists the concrete syntaxes whose three-letter
    suffix is in ``tgtlangs`` or, when ``tgtlangs`` is empty, the suffix of
    any concrete syntax other than the source.
    """
    concreteNames = list(grammar.languages.keys())
    matches = [name for name in concreteNames if name[-3:] == srclang]
    if not matches:
        return None, []
    sourceLanguage = matches[0]

    if len(tgtlangs):
        target_langs = tgtlangs
    else:
        # A real list, not a filter() iterator: it is searched once per
        # concrete syntax below and must survive repeated `in` tests.
        target_langs = [name[-3:] for name in concreteNames
                        if name != sourceLanguage and name[-3:]]
    targetLanguages = [name for name in concreteNames
                       if name and name[-3:] in target_langs]
    return sourceLanguage, targetLanguages
def cmdLineParser():
    """Create the command-line parser of the translation pipeline.

    Returns an ``argparse.ArgumentParser`` whose parsed namespace carries the
    attributes ``pgffile`` (required), ``srclang``, ``tgtlangs``, ``input``,
    ``exp_directory`` (defaults to the current working directory), ``format``
    (``txt`` or ``sgm``), ``propsfile`` and ``bestK``.
    """
    argparser = argparse.ArgumentParser(
        prog='translation_pipeline.py',
        description='Run the GF translation pipeline on standard test-sets')
    argparser.add_argument(
        '-g', '--pgf', dest='pgffile', required=True,
        help='PGF grammar file to run the pipeline')
    argparser.add_argument(
        '-s', '--source', dest='srclang', default='',
        help='Source language of input sentences')
    argparser.add_argument(
        '-t', '--target', dest='tgtlangs', nargs='*', default=[],
        help='Target languages to linearize (default is all other languages)')
    argparser.add_argument(
        '-i', '--input', dest='input', default='',
        help='input file (default will accept STDIN)')
    argparser.add_argument(
        '-e', '--exp', dest='exp_directory', default=os.getcwd(),
        help='experiment directory to write translation files')
    argparser.add_argument(
        '-f', '--format', dest='format', default='txt', choices=['txt', 'sgm'],
        help='input file format (output files will be written in the same format)')
    argparser.add_argument(
        '-p', '--props', dest='propsfile', default='',
        help='properties file for the translation pipeline (specify the above arguments in a file)')
    argparser.add_argument(
        '-K', dest='bestK', type=int, default=1,
        help='K value for K-best translation')
    return argparser
if __name__ == '__main__':
    # Script entry point: enable INFO-level logging, read the options from
    # the command line and run the pipeline with them.
    logging.basicConfig(level='INFO')
    translation_pipeline(cmdLineParser().parse_args(sys.argv[1:]))