diff options
| author | aarne <aarne@chalmers.se> | 2010-06-01 22:48:43 +0000 |
|---|---|---|
| committer | aarne <aarne@chalmers.se> | 2010-06-01 22:48:43 +0000 |
| commit | b3c302ca6fa99abaa5cbc3ed69f138aecc9d7e98 (patch) | |
| tree | 219cec765f861782b3d67db699ab7227b59cc3a5 /examples/phrasebook | |
| parent | 83015a80184e4b2b1e34a4a7cd1b3832ec680d35 (diff) | |
updated phrasebook doc
Diffstat (limited to 'examples/phrasebook')
| -rw-r--r-- | examples/phrasebook/GreetingsSpa.gf | 3 | ||||
| -rw-r--r-- | examples/phrasebook/Implementation.html | 30 | ||||
| -rw-r--r-- | examples/phrasebook/Makefile | 2 | ||||
| -rw-r--r-- | examples/phrasebook/Ontology.html | 8 | ||||
| -rw-r--r-- | examples/phrasebook/WordsFin.gf | 4 | ||||
| -rw-r--r-- | examples/phrasebook/missing.txt | 2 | ||||
| -rw-r--r-- | examples/phrasebook/phrasebook.html | 466 | ||||
| -rw-r--r-- | examples/phrasebook/phrasebook.txt | 230 | ||||
| -rw-r--r-- | examples/phrasebook/picpic.jpg | bin | 0 -> 214926 bytes |
9 files changed, 657 insertions, 88 deletions
diff --git a/examples/phrasebook/GreetingsSpa.gf b/examples/phrasebook/GreetingsSpa.gf index 6008688f6..673bac85e 100644 --- a/examples/phrasebook/GreetingsSpa.gf +++ b/examples/phrasebook/GreetingsSpa.gf @@ -11,6 +11,9 @@ lin GDamn = ss "joder" ; GExcuse = ss "perdón" ; GExcusePol = ss "perdone" ; + GCongratulations = ss "felicitaciones" ; + GGoodLuck = ss "buena suerte" ; + GHappyBirthday = ss "feliz cumpleaños" ; GGoodMorning, GGoodDay = ss "buenos dÃas" ; GGoodEvening = ss "buenas tardes" ; GGoodNight = ss "buenas noches" ; diff --git a/examples/phrasebook/Implementation.html b/examples/phrasebook/Implementation.html index 41bab9f70..ff2275979 100644 --- a/examples/phrasebook/Implementation.html +++ b/examples/phrasebook/Implementation.html @@ -106,8 +106,10 @@ gfdoc - a rudimentary GF document generator. Too property = mkAP too_AdA (mkAP property) ; PropQuality property = mkAP property ; - ThePlace kind = placeNP the_Det kind ; - APlace kind = placeNP a_Det kind ; + ThePlace kind = let dd = if_then_else Det kind.isPl thePl_Det theSg_Det + in placeNP dd kind ; + APlace kind = let dd = if_then_else Det kind.isPl thePl_Det theSg_Det + in placeNP dd kind ; IMale, IFemale = mkPerson i_Pron ; YouFamMale, YouFamFemale = mkPerson youSg_Pron ; @@ -130,7 +132,11 @@ gfdoc - a rudimentary GF document generator. NNumeral n = mkCard <lin Numeral n : Numeral> ; - AHave p obj = mkCl p.name have_V2 obj ; + SHave p obj = mkS (mkCl p.name have_V2 obj) ; + SHaveNo p k = mkS negativePol (mkCl p.name have_V2 (mkNP aPl_Det k)) ; + SHaveNoMass p m = mkS negativePol (mkCl p.name have_V2 (mkNP m)) ; + QDoHave p obj = mkQS (mkQCl (mkCl p.name have_V2 obj)) ; + AHaveCurr p curr = mkCl p.name have_V2 (mkNP aPl_Det curr) ; ACitizen p n = mkCl p.name n ; ABePlace p place = mkCl p.name place.at ; @@ -166,12 +172,20 @@ These are used in Words for each language. 
} ; NPPlace : Type = {name : NP ; at : Adv ; to : Adv} ; - CNPlace : Type = {name : CN ; at : Prep ; to : Prep} ; + CNPlace : Type = {name : CN ; at : Prep ; to : Prep; isPl : Bool} ; mkCNPlace : CN -> Prep -> Prep -> CNPlace = \p,i,t -> { name = p ; at = i ; - to = t + to = t ; + isPl = False + } ; + + mkCNPlacePl : CN -> Prep -> Prep -> CNPlace = \p,i,t -> { + name = p ; + at = i ; + to = t ; + isPl = True } ; placeNP : Det -> CNPlace -> NPPlace = \det,kind -> @@ -344,7 +358,7 @@ Means of transportation Actions: the predication patterns are very often language-dependent. <pre> - AHasAge p num = mkCl p.name (mkNP (mkNP num L.year_N) (mkAdv "old")); + AHasAge p num = mkCl p.name (mkNP (mkNP num L.year_N) (ParadigmsEng.mkAdv "old")); AHasChildren p num = mkCl p.name have_V2 (mkNP num L.child_N) ; AHasRoom p num = mkCl p.name have_V2 (mkNP (mkNP a_Det (mkN "room")) (SyntaxEng.mkAdv for_Prep (mkNP num (mkN "person")))) ; @@ -456,10 +470,10 @@ auxiliaries mkNPDay day (SyntaxEng.mkAdv on_Prep day) (SyntaxEng.mkAdv on_Prep (mkNP a_Quant plNum (mkCN (mkN d)))) ; - mkCompoundPlace : Str -> Str -> Str -> {name : CN ; at : Prep ; to : Prep} = \comp, p, i -> + mkCompoundPlace : Str -> Str -> Str -> {name : CN ; at : Prep ; to : Prep; isPl : Bool} = \comp, p, i -> mkCNPlace (mkCN (P.mkN comp (mkN p))) (P.mkPrep i) to_Prep ; - mkPlace : Str -> Str -> {name : CN ; at : Prep ; to : Prep} = \p,i -> + mkPlace : Str -> Str -> {name : CN ; at : Prep ; to : Prep; isPl : Bool} = \p,i -> mkCNPlace (mkCN (mkN p)) (P.mkPrep i) to_Prep ; open_Adv = P.mkAdv "open" ; diff --git a/examples/phrasebook/Makefile b/examples/phrasebook/Makefile index f0dc1826d..4e36e2988 100644 --- a/examples/phrasebook/Makefile +++ b/examples/phrasebook/Makefile @@ -29,7 +29,7 @@ doc: rm -f Ontology.gf cat SentencesI.gf WordsEng.gf >Implementation.gf gfdoc Implementation.gf - txt2tags -thtml phrasebook.txt + txt2tags -thtml --toc phrasebook.txt rm -f Ontology.gf Implementation.gf upload:: Phrasebook.pgf diff 
--git a/examples/phrasebook/Ontology.html b/examples/phrasebook/Ontology.html index 0765ac4e0..48059049a 100644 --- a/examples/phrasebook/Ontology.html +++ b/examples/phrasebook/Ontology.html @@ -147,12 +147,16 @@ Determiners. Actions are typically language-dependent, not only lexically but also structurally. However, these ones are mostly functorial. <pre> - AHave : Person -> Object -> Action ; -- you have pizzas + SHave : Person -> Object -> Sentence ; -- you have beer + SHaveNo : Person -> Kind -> Sentence ; -- you have no apples + SHaveNoMass : Person -> MassKind -> Sentence ; -- you have no beer + QDoHave : Person -> Object -> Question ; -- do you have beer + AHaveCurr : Person -> Currency -> Action ; -- you have dollars ACitizen : Person -> Citizenship -> Action ; -- you are Swedish ABePlace : Person -> Place -> Action ; -- you are in the bar - ByTransp : Transport -> ByTransport ; -- by bus + ByTransp : Transport -> ByTransport ; -- by bus } </pre> diff --git a/examples/phrasebook/WordsFin.gf b/examples/phrasebook/WordsFin.gf index 29494ccb2..0e4e7d14c 100644 --- a/examples/phrasebook/WordsFin.gf +++ b/examples/phrasebook/WordsFin.gf @@ -208,7 +208,9 @@ concrete WordsFin of Words = SentencesFin ** mkQS (mkQCl (mkIP which_IDet trans.name) (mkVP (mkVP L.go_V) place.to)) ; IsTranspPlace trans place = - mkQS (mkQCl (E.AdvPredNP place.to L.go_V (E.PartCN (trans.name)))) ; + mkQS (mkQCl (mkCl (mkVP (mkVP (mkVP (mkV "päästä")) trans.by) place.to))) ; + -- pääseekö keskustaan bussilla + -- mkQS (mkQCl (E.AdvPredNP place.to L.go_V (E.PartCN (trans.name)))) ; -- meneekö keskustaan bussia -- modifiers of places diff --git a/examples/phrasebook/missing.txt b/examples/phrasebook/missing.txt index e05b4c3c2..88a998dfb 100644 --- a/examples/phrasebook/missing.txt +++ b/examples/phrasebook/missing.txt @@ -11,5 +11,5 @@ PhrasebookIta : PhrasebookNor : PhrasebookPol : PhrasebookRon : -PhrasebookSpa : GCongratulations GGoodLuck GHappyBirthday +PhrasebookSpa : PhrasebookSwe : 
diff --git a/examples/phrasebook/phrasebook.html b/examples/phrasebook/phrasebook.html index fae61468a..2d36e5fc0 100644 --- a/examples/phrasebook/phrasebook.html +++ b/examples/phrasebook/phrasebook.html @@ -2,6 +2,7 @@ <HTML> <HEAD> <META NAME="generator" CONTENT="http://txt2tags.sf.net"> +<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> <TITLE>MOLTO Multilingual Phrasebook</TITLE> </HEAD><BODY BGCOLOR="white" TEXT="black"> <P ALIGN="center"><CENTER><H1>MOLTO Multilingual Phrasebook</H1> @@ -10,6 +11,25 @@ Showcase for project FP7-ICT-247914, Deliverable D10.2. </FONT></CENTER> +<P></P> +<HR NOSHADE SIZE=1> +<P></P> + <UL> + <LI><A HREF="#toc1">Purpose</A> + <LI><A HREF="#toc2">Points illustrated</A> + <LI><A HREF="#toc3">Ontology</A> + <LI><A HREF="#toc4">Files</A> + <LI><A HREF="#toc5">To Do</A> + <LI><A HREF="#toc6">How to contribute</A> + <LI><A HREF="#toc7">Effort and cost</A> + <LI><A HREF="#toc8">Example-based grammar writing prototype</A> + <LI><A HREF="#toc9">Conclusions (tentative)</A> + <LI><A HREF="#toc10">Acknowledgements</A> + </UL> + +<P></P> +<HR NOSHADE SIZE=1> +<P></P> <P> <HR> <font size=-1> @@ -18,6 +38,8 @@ Showcase for project FP7-ICT-247914, Deliverable D10.2. History </P> <UL> +<LI>2 June. Version 1.0 released! +<LI>29 May. Link to Google translate with the current language pair and phrase. <LI>27 May. Polish added. <LI>26 May. 
Version 0.9: Catalan added, mass/count noun distinction to reduce overgeneration, @@ -49,33 +71,34 @@ History </font> <HR> </P> +<A NAME="toc1"></A> <H1>Purpose</H1> <P> This phrasebook is a program for translating touristic phrases -between the 15 European languages included in the +between 14 European languages included in the <A HREF="http://www.molto-project.eu">MOLTO</A> project (Multilingual On-Line Translation): </P> <UL> <LI>Bulgarian, Catalan, Danish, Dutch, English, Finnish, French, German, Italian, Norwegian, - Polish, Romanian, Russian, Spanish, Swedish + Polish, Romanian, Spanish, Swedish </UL> <P> It is implemented by using the GF programming language (<A HREF="http://grammaticalframework.org">Grammatical Framework</A>). -It is the first demo for the MOLTO project, released in the third month (by June 2010) -but to be updated in the course of the project. +It is the first demo for the MOLTO project, released in the third month (by June 2010). +The first version is a very small system, but it will be extended in the course of the project. </P> <P> -The phrasebook has the following requirements: +The phrasebook has the following requirement specification: </P> <UL> <LI>high quality: reliable translations to express yourself in any language <LI>translation between all pairs of languages <LI>runnable in web browsers -<LI>runnable on mobile phones (also off-line: forthcoming for Android phones) +<LI>runnable on mobile phones (forthcoming: Android phones) <LI>easily extensible by new words (forthcoming: semi-automatic extensions by users) </UL> @@ -84,39 +107,91 @@ The phrasebook is available as open-source software, licensed under GNU LGPL. 
The source code resides in <A HREF="http://code.haskell.org/gf/examples/phrasebook/"><CODE>code.haskell.org/gf/examples/phrasebook/</CODE></A> </P> +<A NAME="toc2"></A> +<H1>Points illustrated</H1> <P> -Current status (27 May 2010): +Interlingua-based translation </P> <UL> -<LI>small but useful coverage in abstract syntax -<LI>reasonable implementations for all MOLTO languages except Russian -<LI>works on web browsers calling a server -<LI>web service not yet released, but preliminarily available in - <A HREF="http://www.grammaticalframework.org/demos/phrasebook/">http://www.grammaticalframework.org/demos/phrasebook/</A> +<LI>we translate meanings, rather than words </UL> -<H1>Points illustrated</H1> <P> -Interlingua-based translation. +Incremental parsing </P> +<UL> +<LI>the user is at every point guided by the list of possible next words +</UL> + <P> -Incremental parsing. +The use of resource grammars and functors </P> +<UL> +<LI>the translator was implemented on top of an earlier linguistic knowledge base, + the <A HREF="http://grammaticalframework.com/lib">GF Resource Grammar Library</A> +</UL> + <P> -The use of resource grammars and functors. +Example-based grammar writing and grammar induction from statistical models +(<A HREF="http://translate.google.com">Google translate</A>) </P> +<UL> +<LI>many of the grammars were created semi-automatically by generalization from + examples +</UL> + <P> -Example-based grammar writing and grammar induction from statistical models (Google). +Compile-time transfer: especially, in Action in Words </P> +<UL> +<LI>the structural differences between languages are treated at compile time, + for maximal run-time efficiency +</UL> + <P> -Compile-time transfer: especially, in Action in Words. 
+Quasi-incremental translation: many basic types are also used as phrases </P> +<UL> +<LI>one can translate both words and complete sentences, and get intermediate results +</UL> + <P> -Quasi-incremental translation: many basic types are also used as phrases. +Disambiguation, esp. of politeness distinctions </P> +<UL> +<LI>if a phrase has many translations, each of them is shown and given an explanation + (currently just in English, later in any source language) +</UL> + <P> -Disambiguation, esp. of politeness distinctions. +Fall-back to statistical translation </P> +<UL> +<LI>currently just a link to Google translate (forthcoming: tailor-made statistical models) +</UL> + +<P> +Feed-back from users +</P> +<UL> +<LI>you are welcome to send comments, bug reports, and better translation suggestions! +</UL> + +<P> +The level of skills involved in grammar development +</P> +<UL> +<LI>testing different configurations (see table below) +</UL> + +<P> +Grammar testing +</P> +<UL> +<LI>use of treebanks with guided random generation for initial evaluation and regression testing +</UL> + +<A NAME="toc3"></A> <H1>Ontology</H1> <P> The abstract syntax defines the <B>ontology</B> behind the phrasebook. @@ -128,6 +203,7 @@ and <A HREF="http://code.haskell.org/gf/examples/phrasebook/Words.gf"><CODE>Words.gf</CODE></A> by <CODE>make doc</CODE>. </P> +<A NAME="toc4"></A> <H1>Files</H1> <P> <CODE>Sentences</CODE>: general syntactic structures implementable in a uniform way. 
@@ -164,18 +240,9 @@ Here is the module structure as produced in GF by <P> <IMG ALIGN="middle" SRC="pgraph.png" BORDER="0" ALT=""> </P> +<A NAME="toc5"></A> <H1>To Do</H1> <P> -Improved translation interface -</P> -<UL> -<LI>a nicer way to show disambiguation (maybe hidden by default) -</UL> - -<P> -Complete the missing words and phrases -</P> -<P> Disambiguation grammars for other languages than English </P> <P> @@ -183,20 +250,15 @@ Extend the abstract lexicon in <CODE>Words</CODE> by hand or (semi)automatically </P> <UL> <LI>food stuff -<LI>languages <LI>places +<LI>actions </UL> <P> -Link to Google translate, for fall-back and for comparison -</P> -<P> -Feedback facility in the UI -</P> -<P> -Customizable distribution: make your own selection of the 2^15 language subsets +Customizable phone distribution: make your own selection of the 2^15 language subsets when downloading the phrasebook to a phone </P> +<A NAME="toc6"></A> <H1>How to contribute</H1> <P> The basic things "everyone" can do is @@ -253,15 +315,337 @@ Here are the steps to follow for contributors: <LI>Don't compromise quality to gain coverage: <I>non multa sed multum!</I> </UL> -<H2>Acknowledgements</H2> +<A NAME="toc7"></A> +<H1>Effort and cost</H1> +<TABLE BORDER="1" CELLPADDING="4"> +<TR> +<TH>Language</TH> +<TH>Grammarian's language skills</TH> +<TH>Grammarian's GF skills</TH> +<TH>Informant used for development</TH> +<TH>Informant used for testing</TH> +<TH>Use of external tools</TH> +<TH>Impact of external tools</TH> +<TH>Changes on the resource grammar</TH> +<TH COLSPAN="2">Development time</TH> +</TR> +<TR> +<TD>Bulgarian</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">?</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">##</TD> +</TR> +<TR> +<TD>Catalan</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">-</TD> +<TD 
ALIGN="center">-</TD> +<TD ALIGN="center">?</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">#</TD> +</TR> +<TR> +<TD>Danish</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">##</TD> +<TD ALIGN="center">##</TD> +<TD ALIGN="center">##</TD> +</TR> +<TR> +<TD>Dutch</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">##</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">##</TD> +</TR> +<TR> +<TD>English</TD> +<TD ALIGN="center">##</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">_</TD> +<TD ALIGN="center">#</TD> +</TR> +<TR> +<TD>Finnish</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">?</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">##</TD> +</TR> +<TR> +<TD>French</TD> +<TD ALIGN="center">##</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">?</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">#</TD> +</TR> +<TR> +<TD>German</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">##</TD> +<TD ALIGN="center">##</TD> +<TD ALIGN="center">###</TD> +</TR> +<TR> +<TD>Italian</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">?</TD> +<TD ALIGN="center">##</TD> +<TD ALIGN="center">##</TD> +</TR> +<TR> +<TD>Norwegian</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">-</TD> 
+<TD ALIGN="center">+</TD> +<TD ALIGN="center">##</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">##</TD> +</TR> +<TR> +<TD>Polish</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">##</TD> +</TR> +<TR> +<TD>Romanian</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">###</TD> +</TR> +<TR> +<TD>Spanish</TD> +<TD ALIGN="center">##</TD> +<TD ALIGN="center">#</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">?</TD> +<TD ALIGN="center">_</TD> +<TD ALIGN="center">##</TD> +</TR> +<TR> +<TD>Swedish</TD> +<TD ALIGN="center">##</TD> +<TD ALIGN="center">###</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">+</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">?</TD> +<TD ALIGN="center">-</TD> +<TD ALIGN="center">##</TD> +</TR> +</TABLE> + +<P> +Explanation on scores +</P> +<UL> +<LI>Grammarian's language skills + <UL> + <LI>- : no skills + <LI># : passive knowledge + <LI>## : fluent non-native + <LI>### : native speaker + </UL> +</UL> + +<UL> +<LI>Grammarian's GF skills + <UL> + <LI>- : no skills + <LI># : basic skills (2-day GF tutorial) + <LI>## : medium skills (previous experience of similar task) + <LI>### : advanced skills (resource grammar writer/substantial contributor) + </UL> +</UL> + +<UL> +<LI>Informant used for development/Informant needed for testing/Use of external tools + <UL> + <LI>- : no + <LI>+ : yes + </UL> +</UL> + +<UL> +<LI>Impact of external tools + <UL> + <LI>? 
: not investigated + <LI>- : no effect on the Phrasebook + <LI># : small impact (literal translation, simple idioms) + <LI>## : medium effect (translation of more forms of words, contextual preposition) + <LI>### : great effect (no extra work needed, translations are correct) + </UL> +</UL> + +<UL> +<LI>Changes on the resource grammars + <UL> + <LI>- : no changes + <LI># : 1-3 minor changes + <LI>## : 4-10 minor changes, 1-3 medium changes + <LI>### : >10 changes of any kind + </UL> +</UL> + +<UL> +<LI>Overall effort (including extra work on resource grammars) + <UL> + <LI># : less than 8 person hours + <LI>## : 8-24 person hours + <LI>### : >24 person hours + </UL> +</UL> + +<A NAME="toc8"></A> +<H1>Example-based grammar writing prototype</H1> +<P> +The figure presents the process of creating a Phrasebook using an example-based +approach for the language X, where X = {Danish, Dutch, German, Norwegian}. +</P> +<P> +<IMG ALIGN="middle" SRC="picpic.jpg" BORDER="0" ALT=""> +</P> +<UL> +<LI>the first step assumes an analysis of the resource grammar and extracts the necessary + information that functions that build new lexical entries would need. + A model is built so that the proper forms of the word can be rendered, + and additional information, such as gender, can be inferred. The script applies + these rules to each entry that we want to translate into the target language, and + one obtains a set of constructions. +<LI>they are furthermore given to an external translator tool (Google translate) + or a native speaker for translation. One needs the configuration file even if the + translator is human, because formal knowledge of grammar is not assumed. +<LI>the translations into the target language are further more processed in order to + build the linearizations of the categories first, decoding the information received. + Furthermore, having the words in the lexicon, one can parse the translations of + functions with the GF parser and generalize from that. 
+<LI>the resulting grammar is tested with the aid of a script that generates + constructions covering all the functions and categories from the grammar, along + with some other constructions that proved to be problematic in some language. + The result of the script contains for each construction in the target language + its English correspondent and the abstract syntax tree. A native speaker + evaluates the results and if corrections are needed, the algorithm runs again + with the new examples. Depending on the language skills of the grammar writer, + the changes can be made directly into the GF files, and the correct examples + given by the native informant are just kept for validating the results. + The algorithm is repeated as long as corrections are needed. +</UL> + +<P> +The time needed for preparing the configuration files for a grammar will not be needed +in the future, since the files are reusable for other applications. +The time for the second step can be saved if automatic tools, like Google translate +are used. This is only possible in languages with a simpler morphology and syntax +and large corpora available. +Good results were obtained for German and Dutch with Google translate, but for +languages like Romanian or Polish, which are both complex and lack enough resources, +the results are discouraging. +</P> +<P> +If the statistical oracle works well, the only step where the presence of a human +translator is needed is the evaluation and feedback step. An average of 4 hours per +round and 2 rounds were needed on average for the languages for which we performed +the experiment. It is possible that more effort is needed for more complex languages. +</P> +<A NAME="toc9"></A> +<H1>Conclusions (tentative)</H1> +<P> +The grammarian need not be a native speaker of the language. +</P> +<P> +For many languages, the grammarian need not even know the language - native informants are +enough. +</P> +<P> +However, evaluation by native speakers is necessary. 
+</P> +<P> +Correct and idiomatic translations are possible. +</P> +<P> +A typical development time was 2-3 person working days per language. +</P> +<P> +Google translate helps in bootstrapping grammars, but must be checked. +</P> +<UL> +<LI>in particular, unreliable for morphologically rich languages +</UL> + +<P> +Resource grammars should give some more support +</P> +<UL> +<LI>higher-level access to constructions like negative expressions +<LI>large-scale morphological lexica +</UL> + +<A NAME="toc10"></A> +<H1>Acknowledgements</H1> <P> The Phrasebook has been built in the MOLTO project funded by the European Commission. </P> <P> The authors are grateful to their native speaker informants helping to bootstrap and evaluate -the grammars: Richard Bubel, Grégoire Détrez, Michal Palka, Willard Rafnsson,... +the grammars: +Richard Bubel, +Grégoire Détrez, +Karin Keijzer, +Michał Pałka, +Willard Rafnsson, +Nick Smallbone. </P> <!-- html code generated by txt2tags 2.5 (http://txt2tags.sf.net) --> -<!-- cmdline: txt2tags -thtml phrasebook.txt --> +<!-- cmdline: txt2tags -thtml -\-toc phrasebook.txt --> </BODY></HTML> diff --git a/examples/phrasebook/phrasebook.txt b/examples/phrasebook/phrasebook.txt index 7226ae1b1..d7bfa162d 100644 --- a/examples/phrasebook/phrasebook.txt +++ b/examples/phrasebook/phrasebook.txt @@ -3,6 +3,8 @@ Krasimir Angelov, Olga Caprotti, Ramona Enache, Thomas Hallgren, Inari Listenmaa Showcase for project FP7-ICT-247914, Deliverable D10.2. +%!Encoding:utf-8 + %!postproc(html): #HR <HR> %!postproc(html): #BSMALL <font size=-1> %!postproc(html): #ESMALL </font> @@ -14,6 +16,8 @@ Showcase for project FP7-ICT-247914, Deliverable D10.2. #BSMALL History +- 2 June. Version 1.0 released! +- 29 May. Link to Google translate with the current language pair and phrase. - 27 May. Polish added. - 26 May. 
Version 0.9: Catalan added, mass/count noun distinction to reduce overgeneration, @@ -46,24 +50,24 @@ History =Purpose= This phrasebook is a program for translating touristic phrases -between the 15 European languages included in the +between 14 European languages included in the [MOLTO http://www.molto-project.eu] project (Multilingual On-Line Translation): - Bulgarian, Catalan, Danish, Dutch, English, Finnish, French, German, Italian, Norwegian, - Polish, Romanian, Russian, Spanish, Swedish + Polish, Romanian, Spanish, Swedish It is implemented by using the GF programming language ([Grammatical Framework http://grammaticalframework.org]). -It is the first demo for the MOLTO project, released in the third month (by June 2010) -but to be updated in the course of the project. +It is the first demo for the MOLTO project, released in the third month (by June 2010). +The first version is a very small system, but it will be extended in the course of the project. -The phrasebook has the following requirements: +The phrasebook has the following requirement specification: - high quality: reliable translations to express yourself in any language - translation between all pairs of languages - runnable in web browsers -- runnable on mobile phones (also off-line: forthcoming for Android phones) +- runnable on mobile phones (forthcoming: Android phones) - easily extensible by new words (forthcoming: semi-automatic extensions by users) @@ -72,30 +76,57 @@ The source code resides in [``code.haskell.org/gf/examples/phrasebook/`` http://code.haskell.org/gf/examples/phrasebook/] -Current status (27 May 2010): -- small but useful coverage in abstract syntax -- reasonable implementations for all MOLTO languages except Russian -- works on web browsers calling a server -- web service not yet released, but preliminarily available in - http://www.grammaticalframework.org/demos/phrasebook/ +=Points illustrated= + +Interlingua-based translation +- we translate meanings, rather than words 
-=Points illustrated= +Incremental parsing +- the user is at every point guided by the list of possible next words + + +The use of resource grammars and functors +- the translator was implemented on top of an earlier linguistic knowledge base, + the [GF Resource Grammar Library http://grammaticalframework.com/lib] + + +Example-based grammar writing and grammar induction from statistical models +([Google translate http://translate.google.com]) +- many of the grammars were created semi-automatically by generalization from + examples + + +Compile-time transfer: especially, in Action in Words +- the structural differences between languages are treated at compile time, + for maximal run-time efficiency + + +Quasi-incremental translation: many basic types are also used as phrases +- one can translate both words and complete sentences, and get intermediate results + + +Disambiguation, esp. of politeness distinctions +- if a phrase has many translations, each of them is shown and given an explanation + (currently just in English, later in any source language) + -Interlingua-based translation. +Fall-back to statistical translation +- currently just a link to Google translate (forthcoming: tailor-made statistical models) -Incremental parsing. -The use of resource grammars and functors. +Feed-back from users +- you are welcome to send comments, bug reports, and better translation suggestions! -Example-based grammar writing and grammar induction from statistical models (Google). -Compile-time transfer: especially, in Action in Words. +The level of skills involved in grammar development +- testing different configurations (see table below) -Quasi-incremental translation: many basic types are also used as phrases. -Disambiguation, esp. of politeness distinctions. 
+Grammar testing +- use of treebanks with guided random generation for initial evaluation and regression testing + @@ -146,25 +177,15 @@ Here is the module structure as produced in GF by =To Do= -Improved translation interface -- a nicer way to show disambiguation (maybe hidden by default) - - -Complete the missing words and phrases - Disambiguation grammars for other languages than English Extend the abstract lexicon in ``Words`` by hand or (semi)automatically for - food stuff -- languages - places +- actions -Link to Google translate, for fall-back and for comparison - -Feedback facility in the UI - -Customizable distribution: make your own selection of the 2^15 language subsets +Customizable phone distribution: make your own selection of the 2^15 language subsets when downloading the phrasebook to a phone @@ -214,10 +235,151 @@ Here are the steps to follow for contributors: - Don't compromise quality to gain coverage: //non multa sed multum!// -==Acknowledgements== + +=Effort and cost= + +|| Language | Grammarian's language skills | Grammarian's GF skills | Informant used for development | Informant used for testing | Use of external tools | Impact of external tools | Changes on the resource grammar | Development time || +| Bulgarian | ### | ### | - | - | - | ? | # | ## | +| Catalan | ### | ### | - | - | - | ? | # | # | +| Danish | - | ### | + | + | + | ## | ## | ## | +| Dutch | - | ### | + | + | + | ## | # | ## | +| English | ## | ### | - | + | - | - | _ | # | +| Finnish | ### | ### | - | - | - | ? | # | ## | +| French | ## | ### | - | + | - | ? | # | # | +| German | # | ### | + | + | + | ## | ## | ### | +| Italian | ### | # | - | - | - | ? | ## | ## | +| Norwegian | # | ### | + | - | + | ## | # | ## | +| Polish | ### | ### | + | + | + | # | # | ## | +| Romanian | ### | ### | - | - | + | # | ### | ### | +| Spanish | ## | # | - | - | - | ? | _ | ## | +| Swedish | ## | ### | - | + | - | ? 
| - | ## | + + +Explanation on scores + +- Grammarian's language skills + - - : no skills + - # : passive knowledge + - ## : fluent non-native + - ### : native speaker + + +- Grammarian's GF skills + - - : no skills + - # : basic skills (2-day GF tutorial) + - ## : medium skills (previous experience of similar task) + - ### : advanced skills (resource grammar writer/substantial contributor) + + +- Informant used for development/Informant needed for testing/Use of external tools + - - : no + - + : yes + + +- Impact of external tools + - ? : not investigated + - - : no effect on the Phrasebook + - # : small impact (literal translation, simple idioms) + - ## : medium effect (translation of more forms of words, contextual preposition) + - ### : great effect (no extra work needed, translations are correct) + + +- Changes on the resource grammars + - - : no changes + - # : 1-3 minor changes + - ## : 4-10 minor changes, 1-3 medium changes + - ### : >10 changes of any kind + + +- Overall effort (including extra work on resource grammars) + - # : less than 8 person hours + - ## : 8-24 person hours + - ### : >24 person hours + + +=Example-based grammar writing prototype= + +The figure presents the process of creating a Phrasebook using an example-based +approach for the language X, where X = {Danish, Dutch, German, Norwegian}. + +[picpic.jpg] + +- the first step assumes an analysis of the resource grammar and extracts the necessary + information that functions that build new lexical entries would need. + A model is built so that the proper forms of the word can be rendered, + and additional information, such as gender, can be inferred. The script applies + these rules to each entry that we want to translate into the target language, and + one obtains a set of constructions. +- they are furthermore given to an external translator tool (Google translate) + or a native speaker for translation. 
One needs the configuration file even if the + translator is human, because formal knowledge of grammar is not assumed. +- the translations into the target language are further more processed in order to + build the linearizations of the categories first, decoding the information received. + Furthermore, having the words in the lexicon, one can parse the translations of + functions with the GF parser and generalize from that. +- the resulting grammar is tested with the aid of a script that generates + constructions covering all the functions and categories from the grammar, along + with some other constructions that proved to be problematic in some language. + The result of the script contains for each construction in the target language + its English correspondent and the abstract syntax tree. A native speaker + evaluates the results and if corrections are needed, the algorithm runs again + with the new examples. Depending on the language skills of the grammar writer, + the changes can be made directly into the GF files, and the correct examples + given by the native informant are just kept for validating the results. + The algorithm is repeated as long as corrections are needed. + + +The time needed for preparing the configuration files for a grammar will not be needed +in the future, since the files are reusable for other applications. +The time for the second step can be saved if automatic tools, like Google translate +are used. This is only possible in languages with a simpler morphology and syntax +and large corpora available. +Good results were obtained for German and Dutch with Google translate, but for +languages like Romanian or Polish, which are both complex and lack enough resources, +the results are discouraging. + +If the statistical oracle works well, the only step where the presence of a human +translator is needed is the evaluation and feedback step. 
An average of 4 hours per +round and 2 rounds were needed on average for the languages for which we performed +the experiment. It is possible that more effort is needed for more complex languages. + + +=Conclusions (tentative)= + +The grammarian need not be a native speaker of the language. + +For many languages, the grammarian need not even know the language - native informants are +enough. + +However, evaluation by native speakers is necessary. + +Correct and idiomatic translations are possible. + +A typical development time was 2-3 person working days per language. + +Google translate helps in bootstrapping grammars, but must be checked. +- in particular, unreliable for morphologically rich languages + + +Resource grammars should give some more support +- higher-level access to constructions like negative expressions +- large-scale morphological lexica + + + + + + +=Acknowledgements= The Phrasebook has been built in the MOLTO project funded by the European Commission. The authors are grateful to their native speaker informants helping to bootstrap and evaluate -the grammars: Richard Bubel, Grégoire Détrez, Michal Palka, Willard Rafnsson,... +the grammars: +Richard Bubel, +Grégoire Détrez, +Karin Keijzer, +Michał Pałka, +Willard Rafnsson, +Nick Smallbone. diff --git a/examples/phrasebook/picpic.jpg b/examples/phrasebook/picpic.jpg Binary files differ new file mode 100644 index 000000000..aac20b611 --- /dev/null +++ b/examples/phrasebook/picpic.jpg |
