summaryrefslogtreecommitdiff
path: root/next-lib/src/parse/oald
diff options
context:
space:
mode:
authorbjorn <bjorn@bringert.net>2008-11-21 16:50:07 +0000
committerbjorn <bjorn@bringert.net>2008-11-21 16:50:07 +0000
commit59ff172b7d3c91278f7a878cd7eec64286e22c8c (patch)
tree29fa88c69e45ee9af335e8d7780f4ff2fae61be6 /next-lib/src/parse/oald
parentb047a99a84c1b21f60b44eb12d7a11e9f3d056ea (diff)
Started working on oald2gf script.
Diffstat (limited to 'next-lib/src/parse/oald')
-rw-r--r--next-lib/src/parse/oald/asc2gf386
1 files changed, 386 insertions, 0 deletions
diff --git a/next-lib/src/parse/oald/asc2gf b/next-lib/src/parse/oald/asc2gf
new file mode 100644
index 000000000..046d4f5c0
--- /dev/null
+++ b/next-lib/src/parse/oald/asc2gf
@@ -0,0 +1,386 @@
+#! /usr/bin/perl
+#
+# Perl script to process OALD machine-readable ASCII file
+# into a GF lexicon
+#
+# Usage: ./asc2gf < ascii_0710-1.txt
+#
+# Bjorn Bringert 2008,
+# based on asc2lex by
+# Matthew Purver, 11/2001
+
+# skip header section
+while ( <STDIN> ) {
+ last if /<\/TEIHEADER>/;
+}
+
+# read a line from stdin
+while ( $line = <STDIN> ) {
+
+ # remove SGML tags
+ $line =~ s/<[^<>]+>//g;
+
+ # split line into fields according to spec (line may be empty now)
+ if ( $line =~ /^(.{23}).{23}(.{23}).{1}(.{58})$/ ) {
+
+ # trim white space
+ for ( ( $word, $pos, $cat ) = ( $1, $2, $3 ) ) {
+ s/\s*$//;
+ }
+
+ # make word lower-case atomic string
+ $word =~ s/\"/\\\"/g; # " -> \"
+ $word =~ tr/A-Z/a-z/; # lower case
+
+ # move diacritics to the following letter
+ $word =~ s/~n/ñ/g;
+ $word =~ s/<c/ç/g;
+ $word =~ s/"a/ä/g;
+ $word =~ s/"o/ö/g;
+ $word =~ s/"u/ü/g;
+ $word =~ s/\^a/â/g;
+ $word =~ s/\^e/ê/g;
+ $word =~ s/\^o/ô/g;
+ $word =~ s/`a/à/g;
+ $word =~ s/`e/è/g;
+ $word =~ s/_e/é/g;
+
+ $name = $word;
+ $name =~ s/ /_/g; # space -> _
+ $name =~ s/-/_/g; # - -> _
+
+
+ # get PoS & subcat info
+ @pos = split( /,/, $pos );
+ $cat =~ s/,/\',\'/g;
+ ( $cat = "\'$cat\'" ) unless ( $cat eq '' );
+
+ # set up Prolog-style string & put into array
+ foreach ( @pos ) {
+ ( $pcode, $infl, $freq )=split(//);
+
+ # for verbs, get inflected forms
+ if ( $pcode =~ /^[GHIJ]/ ) {
+ $pos = 'verb';
+ # if this is a root form, work out the inflected forms
+ if ( $infl =~ /^\d/ ) {
+ if ( $infl == 0 ) {
+ ( $vbz = $word ) =~ s/$/s/;
+ ( $vbg = $word ) =~ s/$/ing/;
+ ( $vbd = $word ) =~ s/$/ed/;
+ }
+ elsif ( $infl == 1 ) {
+ ( $vbz = $word ) =~ s/$/es/;
+ ( $vbg = $word ) =~ s/$/ing/;
+ ( $vbd = $word ) =~ s/$/ed/;
+ }
+ elsif ( $infl == 2 ) {
+ ( $vbz = $word ) =~ s/e$/es/;
+ ( $vbg = $word ) =~ s/e$/ing/;
+ ( $vbd = $word ) =~ s/e$/ed/;
+ }
+ elsif ( $infl == 3 ) {
+ ( $vbz = $word ) =~ s/y$/ies/;
+ ( $vbg = $word ) =~ s/y$/ying/;
+ ( $vbd = $word ) =~ s/y$/ied/;
+ }
+ elsif ( $infl == 4 ) {
+ ( $vbz = $word ) =~ s/$/s/;
+ ( $vbg = $word ) =~ s/(\w)$/$1$1ing/;
+ ( $vbd = $word ) =~ s/(\w)$/$1$1ed/;
+ }
+ elsif ( $infl == 5 ) {
+ # for irregulars, just mark as such for now, we'll guess later
+ $vbz = 'IRREG';
+ $vbg = 'IRREG';
+ $vbd = 'IRREG';
+ }
+
+ $lin = "mkV \"$word\" \"$vbz\" \"$vbd\" \"$vbd\" \"$vbg\"";
+
+ if ($pcode eq 'G') {
+ $words{"${name}_VX"} = "mkVX ($lin)";
+ }
+ if ($pcode eq 'I' || $pcode eq 'J') {
+ $words{"${name}_V"} = "$lin";
+ }
+ if ($pcode eq 'H' || $pcode eq 'J') {
+ $words{"${name}_V2"} = "mkV2 ($lin)";
+ }
+ }
+ # if this is an inflected form, save for guessing irregulars later
+ elsif ( $infl =~ /^a/ ) {
+ push( @vbz, $word );
+ }
+ elsif ( $infl =~ /^b/ ) {
+ push( @vbg, $word );
+ }
+ elsif ( $infl =~ /^c/ ) {
+ push( @vbd, $word );
+ }
+ elsif ( $infl =~ /^d/ ) {
+ push( @vbn, $word );
+ }
+ }
+ # for nouns, get plural form
+ elsif( $pcode =~ /^[KLMNY]/ ) {
+ $pos = 'noun';
+ $pcode =~ s/^K/count/;
+ $pcode =~ s/^L/mass/;
+ $pcode =~ s/^M/both/;
+ $pcode =~ s/^N/proper/;
+ if ( $pcode =~ /^Y/ ) {
+ $pcode = 'count' if $infl =~ /^[>\)\]]/;
+ $pcode = 'mass' if $infl =~ /^\}/;
+ $pcode = 'proper' if $infl =~ /^[:=~]/;
+ }
+ # if this is a singular form, work out plural form
+ unless ( $infl =~ /^j/ ) {
+ $pl = '-';
+ if ( $infl == 6 ) {
+ ( $pl = $word ) =~ s/$/s/;
+ }
+ elsif ( $infl == 7 ) {
+ ( $pl = $word ) =~ s/$/es/;
+ }
+ elsif ( $infl == 8 ) {
+ ( $pl = $word ) =~ s/y$/ies/;
+ }
+ elsif ( $infl =~ /^[9k\]]/ ) {
+ $pl = $word;
+ }
+ elsif ( $infl =~ /^i/ ) {
+ # for irregulars, let's just make a guess and mark with '*'
+ # this could be done better, as for verbs, but I can't be bothered now
+ $pl = $word;
+ ( $pl =~ s/^((wo)?m)an/$1en\*/ ) or
+ ( $pl =~ s/man(-|$)/men$1\*/ ) or
+ ( $pl =~ s/-in-law/s-in-law\*/ ) or
+ ( $pl =~ s/um$/a\*/ ) or
+ ( $pl =~ s/us$/i\*/ ) or
+ ( $pl =~ s/a$/ae\*/ ) or
+ ( $pl =~ s/on$/a\*/ ) or
+ ( $pl =~ s/is$/es\*/ ) or
+ ( $pl =~ s/o$/i\*/ ) or
+ ( $pl =~ s/child$/children\*/ ) or
+ ( $pl =~ s/oot$/eet\*/ ) or
+ ( $pl =~ s/ooth$/eeth\*/ ) or
+ ( $pl =~ s/([lm])ouse$/$1ice\*/ ) or
+ ( $pl =~ s/f(e)?$/ves\*/ ) or
+ ( $pl =~ s/[ei]x$/ices\*/ ) or
+ ( $pl =~ s/eau$/eaux\*/ ) or
+ ( $pl = 'IRREG' );
+ }
+ # if plural-only, swap root form & plural
+ elsif ( $infl =~ /^\)/ ) {
+ $pl = $word;
+ $word = '-';
+ }
+ ( $infl =~ s/^[:l]/per/ ) or ( $infl =~ s/^[mn]/loc/ ) or ( $infl = '_' );
+ $words{"${name}_N"} = "mkN \"$word\" \"$pl\"";
+ }
+ }
+ # for adjectives, get comparative & superlative forms
+ elsif( $pcode =~ /^O/ ) {
+ $pos = 'adj';
+ # if this is root form, work out inflected forms
+ unless ( $infl =~ /^[rs]/ ) {
+ if ( $infl =~ /^[Apqt]/ ) {
+ $comp = $sup = '-';
+ }
+ elsif ( $infl =~ /^B/ ) {
+ ( $comp = $word ) =~ s/$/r/;
+ ( $sup = $word ) =~ s/$/st/;
+ }
+ elsif ( $infl =~ /^C/ ) {
+ ( $comp = $word ) =~ s/$/er/;
+ ( $sup = $word ) =~ s/$/est/;
+ }
+ elsif ( $infl =~ /^D/ ) {
+ ( $comp = $word ) =~ s/y$/ier/;
+ ( $sup = $word ) =~ s/y$/iest/;
+ }
+ elsif ( $infl =~ /^E/ ) {
+ # for irregulars, let's just have a guess and mark with '*'
+ # (there aren't very many of these)
+ ( $comp = $word ) =~ s/(\w)$/$1$1er\*/;
+ ( $sup = $word ) =~ s/(\w)$/$1$1est\*/;
+ }
+ $infl =~ s/^[ABCDE]/normal/;
+ $infl =~ s/^p/pred/;
+ $infl =~ s/^q/attr/;
+ $infl =~ s/^t/affix/;
+
+ $words{"${name}_A"} = "mkA \"$word\" \"$comp\"";
+ }
+ }
+ # for adverbs, just add all info to @adv array
+ elsif( $pcode =~ /^P/ ) {
+ $pos = 'adv';
+ $infl =~ s/^[u\+]/normal/;
+ $infl =~ s/^w/whrel/;
+ $infl =~ s/^v/whq/;
+ $words{"${name}_Adv"} = "mkAdv \"$word\"";
+ }
+ # for pronouns, work out some case/person info
+ elsif( $pcode =~ s/^Q/_/ ) {
+ $pos = 'pron';
+ $infl =~ s/^x/normal/;
+ $infl =~ s/^y/whq/;
+ $infl =~ s/^z/whrel/;
+ $class = '_';
+ # reflexive pronouns
+ if ( ( $word =~ /self$/ ) or
+ ( $word =~ /selves$/ ) ) {
+ $pcode = 'acc';
+ }
+ # accusative personal pronouns
+ if ( ( $word =~ /^him/ ) or
+ ( $word =~ /^her/ ) or
+ ( $word =~ /^them/ ) or
+ ( $word eq 'us' ) or
+ ( $word eq 'thee' ) or
+ ( $word eq 'me' ) ) {
+ $pcode = 'acc';
+ $class = 'per';
+ }
+ # nominative personal pronouns
+ if ( ( $word eq 'he' ) or
+ ( $word eq 'she' ) or
+ ( $word eq 'they' ) or
+ ( $word eq 'we' ) or
+ ( $word eq 'thou' ) or
+ ( $word eq 'i' ) ) {
+ $pcode = 'nom';
+ $class = 'per';
+ }
+ # other personal pronouns
+ if ( ( $word =~ /.+one/ ) or
+ ( $word =~ /one.+/ ) or
+ ( $word =~ /body/ ) or
+ ( $word =~ /^you/ ) or
+ ( $word =~ /^who/ ) ) {
+ $class = 'per';
+ }
+ # non-personal pronouns
+ if ( $word =~ /thing/ ) {
+ $class = 'nper';
+ }
+ # otherwise case/person info will be '_' (anon variable)
+ # add full spec to @pron array
+ #push( @pron, "$pos( \'$word\', $pcode, $infl, $class ).\n" );
+ }
+ # for determiners, leave anon variable as placeholder for semantics
+ elsif( $pcode =~ /^[RS]/ ) {
+ $pos = 'det';
+ $pcode =~ s/^R/def/;
+ $pcode =~ s/^S/indef/;
+ #push( @det, "$pos( \'$word\', $pcode, _ ).\n" );
+ }
+ # for prepositions - nothing to say
+ elsif( $pcode =~ s/^T/prep/ ) {
+ $pos = 'prep';
+ #push( @prep, "$pos( \'$word\', $pcode ).\n" );
+ }
+ # for conjunctions - nothing to say
+ elsif( $pcode =~ s/^V/conj/ ) {
+ $pos = 'conj';
+ #push( @conj, "$pos( \'$word\', $pcode ).\n" );
+ }
+ # for miscellaneous, leave '-' as placeholder for illocutionary info
+ elsif( $pcode =~ /^[UWXZ]/ ) {
+ $pos = 'misc';
+ #push( @prefix, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^U/prefix/ );
+ #push( @interj, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^W/interj/ );
+ #push( @partcl, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^X/partcl/ );
+ #push( @unknown, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^Z/unknown/ );
+ }
+ }
+ }
+}
+
+$absfile = "Oald.gf";
+$cncfile = "OaldEng.gf";
+
+open (ABS, '>', $absfile);
+open (CNC, '>', $cncfile);
+
+
+
+# print a nice comment at the top
+$header = "-- GF lexicon, from OALD machine-readable dictionary\n"
+ . "-- Produced by asc2gf, based on asc2lex, Matthew Purver 11/2001\n\n";
+print ABS $header;
+print CNC $header;
+
+print ABS "abstract Oald = {\n";
+print CNC "concrete OaldEng of Oald = {\n";
+
+foreach $name (sort (keys %words)) {
+ ($cat = $name) =~ s/.*_([A-Z\d])$/$1/;
+ $lin = $words{$name};
+ print ABS "fun $name : $cat;\n";
+ print CNC "lin $name = $lin;\n";
+ print "$name\n";
+}
+
+print ABS "}";
+print CNC "}";
+
+close(ABS);
+close(CNC);
+
+print "\nWrote lexicon to $absfile and $cncfile\n";
+
+exit 0;
+
+
+
+
+
+
+
+
+
+
+
+
+# now have a guess at irregular verb forms (marking the best guess with '*')
+foreach $verb ( @verb ) {
+ if ( $verb =~ /verb\( \'([^\']+)\', \'IRREG/ ) {
+ $word = $1;
+ $vbz = findbest( $word, @vbz );
+ $vbg = findbest( $word, @vbg );
+ $vbd = findbest( $word, @vbd );
+ $vbn = findbest( $word, @vbn );
+ $verb =~ s/($word\', \')IRREG(\', \')IRREG(\', \')IRREG(\', \')IRREG/\*$1$vbz$2$vbg$3$vbd$4$vbn/;
+ }
+}
+
+# now print everything out (so we can group PoSs together)
+print @verb, "\n", @noun, "\n", @adj, "\n", @adv;
+print "\n", @pron, "\n", @det, "\n", @prep, "\n", @conj;
+print "\n", @prefix, "\n", @interj, "\n", @partcl, "\n", @unknown;
+
+
+# find closest string match
+# similarity measure is just the length of identical prefix
+# prefer shorter strings in the case of equal similarity
+sub findbest
+{
+ my ( $word, @array ) = @_;
+
+ $bestlen = 0;
+ foreach $test ( @array ) {
+ if ( ( substr( $word, 0, $bestlen-1 ) eq substr( $test, 0, $bestlen-1 ) ) &&
+ ( length( $test ) < length( $best ) ) ) {
+ $best = $test;
+ }
+ while ( ( substr( $word, 0, $bestlen ) eq substr( $test, 0, $bestlen ) ) &&
+ ( $bestlen <= length( $test ) ) ) {
+ $bestlen++;
+ $best = $test;
+ }
+ }
+ return $best;
+}