diff options
Diffstat (limited to 'next-lib/src/parse/oald/asc2gf')
| -rw-r--r-- | next-lib/src/parse/oald/asc2gf | 453 |
1 files changed, 0 insertions, 453 deletions
diff --git a/next-lib/src/parse/oald/asc2gf b/next-lib/src/parse/oald/asc2gf deleted file mode 100644 index d3db827f8..000000000 --- a/next-lib/src/parse/oald/asc2gf +++ /dev/null @@ -1,453 +0,0 @@ -#! /usr/bin/perl -w -# -# Perl script to process OALD machine-readable ASCII file -# into a GF lexicon -# -# Usage: ./asc2gf < ascii_0710-1.txt -# -# Bjorn Bringert 2008, -# based on asc2lex by -# Matthew Purver, 11/2001 - -use strict; - -my %irregular_verbs = (); -my %words = (); - -my $irreg_eng = "../../english/IrregEng.gf"; - -open(IRREG_ENG,"$irreg_eng") or die "Could not open $irreg_eng\n"; -while (<IRREG_ENG>) { - if (s/\s*([a-z\d]+)_V\s*=.*/$1/) { - chomp; - $irregular_verbs{$_} = 1; - } -} -close IRREG_ENG; - -print "Known irregular verbs from $irreg_eng:\n"; -print join(",", keys %irregular_verbs) . "\n"; - - -# skip header section -while ( <STDIN> ) { - last if /<\/TEIHEADER>/; -} - -# read a line from stdin -while ( my $line = <STDIN> ) { - - # remove SGML tags - $line =~ s/<[^<>]+>//g; - - # split line into fields according to spec (line may be empty now) - if ( $line =~ /^(.{23}).{23}(.{23}).{1}(.{58})$/ ) { - - my ( $word, $pos, $cat ) = ( $1, $2, $3 ); - - # trim white space - for ( ( $word, $pos, $cat ) ) { - s/\s*$//; - } - - # make word lower-case - $word =~ tr/A-Z/a-z/; # lower case - - # translate OALD diacritics - $word =~ s/~n/ñ/g; - $word =~ s/<c/ç/g; - $word =~ s/"a/ä/g; - $word =~ s/"o/ö/g; - $word =~ s/"u/ü/g; - $word =~ s/"i/ï/g; - $word =~ s/\^a/â/g; - $word =~ s/\^e/ê/g; - $word =~ s/\^o/ô/g; - $word =~ s/`a/à/g; - $word =~ s/`e/è/g; - $word =~ s/_e/é/g; - - # make legal identifier - # Note: in theory this could cause clashes, but I don't think it does - # with the OALD. - my $name = $word; - $name =~ s/ /_/g; # space -> _ - $name =~ s/-/_/g; # - -> _ - $name =~ s/\./_/g; # . -> _ - $name =~ s/^'//; # drop initial ' - - - # get PoS & subcat info - my @pos = split( /,/, $pos ); - $cat =~ s/,/\',\'/g; - ( $cat = "\'$cat\'" ) unless ( $cat eq '' ); - - foreach ( @pos ) { - my ( $pcode, $infl, $freq )=split(//); - - # for verbs, get inflected forms - if ( $pcode =~ /^[GHIJ]/ ) { - $pos = 'verb'; - my ($vbz, $vbg, $vbd); - - # if this is a root form, work out the inflected forms - if ( $infl =~ /^\d/ ) { - if ( $infl == 0 ) { - ( $vbz = $word ) =~ s/$/s/; - ( $vbg = $word ) =~ s/$/ing/; - ( $vbd = $word ) =~ s/$/ed/; - } - elsif ( $infl == 1 ) { - ( $vbz = $word ) =~ s/$/es/; - ( $vbg = $word ) =~ s/$/ing/; - ( $vbd = $word ) =~ s/$/ed/; - } - elsif ( $infl == 2 ) { - ( $vbz = $word ) =~ s/e$/es/; - ( $vbg = $word ) =~ s/e$/ing/; - ( $vbd = $word ) =~ s/e$/ed/; - } - elsif ( $infl == 3 ) { - ( $vbz = $word ) =~ s/y$/ies/; - ( $vbg = $word ) =~ s/y$/ying/; - ( $vbd = $word ) =~ s/y$/ied/; - } - elsif ( $infl == 4 ) { - ( $vbz = $word ) =~ s/$/s/; - ( $vbg = $word ) =~ s/(\w)$/$1$1ing/; - ( $vbd = $word ) =~ s/(\w)$/$1$1ed/; - } - elsif ( $infl == 5 ) { - # for irregulars, just mark as such for now, we'll guess later - $vbz = 'IRREG'; - $vbg = 'IRREG'; - $vbd = 'IRREG'; - } - - my $lin = "mkV \"$word\" \"$vbz\" \"$vbd\" \"$vbd\" \"$vbg\""; - - # try to use a verb from IrregEng - if ( $infl == 5 ) { - for (my $i = 0; $i < length($word) - 1; $i++) { - my $suffix = substr($word, $i); - if ($irregular_verbs{$suffix}) { - if ($i == 0) { - $lin = "IrregEng.${name}_V"; - } else { - my $prefix = substr($word, 0, $i); - $lin = "mkV \"$prefix\" IrregEng.${suffix}_V"; - } - last; - } - } - } - - if ($pcode eq 'G') { - #add_word("${name}_VX", "mkVX ($lin)"); - print STDERR "Ignoring anomalous verb: $name\n"; - } - if ($pcode eq 'I' || $pcode eq 'J') { - add_word("${name}_V", "$lin"); - } - if ($pcode eq 'H' || $pcode eq 'J') { - add_word("${name}_V2", "mkV2 ($lin)"); - } - } - # if this is an inflected form, save for guessing irregulars later - elsif ( $infl =~ /^a/ ) { - #push( @vbz, $word ); - } - elsif ( $infl =~ /^b/ ) { - #push( @vbg, $word ); - } - elsif ( $infl =~ /^c/ ) { - #push( @vbd, $word ); - } - elsif ( $infl =~ /^d/ ) { - #push( @vbn, $word ); - } - } - # for nouns, get plural form - elsif( $pcode =~ /^[KLMNY]/ ) { - $pos = 'noun'; - $pcode =~ s/^K/count/; - $pcode =~ s/^L/mass/; - $pcode =~ s/^M/both/; - $pcode =~ s/^N/proper/; - if ( $pcode =~ /^Y/ ) { - $pcode = 'count' if $infl =~ /^[>\)\]]/; - $pcode = 'mass' if $infl =~ /^\}/; - $pcode = 'proper' if $infl =~ /^[:=~]/; - } - # if this is a singular form, work out plural form - unless ( $infl =~ /^j/ ) { - my $pl = '-'; - if ( $infl eq '6' ) { - ( $pl = $word ) =~ s/$/s/; - } - elsif ( $infl eq '7' ) { - ( $pl = $word ) =~ s/$/es/; - } - elsif ( $infl eq '8' ) { - ( $pl = $word ) =~ s/y$/ies/; - } - elsif ( $infl =~ /^[9k\]]/ ) { - $pl = $word; - } - elsif ( $infl =~ /^i/ ) { - # for irregulars, let's just make a guess and mark with '*' - # this could be done better, as for verbs, but I can't be bothered now - $pl = $word; - ( $pl =~ s/^((wo)?m)an/$1en\*/ ) or - ( $pl =~ s/man(-|$)/men$1\*/ ) or - ( $pl =~ s/-in-law/s-in-law\*/ ) or - ( $pl =~ s/um$/a\*/ ) or - ( $pl =~ s/us$/i\*/ ) or - ( $pl =~ s/a$/ae\*/ ) or - ( $pl =~ s/on$/a\*/ ) or - ( $pl =~ s/is$/es\*/ ) or - ( $pl =~ s/o$/i\*/ ) or - ( $pl =~ s/child$/children\*/ ) or - ( $pl =~ s/oot$/eet\*/ ) or - ( $pl =~ s/ooth$/eeth\*/ ) or - ( $pl =~ s/([lm])ouse$/$1ice\*/ ) or - ( $pl =~ s/f(e)?$/ves\*/ ) or - ( $pl =~ s/[ei]x$/ices\*/ ) or - ( $pl =~ s/eau$/eaux\*/ ) or - ( $pl = 'IRREG' ); - } - # if plural-only, swap root form & plural - elsif ( $infl =~ /^\)/ ) { - $pl = $word; - $word = '-'; - } - ( $infl =~ s/^[:l]/per/ ) or ( $infl =~ s/^[mn]/loc/ ) or ( $infl = '_' ); - - my $comment = ""; - if ( $word eq '-' ) { - $comment .= " {- FIXME: no singular form -}"; - } - if ( $pl eq '-' ) { - $comment .= " {- FIXME: no plural form -}"; - } - if ( $pl =~ s/\*$// ) { - $comment .= " {- FIXME: guessed plural form -}"; - } - - if ( $pcode eq 'proper' ) { - add_word("${name}_PN", "mkPN \"$word\""); - } else { - add_word("${name}_N", "mkN \"$word\" \"$pl\"$comment"); - } - } - } - # for adjectives, get comparative & superlative forms - elsif( $pcode =~ /^O/ ) { - $pos = 'adj'; - # if this is root form, work out inflected forms - unless ( $infl =~ /^[rs]/ ) { - my ($comp, $sup); - if ( $infl =~ /^[Apqt]/ ) { - $comp = $sup = '-'; - } - elsif ( $infl =~ /^B/ ) { - ( $comp = $word ) =~ s/$/r/; - ( $sup = $word ) =~ s/$/st/; - } - elsif ( $infl =~ /^C/ ) { - ( $comp = $word ) =~ s/$/er/; - ( $sup = $word ) =~ s/$/est/; - } - elsif ( $infl =~ /^D/ ) { - ( $comp = $word ) =~ s/y$/ier/; - ( $sup = $word ) =~ s/y$/iest/; - } - elsif ( $infl =~ /^E/ ) { - # for irregulars, let's just have a guess and mark with '*' - # (there aren't very many of these) - ( $comp = $word ) =~ s/(\w)$/$1$1er\*/; - ( $sup = $word ) =~ s/(\w)$/$1$1est\*/; - } - $infl =~ s/^[ABCDE]/normal/; - $infl =~ s/^p/pred/; - $infl =~ s/^q/attr/; - $infl =~ s/^t/affix/; - - if ( $comp eq '-' ) { - add_word("${name}_A", "compoundA (mkA \"$word\")"); - } else { - add_word("${name}_A", "mkA \"$word\" \"$comp\""); - } - } - } - # adverb - elsif( $pcode =~ /^P/ ) { - $pos = 'adv'; - $infl =~ s/^[u\+]/normal/; - $infl =~ s/^w/whrel/; - $infl =~ s/^v/whq/; - add_word("${name}_Adv", "mkAdv \"$word\""); - } - # pronoun - elsif( $pcode =~ s/^Q/_/ ) { - $pos = 'pron'; - $infl =~ s/^x/normal/; - $infl =~ s/^y/whq/; - $infl =~ s/^z/whrel/; - my $class = '_'; - # reflexive pronouns - if ( ( $word =~ /self$/ ) or - ( $word =~ /selves$/ ) ) { - $pcode = 'acc'; - } - # accusative personal pronouns - if ( ( $word =~ /^him/ ) or - ( $word =~ /^her/ ) or - ( $word =~ /^them/ ) or - ( $word eq 'us' ) or - ( $word eq 'thee' ) or - ( $word eq 'me' ) ) { - $pcode = 'acc'; - $class = 'per'; - } - # nominative personal pronouns - if ( ( $word eq 'he' ) or - ( $word eq 'she' ) or - ( $word eq 'they' ) or - ( $word eq 'we' ) or - ( $word eq 'thou' ) or - ( $word eq 'i' ) ) { - $pcode = 'nom'; - $class = 'per'; - } - # other personal pronouns - if ( ( $word =~ /.+one/ ) or - ( $word =~ /one.+/ ) or - ( $word =~ /body/ ) or - ( $word =~ /^you/ ) or - ( $word =~ /^who/ ) ) { - $class = 'per'; - } - # non-personal pronouns - if ( $word =~ /thing/ ) { - $class = 'nper'; - } - # otherwise case/person info will be '_' (anon variable) - # add full spec to @pron array - #push( @pron, "$pos( \'$word\', $pcode, $infl, $class ).\n" ); - } - # for determiners, leave anon variable as placeholder for semantics - elsif( $pcode =~ /^[RS]/ ) { - $pos = 'det'; - $pcode =~ s/^R/def/; - $pcode =~ s/^S/indef/; - #add_word("${name}_Det","mkDeterminer \"$word\""); - } - # for prepositions - nothing to say - elsif( $pcode =~ s/^T/prep/ ) { - $pos = 'prep'; - add_word("${name}_Prep","mkPrep \"$word\""); - } - # for conjunctions - nothing to say - elsif( $pcode =~ s/^V/conj/ ) { - $pos = 'conj'; - add_word("${name}_Conj","mkConj \"$word\""); - } - # for miscellaneous, leave '-' as placeholder for illocutionary info - elsif( $pcode =~ /^[UWXZ]/ ) { - $pos = 'misc'; - #push( @prefix, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^U/prefix/ ); - #push( @interj, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^W/interj/ ); - #push( @partcl, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^X/partcl/ ); - #push( @unknown, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^Z/unknown/ ); - } - } - } -} - -my $absfile = "Oald.gf"; -my $cncfile = "OaldEng.gf"; -my $abs_structfile = "OaldStructural.gf"; -my $cnc_structfile = "OaldStructuralEng.gf"; - -open (ABS, '>', $absfile); -open (CNC, '>', $cncfile); - -open (ABS_STRUCTURAL, '>', $abs_structfile); -open (CNC_STRUCTURAL, '>', $cnc_structfile); - - - -# print a nice comment at the top -my $header = "-- English lexicon for GF, produced from:\n" - . "-- Oxford advanced learner's dictionary of current English:\n" - . "-- expanded 'computer usable' version compiled by Roger Mitton\n" - . "-- The computer usable version is transcribed from:\n" - . "-- Oxford advanced learner's dictionary of current English\n" - . "-- A.S. Hornby ; with the assistance of A.P. Cowie [and] J. Windsor Lewis.\n" - . "-- 3rd. ed., London : Oxford University Press, 1974.\n" - . "-- Distributed as 'dict0710' by:\n" - . "-- Oxford Text Archive\n" - . "-- Oxford University Computing Services\n" - . "-- 13 Banbury Road\n" - . "-- Oxford\n" - . "-- OX2 6NN\n" - . "-- Under these conditions:\n" - . "-- Freely available for non-commercial use provided that this header is\n" - . "-- included in its entirety with any copy distributed.\n" - . "--\n" - . "-- GF version generated by asc2gf, Bjorn Bringert Nov 2008\n" - . "-- based on asc2lex, Matthew Purver Nov 2001\n" - . "-- http://www.stanford.edu/~mpurver/software.html\n" - . "\n"; -print ABS $header; -print ABS "abstract Oald = Cat ** {\n"; - -print CNC $header; -print CNC "--# -path=.:alltenses\n"; -print CNC "concrete OaldEng of Oald = CatEng ** open ParadigmsEng, IrregEng in {\n"; - -print ABS_STRUCTURAL $header; -print ABS_STRUCTURAL "abstract OaldStructural = Cat ** {\n"; - -print CNC_STRUCTURAL $header; -print CNC_STRUCTURAL "--# -path=.:alltenses\n"; -print CNC_STRUCTURAL "concrete OaldStructuralEng of OaldStructural = CatEng ** open ParadigmsEng in {\n"; - -foreach my $name (sort (keys %words)) { - (my $cat = $name) =~ s/.*_([A-Z][A-Za-z\d]*)$/$1/; - my $lin = $words{$name}; - if ( $cat =~ /^(A)|(N)|(V)|(V2)$/ ) { - print ABS "fun $name : $cat;\n"; - print CNC "lin $name = $lin;\n"; - } else { - print ABS_STRUCTURAL "fun $name : $cat;\n"; - print CNC_STRUCTURAL "lin $name = $lin;\n"; - } -} - -print ABS "}"; -print CNC "}"; - -print ABS_STRUCTURAL "}"; -print CNC_STRUCTURAL "}"; - -close(ABS_STRUCTURAL); -close(CNC_STRUCTURAL); - -close(ABS); -close(CNC); - -print "\nWrote open lexicon to $absfile and $cncfile\n"; -print "Wrote closed lexicon to $abs_structfile and $cnc_structfile\n"; - - - -sub add_word { - my ($name,$lin) = @_; - if (exists $words{$name}) { - print STDERR "Duplicate word: $name\n"; - } else { - $words{$name} = $lin; - } -} |
