summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Frolov <nf@mkmks.org>2011-12-31 02:36:24 +0000
committerNick Frolov <nf@mkmks.org>2011-12-31 02:36:24 +0000
commit8e2c1823ed74b32d980b27ec2bd8972c80488309 (patch)
treed653937afe22f139612fa4efa079c94548bea80d
parentba451d203aef642e98790e054f31c5aae6d8d861 (diff)
A Russian dictionary
A Russian dictionary generated from a wordlist created by the FreeLing project. The accompanying converter can be used to convert other wordlists in EAGLES format to GF grammars.
-rw-r--r--contrib/eaglesconv/CollectLemmas.hs28
-rw-r--r--contrib/eaglesconv/EaglesConv.hs135
-rw-r--r--contrib/eaglesconv/EaglesMatcher.hs63
-rw-r--r--contrib/eaglesconv/EaglesParser.hs239
-rw-r--r--contrib/eaglesconv/README24
-rw-r--r--contrib/eaglesconv/mkAbstract.sh8
-rw-r--r--contrib/eaglesconv/mkConcrete.sh12
-rw-r--r--contrib/eaglesconv/run_conv.sh4
8 files changed, 513 insertions, 0 deletions
diff --git a/contrib/eaglesconv/CollectLemmas.hs b/contrib/eaglesconv/CollectLemmas.hs
new file mode 100644
index 000000000..a63e7e1a8
--- /dev/null
+++ b/contrib/eaglesconv/CollectLemmas.hs
@@ -0,0 +1,28 @@
+-- Copyright (C) 2011 Nikita Frolov
+
+import qualified Data.Text as T
+import qualified Data.Text.IO as UTF8
+import System.IO
+import System.Environment
+import Control.Monad
+import Control.Monad.State
+
+main :: IO ()
+main = do
+ args <- getArgs
+ forM_ args $ \ f -> do
+ entries <- UTF8.readFile f >>= (return . T.lines)
+ forM_ entries $ \ entry ->
+ do
+ let ws = T.words entry
+ form = head ws
+ tags = toPairs $ tail ws
+ forM_ tags $ \ (lemma, tag) ->
+ do
+ UTF8.putStrLn $ T.concat [lemma, sp, form, sp, tag]
+ where sp = T.singleton ' '
+
+
+toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs))
+ where stride _ [] = []
+ stride n (x:xs) = x : stride n (drop (n-1) xs)
diff --git a/contrib/eaglesconv/EaglesConv.hs b/contrib/eaglesconv/EaglesConv.hs
new file mode 100644
index 000000000..aa8929496
--- /dev/null
+++ b/contrib/eaglesconv/EaglesConv.hs
@@ -0,0 +1,135 @@
+-- Copyright (C) 2011 Nikita Frolov
+
+-- No, we can't pipeline parsing and generation, because there is no guarantee
+-- that we have collected all forms for a lemma before we've scanned the
+-- complete file.
+
+import qualified Data.Text as T
+import qualified Data.Text.IO as UTF8
+import System.IO
+import System.Environment
+import Control.Monad
+import Control.Monad.State
+import qualified Data.Map as M
+import Codec.Text.IConv
+import qualified Data.ByteString.Lazy as BS
+import qualified Data.ByteString.Internal as BSI
+
+import EaglesMatcher
+
+type Lemmas = M.Map T.Text Forms
+
+main :: IO ()
+main = do
+ args <- getArgs
+ forM_ args $ \ f -> do
+ entries <- UTF8.readFile f >>= (return . T.lines)
+ lemmas <- return $ execState (collectLemmas entries) (M.empty :: Lemmas)
+ mapM_ generateLin (M.assocs lemmas)
+
+collectLemmas entries = do
+ forM_ entries $ \ entry -> do
+ let ws = T.words entry
+ lemma = head ws
+ tags = toPairs $ tail ws
+ lemmas <- get
+ forM_ tags $ \ (form, tag) -> do
+ let forms = (case M.lookup lemma lemmas of
+ Just f -> f
+ Nothing -> M.empty) :: Forms
+ if isOpenCat . T.unpack $ tag
+ then put $ M.insert lemma (M.insert tag form forms) lemmas
+ else return ()
+
+generateLin :: (T.Text, Forms) -> IO ()
+generateLin (lemma, forms) = do
+ let lemma' = myVeryOwnCyrillicRomanizationIConvSucks lemma
+ UTF8.putStr $ T.concat [T.pack "lin ", lemma']
+ UTF8.putStr $ case T.unpack . head . M.keys $ forms of
+ ('N':_:_:_:g:a:'0':_) ->
+ T.concat $ [T.pack "_N = mkN "]
+ ++ map (quote . noun forms) [ ('N','S'), ('G','S')
+ , ('D','S'), ('F','S'), ('C','S'), ('O','S')
+ , ('L','S'), ('N','P'), ('G','P'), ('D','P')
+ , ('F','P'), ('C','P'), ('O','P') ]
+ ++ [showG g, sp, showAni a, ln]
+ ('N':_:c:n:g:a:_) ->
+ T.concat $ [T.pack "_PN = mkPN "
+ , quote $ noun forms ('N', 'S')
+ , showG g, sp
+ , showN n, sp, showAni a, ln]
+ ('A':_) ->
+ T.concat $ [T.pack "_A = mkA ", quote $ adj forms 'P',
+ if adj forms 'P' /= adj forms 'C'
+ then quote $ adj forms 'C'
+ else T.pack ""
+ , ln]
+ ('V':t) ->
+ let a = case t of
+ (_:_:_:_:'P':_:a':_) -> a'
+ (_:_:_:_:_:a':_) -> a'
+ in
+ T.concat $ [T.pack "_V = mkV ", showAsp a, sp]
+ ++ map (quote . verbPres forms) [ ('S','1'), ('S','2')
+ , ('S','3'), ('P','1')
+ , ('P','2'), ('P','3')]
+ ++ [ quote $ verbPast forms ('S', 'M')
+ , quote $ verbImp forms, quote $ verbInf forms, ln]
+ ('D':_) ->
+ T.concat $ [T.pack "_Adv = mkAdv "
+ , quote . adv $ forms, ln]
+ putStrLn ""
+ hFlush stdout
+ where quote x = T.concat [T.pack "\"", x, T.pack "\" "]
+ showG 'F' = T.pack "Fem"
+ showG 'A' = T.pack "Neut"
+ showG _ = T.pack "Masc"
+ showAni 'I' = T.pack "Inanimate"
+ showAni _ = T.pack "Animate"
+ showN 'P' = T.pack "Pl"
+ showN _ = T.pack "Sg"
+ showAsp 'F' = T.pack "Perfective"
+ showAsp _ = T.pack "Imperfective"
+ sp = T.singleton ' '
+ ln = T.pack " ;"
+
+toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs))
+ where stride _ [] = []
+ stride n (x:xs) = x : stride n (drop (n-1) xs)
+
+myVeryOwnCyrillicRomanizationIConvSucks s = T.pack . concatMap r . T.unpack $ s
+ where r 'а' = "a"
+ r 'б' = "b"
+ r 'в' = "v"
+ r 'г' = "g"
+ r 'д' = "d"
+ r 'е' = "je"
+ r 'ё' = "jo"
+ r 'ж' = "zh"
+ r 'з' = "z"
+ r 'и' = "i"
+ r 'й' = "jj"
+ r 'к' = "k"
+ r 'л' = "l"
+ r 'м' = "m"
+ r 'н' = "n"
+ r 'о' = "o"
+ r 'п' = "p"
+ r 'р' = "r"
+ r 'с' = "s"
+ r 'т' = "t"
+ r 'у' = "u"
+ r 'ф' = "f"
+ r 'х' = "kh"
+ r 'ц' = "c"
+ r 'ч' = "ch"
+ r 'ш' = "sh"
+ r 'щ' = "shc"
+ r 'ъ' = "yy"
+ r 'ы' = "y"
+ r 'ь' = "q"
+ r 'э' = "e"
+ r 'ю' = "ju"
+ r 'я' = "ja"
+ r '-' = "_"
+ r o = [o]
diff --git a/contrib/eaglesconv/EaglesMatcher.hs b/contrib/eaglesconv/EaglesMatcher.hs
new file mode 100644
index 000000000..27e76706f
--- /dev/null
+++ b/contrib/eaglesconv/EaglesMatcher.hs
@@ -0,0 +1,63 @@
+-- Copyright (C) 2011 Nikita Frolov
+
+-- The format specification can be found at
+-- http://devel.cpl.upc.edu/freeling/svn/trunk/doc/tagsets/tagset-ru.html
+
+-- Bugs in the specification:
+-- Participle, 2nd field: case, not mood
+-- Participle, 6th field: field, not person
+-- Verb, persons can be denoted both with 'Pnumber' or just 'number'
+-- Noun, 10th field can be absent
+
+-- No, it wouldn't be simpler to implement this grammar with Parsec or another
+-- parser combinator library.
+
+
+module EaglesMatcher where
+
+import qualified Data.Text as T
+import Data.List
+import qualified Data.Map as M
+
+type Forms = M.Map T.Text T.Text
+
+isOpenCat ('A':_) = True
+isOpenCat ('N':_) = True
+isOpenCat ('V':_) = True
+isOpenCat ('D':_) = True
+isOpenCat _ = False
+
+noun forms (c, n) = findForm (matchNoun . T.unpack) forms
+ where matchNoun ('N':_:c':n':_) = c == c' && n == n'
+ matchNoun _ = False
+
+adj forms d = findForm (matchAdj . T.unpack) forms
+ where matchAdj ('A':'N':'S':'M':_:'F':d':_) = d == d
+ matchAdj _ = False
+
+verbPres forms (n, p) = findForm (matchPres . T.unpack) forms
+ where matchPres ('V':'D':n':_:'P':'P':p':_:'A':_) = n == n' && p == p'
+ matchPres ('V':'D':n':_:'F':'P':p':_:'A':_) = n == n' && p == p'
+ matchPres ('V':'D':n':_:'P':'P':p':_) = n == n' && p == p'
+ matchPres ('V':'D':n':_:'F':'P':p':_) = n == n' && p == p'
+ matchPres _ = False
+
+verbPast forms (n, g) = findForm (matchPast . T.unpack) forms
+ where matchPast ('V':'D':n':g':'S':_:_:'A':_) = n == n' && g == g'
+ matchPast _ = False
+
+verbImp forms = findForm (matchImp . T.unpack) forms
+ where matchImp ('V':'M':_) = True
+ matchImp _ = False
+
+verbInf forms = findForm (matchInf . T.unpack) forms
+ where matchInf ('V':'I':_) = True
+ matchInf _ = False
+
+adv forms = findForm (matchAdv . T.unpack) forms
+ where matchAdv ('D':d:_) = d == 'P'
+ matchAdv _ = False
+
+findForm match forms = case find match (M.keys forms) of
+ Just tag -> forms M.! tag
+ Nothing -> findForm (\ _ -> True) forms
diff --git a/contrib/eaglesconv/EaglesParser.hs b/contrib/eaglesconv/EaglesParser.hs
new file mode 100644
index 000000000..6fc64d3b8
--- /dev/null
+++ b/contrib/eaglesconv/EaglesParser.hs
@@ -0,0 +1,239 @@
+-- Copyright (C) 2011 Nikita Frolov
+
+-- An early version of the parser that requires somewhat more memory. Kept for
+-- nostalgic reasons.
+
+module EaglesParser where
+
+import qualified Data.Text as T
+import Data.List
+import qualified Data.Map as M
+
+type Forms = M.Map Tag T.Text
+
+data Tag = A Case Number Gender Animacy Form Degree Extra Obscene
+ | Adv Degree Extra Obscene
+ | AdvPron Extra
+ | Ord Case Number Gender Animacy
+ | AdjPron Case Number Gender Animacy Extra
+ | Frag Extra
+ | Conj Extra
+ | Inter Extra Obscene
+ | Num Case Number Gender Animacy Extra
+ | Part Extra
+ | Prep Extra
+ | N Case Number Gender Animacy Name Extra Obscene
+ | Pron Case Number Gender Animacy Extra
+ | V Mood Number Gender Tense Person Aspect Voice Trans Extra Obscene
+ | P Case Number Gender Tense Form Aspect Voice Trans Extra Obscene
+ deriving (Show, Ord, Eq)
+
+parseTag :: T.Text -> Tag
+parseTag tag = case (T.unpack tag) of {
+ ('A':c:n:g:a:f:cmp:e:o:[]) -> A (readCase c) (readNumber n)
+ (readGender g) (readAnimacy a)
+ (readForm f) (readDegree cmp)
+ (readExtra e) (readObscene o) ;
+ ('D':cmp:e:o:[]) -> Adv (readDegree cmp)
+ (readExtra e) (readObscene o) ;
+ ('P':e:[]) -> AdvPron (readExtra e) ;
+ ('Y':c:n:g:a:[]) -> Ord (readCase c) (readNumber n)
+ (readGender g) (readAnimacy a) ;
+ ('R':c:n:g:a:e:[]) -> AdjPron (readCase c) (readNumber n)
+ (readGender g) (readAnimacy a) (readExtra e) ;
+ ('M':e:[]) -> Frag (readExtra e) ;
+ ('C':e:[]) -> Conj (readExtra e) ;
+ ('J':e:o:[]) -> Inter (readExtra e) (readObscene o) ;
+ ('Z':c:n:g:a:e:[]) -> Num (readCase c) (readNumber n)
+ (readGender g) (readAnimacy a) (readExtra e) ;
+ ('T':e:[]) -> Part (readExtra e) ;
+ ('B':e:[]) -> Prep (readExtra e) ;
+ ('N':_:c:n:g:a:name:e:o:_:[]) -> N (readCase c) (readNumber n)
+ (readGender g) (readAnimacy a)
+ (readName name)
+ (readExtra e) (readObscene o) ;
+ ('N':_:c:n:g:a:name:e:o:[]) -> N (readCase c) (readNumber n)
+ (readGender g) (readAnimacy a)
+ (readName name)
+ (readExtra e) (readObscene o) ;
+ ('E':c:n:g:a:e:[]) -> Pron (readCase c) (readNumber n)
+ (readGender g) (readAnimacy a) (readExtra e) ;
+ ('V':m:n:g:t:'P':p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
+ (readGender g) (readTense t)
+ (readPerson p) (readAspect a)
+ (readVoice v) (readTrans tr)
+ (readExtra e) (readObscene o) ;
+ ('V':m:n:g:t:'0':a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
+ (readGender g) (readTense t)
+ NP (readAspect a)
+ (readVoice v) (readTrans tr)
+ (readExtra e) (readObscene o) ;
+ ('V':m:n:g:t:p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
+ (readGender g) (readTense t)
+ (readPerson p) (readAspect a)
+ (readVoice v) (readTrans tr)
+ (readExtra e) (readObscene o) ;
+ ('Q':c:n:g:t:f:a:v:tr:e:o:[]) -> P (readCase c) (readNumber n)
+ (readGender g) (readTense t)
+ (readForm f) (readAspect a)
+ (readVoice v) (readTrans tr)
+ (readExtra e) (readObscene o) ;
+ _ -> error $ "Parse error: " ++ T.unpack tag }
+
+data Case = Nom | Gen | Dat | Acc | Inst | Prepos | Partit | Loc | Voc | NC
+ deriving (Show, Ord, Eq)
+
+readCase 'N' = Nom
+readCase 'G' = Gen
+readCase 'D' = Dat
+readCase 'F' = Acc
+readCase 'C' = Inst
+readCase 'O' = Prepos
+readCase 'P' = Partit
+readCase 'L' = Loc
+readCase 'V' = Voc
+readCase '0' = NC
+
+data Number = Sg | Pl | NN deriving (Show, Ord, Eq)
+
+readNumber 'S' = Sg
+readNumber 'P' = Pl
+readNumber '0' = NN
+
+data Gender = Masc | Fem | Neut | Common | NG deriving (Show, Ord, Eq)
+
+readGender 'F' = Fem
+readGender 'M' = Masc
+readGender 'A' = Neut
+readGender 'C' = Common
+readGender '0' = NG
+
+data Animacy = Animate | Inanimate | NA deriving (Show, Ord, Eq)
+
+readAnimacy 'A' = Animate
+readAnimacy 'I' = Inanimate
+readAnimacy '0' = NA
+
+data Form = Short | Full | NF deriving (Show, Ord, Eq)
+
+readForm 'S' = Short
+readForm 'F' = Full
+readForm '0' = NF
+
+data Degree = Pos | Comp | Super | ND deriving (Show, Ord, Eq)
+
+readDegree 'E' = Super
+readDegree 'C' = Comp
+readDegree 'P' = Pos
+readDegree '0' = ND
+
+data Extra = Introductory | Difficult | Distorted | Predicative
+ | Colloquial | Rare | Abbreviation | Obsolete | NE deriving (Show, Ord, Eq)
+
+readExtra 'P' = Introductory
+readExtra 'D' = Difficult
+readExtra 'V' = Distorted
+readExtra 'R' = Predicative
+readExtra 'I' = Colloquial
+readExtra 'A' = Rare
+readExtra 'B' = Abbreviation
+readExtra 'E' = Obsolete
+readExtra '0' = NE
+
+data Obscene = Obscene | NO deriving (Show, Ord, Eq)
+
+readObscene 'H' = Obscene
+readObscene '0' = NO
+
+data Name = Topo | Proper | Patro | Family | NNa deriving (Show, Ord, Eq)
+
+readName 'G' = Topo
+readName 'N' = Proper
+readName 'S' = Patro
+readName 'F' = Family
+readName '0' = NNa
+
+data Mood = Gerund | Inf | Ind | Imp | NM deriving (Show, Ord, Eq)
+
+readMood 'G' = Gerund
+readMood 'I' = Inf
+readMood 'D' = Ind
+readMood 'M' = Imp
+readMood '0' = NM
+
+data Tense = Pres | Fut | Past | NT deriving (Show, Ord, Eq)
+
+readTense 'P' = Pres
+readTense 'F' = Fut
+readTense 'S' = Past
+readTense '0' = NT
+
+data Person = P1 | P2 | P3 | NP deriving (Show, Ord, Eq)
+
+readPerson '1' = P1
+readPerson '2' = P2
+readPerson '3' = P3
+
+data Aspect = Perf | Imperf | NAs deriving (Show, Ord, Eq)
+
+readAspect 'F' = Perf
+readAspect 'N' = Imperf
+readAspect '0' = NAs
+
+data Voice = Act | Pass | NV deriving (Show, Ord, Eq)
+
+readVoice 'A' = Act
+readVoice 'S' = Pass
+readVoice '0' = NV
+
+data Trans = Trans | Intrans | NTr deriving (Show, Ord, Eq)
+
+readTrans 'M' = Trans
+readTrans 'A' = Intrans
+readTrans '0' = NTr
+
+isOpenCat :: Tag -> Bool
+isOpenCat (A _ _ _ _ _ _ _ _) = True
+isOpenCat (N _ _ _ _ _ _ _) = True
+isOpenCat (V _ _ _ _ _ _ _ _ _ _) = True
+isOpenCat (Adv _ _ _) = True
+isOpenCat _ = False
+
+noun :: Forms -> (Case, Number) -> T.Text
+noun forms (c, n) = findForm matchNoun forms
+ where matchNoun (N c' n' _ _ _ _ _) = c == c' && n == n'
+ matchNoun _ = False
+
+adj :: Forms -> Degree -> T.Text
+adj forms d = findForm matchAdj forms
+ where matchAdj (A _ _ _ _ _ d' _ _) = d == d
+ matchAdj _ = False
+
+verbPres :: Forms -> (Number, Person) -> T.Text
+verbPres forms (n, p) = findForm matchPres forms
+ where matchPres (V Ind n' _ Pres p' _ Act _ _ _) = n == n' && p == p'
+ matchPres _ = False
+
+verbPast :: Forms -> (Number, Gender) -> T.Text
+verbPast forms (n, g) = findForm matchPast forms
+ where matchPast (V Ind n' g' Past _ _ Act _ _ _) = n == n' && g == g'
+ matchPast _ = False
+
+verbImp :: Forms -> T.Text
+verbImp forms = findForm matchImp forms
+ where matchImp (V Imp _ _ _ _ _ _ _ _ _) = True
+ matchImp _ = False
+
+verbInf :: Forms -> T.Text
+verbInf forms = findForm matchInf forms
+ where matchInf (V Inf _ _ _ _ _ _ _ _ _) = True
+ matchInf _ = False
+
+adv :: Forms -> T.Text
+adv forms = findForm matchAdv forms
+ where matchAdv (Adv d _ _) = d == Pos
+ matchAdv _ = False
+
+findForm match forms = case find match (M.keys forms) of
+ Just tag -> forms M.! tag
+ Nothing -> findForm (\ _ -> True) forms \ No newline at end of file
diff --git a/contrib/eaglesconv/README b/contrib/eaglesconv/README
new file mode 100644
index 000000000..e3c84c61d
--- /dev/null
+++ b/contrib/eaglesconv/README
@@ -0,0 +1,24 @@
+How to use:
+
+1) Sort the wordlist so it can be split into sublists. It is necessary because
+the converter is quite memory-hungry, and you might not have enough RAM to
+process the whole wordlist at once.
+
+./CollectLemmas dicc.src | sort > lemmas.src
+
+2) Split the sorted wordlist.
+
+split -l 500000 lemmas.src
+
+3) Splitting has probably left forms of some lemmas spread across two
+sublists. Manually edit sublists so all forms for a lemma are present in just
+one sublist.
+
+4) Run the converter.
+
+./run_conv.sh xa*
+
+5) The converter has produced abstract and concrete syntaxes for the
+dictionary. You can try them out with GF:
+
+gf DictRus.gf \ No newline at end of file
diff --git a/contrib/eaglesconv/mkAbstract.sh b/contrib/eaglesconv/mkAbstract.sh
new file mode 100644
index 000000000..d07da18fc
--- /dev/null
+++ b/contrib/eaglesconv/mkAbstract.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+echo "abstract DictRusAbs = Cat ** {
+"
+cat $1 | sed 's/^lin/fun/g;s/=.*$//g;s/\_N/\_N : N\;/g;s/\_PN/\_PN : PN\;/g;s/\_A /\_A : A\;/g;s/\_V/\_V : V\;/g;s/\_Adv/\_Adv : Adv\;/g'
+
+echo "
+}" \ No newline at end of file
diff --git a/contrib/eaglesconv/mkConcrete.sh b/contrib/eaglesconv/mkConcrete.sh
new file mode 100644
index 000000000..170ab9c5e
--- /dev/null
+++ b/contrib/eaglesconv/mkConcrete.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+echo "--# -path=.:../prelude:../abstract:../common
+
+concrete DictRus of DictRusAbs = CatRus **
+ open ParadigmsRus, Prelude, StructuralRus, MorphoRus in {
+flags
+ optimize=values ;
+ coding=utf8 ;
+"
+cat $1
+echo "}"
diff --git a/contrib/eaglesconv/run_conv.sh b/contrib/eaglesconv/run_conv.sh
new file mode 100644
index 000000000..5ad586834
--- /dev/null
+++ b/contrib/eaglesconv/run_conv.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+./EaglesConv "$@" +RTS -K256M -RTS > convtmp
+./mkConcrete.sh convtmp > DictRus.gf
+./mkAbstract.sh convtmp > DictRusAbs.gf