summaryrefslogtreecommitdiff
path: root/contrib/eaglesconv
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/eaglesconv')
-rw-r--r--contrib/eaglesconv/CollectLemmas.hs28
-rw-r--r--contrib/eaglesconv/EaglesConv.hs135
-rw-r--r--contrib/eaglesconv/EaglesMatcher.hs63
-rw-r--r--contrib/eaglesconv/EaglesParser.hs239
-rw-r--r--contrib/eaglesconv/README24
-rw-r--r--contrib/eaglesconv/mkAbstract.sh8
-rw-r--r--contrib/eaglesconv/mkConcrete.sh12
-rw-r--r--contrib/eaglesconv/run_conv.sh4
8 files changed, 0 insertions, 513 deletions
diff --git a/contrib/eaglesconv/CollectLemmas.hs b/contrib/eaglesconv/CollectLemmas.hs
deleted file mode 100644
index a63e7e1a8..000000000
--- a/contrib/eaglesconv/CollectLemmas.hs
+++ /dev/null
@@ -1,28 +0,0 @@
--- Copyright (C) 2011 Nikita Frolov
-
-import qualified Data.Text as T
-import qualified Data.Text.IO as UTF8
-import System.IO
-import System.Environment
-import Control.Monad
-import Control.Monad.State
-
-main :: IO ()
-main = do
- args <- getArgs
- forM_ args $ \ f -> do
- entries <- UTF8.readFile f >>= (return . T.lines)
- forM_ entries $ \ entry ->
- do
- let ws = T.words entry
- form = head ws
- tags = toPairs $ tail ws
- forM_ tags $ \ (lemma, tag) ->
- do
- UTF8.putStrLn $ T.concat [lemma, sp, form, sp, tag]
- where sp = T.singleton ' '
-
-
-toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs))
- where stride _ [] = []
- stride n (x:xs) = x : stride n (drop (n-1) xs)
diff --git a/contrib/eaglesconv/EaglesConv.hs b/contrib/eaglesconv/EaglesConv.hs
deleted file mode 100644
index aa8929496..000000000
--- a/contrib/eaglesconv/EaglesConv.hs
+++ /dev/null
@@ -1,135 +0,0 @@
--- Copyright (C) 2011 Nikita Frolov
-
--- No, we can't pipeline parsing and generation, because there is no guarantee
--- that we have collected all forms for a lemma before we've scanned the
--- complete file.
-
-import qualified Data.Text as T
-import qualified Data.Text.IO as UTF8
-import System.IO
-import System.Environment
-import Control.Monad
-import Control.Monad.State
-import qualified Data.Map as M
-import Codec.Text.IConv
-import qualified Data.ByteString.Lazy as BS
-import qualified Data.ByteString.Internal as BSI
-
-import EaglesMatcher
-
-type Lemmas = M.Map T.Text Forms
-
-main :: IO ()
-main = do
- args <- getArgs
- forM_ args $ \ f -> do
- entries <- UTF8.readFile f >>= (return . T.lines)
- lemmas <- return $ execState (collectLemmas entries) (M.empty :: Lemmas)
- mapM_ generateLin (M.assocs lemmas)
-
-collectLemmas entries = do
- forM_ entries $ \ entry -> do
- let ws = T.words entry
- lemma = head ws
- tags = toPairs $ tail ws
- lemmas <- get
- forM_ tags $ \ (form, tag) -> do
- let forms = (case M.lookup lemma lemmas of
- Just f -> f
- Nothing -> M.empty) :: Forms
- if isOpenCat . T.unpack $ tag
- then put $ M.insert lemma (M.insert tag form forms) lemmas
- else return ()
-
-generateLin :: (T.Text, Forms) -> IO ()
-generateLin (lemma, forms) = do
- let lemma' = myVeryOwnCyrillicRomanizationIConvSucks lemma
- UTF8.putStr $ T.concat [T.pack "lin ", lemma']
- UTF8.putStr $ case T.unpack . head . M.keys $ forms of
- ('N':_:_:_:g:a:'0':_) ->
- T.concat $ [T.pack "_N = mkN "]
- ++ map (quote . noun forms) [ ('N','S'), ('G','S')
- , ('D','S'), ('F','S'), ('C','S'), ('O','S')
- , ('L','S'), ('N','P'), ('G','P'), ('D','P')
- , ('F','P'), ('C','P'), ('O','P') ]
- ++ [showG g, sp, showAni a, ln]
- ('N':_:c:n:g:a:_) ->
- T.concat $ [T.pack "_PN = mkPN "
- , quote $ noun forms ('N', 'S')
- , showG g, sp
- , showN n, sp, showAni a, ln]
- ('A':_) ->
- T.concat $ [T.pack "_A = mkA ", quote $ adj forms 'P',
- if adj forms 'P' /= adj forms 'C'
- then quote $ adj forms 'C'
- else T.pack ""
- , ln]
- ('V':t) ->
- let a = case t of
- (_:_:_:_:'P':_:a':_) -> a'
- (_:_:_:_:_:a':_) -> a'
- in
- T.concat $ [T.pack "_V = mkV ", showAsp a, sp]
- ++ map (quote . verbPres forms) [ ('S','1'), ('S','2')
- , ('S','3'), ('P','1')
- , ('P','2'), ('P','3')]
- ++ [ quote $ verbPast forms ('S', 'M')
- , quote $ verbImp forms, quote $ verbInf forms, ln]
- ('D':_) ->
- T.concat $ [T.pack "_Adv = mkAdv "
- , quote . adv $ forms, ln]
- putStrLn ""
- hFlush stdout
- where quote x = T.concat [T.pack "\"", x, T.pack "\" "]
- showG 'F' = T.pack "Fem"
- showG 'A' = T.pack "Neut"
- showG _ = T.pack "Masc"
- showAni 'I' = T.pack "Inanimate"
- showAni _ = T.pack "Animate"
- showN 'P' = T.pack "Pl"
- showN _ = T.pack "Sg"
- showAsp 'F' = T.pack "Perfective"
- showAsp _ = T.pack "Imperfective"
- sp = T.singleton ' '
- ln = T.pack " ;"
-
-toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs))
- where stride _ [] = []
- stride n (x:xs) = x : stride n (drop (n-1) xs)
-
-myVeryOwnCyrillicRomanizationIConvSucks s = T.pack . concatMap r . T.unpack $ s
- where r 'а' = "a"
- r 'б' = "b"
- r 'в' = "v"
- r 'г' = "g"
- r 'д' = "d"
- r 'е' = "je"
- r 'ё' = "jo"
- r 'ж' = "zh"
- r 'з' = "z"
- r 'и' = "i"
- r 'й' = "jj"
- r 'к' = "k"
- r 'л' = "l"
- r 'м' = "m"
- r 'н' = "n"
- r 'о' = "o"
- r 'п' = "p"
- r 'р' = "r"
- r 'с' = "s"
- r 'т' = "t"
- r 'у' = "u"
- r 'ф' = "f"
- r 'х' = "kh"
- r 'ц' = "c"
- r 'ч' = "ch"
- r 'ш' = "sh"
- r 'щ' = "shc"
- r 'ъ' = "yy"
- r 'ы' = "y"
- r 'ь' = "q"
- r 'э' = "e"
- r 'ю' = "ju"
- r 'я' = "ja"
- r '-' = "_"
- r o = [o]
diff --git a/contrib/eaglesconv/EaglesMatcher.hs b/contrib/eaglesconv/EaglesMatcher.hs
deleted file mode 100644
index 27e76706f..000000000
--- a/contrib/eaglesconv/EaglesMatcher.hs
+++ /dev/null
@@ -1,63 +0,0 @@
--- Copyright (C) 2011 Nikita Frolov
-
--- The format specification can be found at
--- http://devel.cpl.upc.edu/freeling/svn/trunk/doc/tagsets/tagset-ru.html
-
--- Bugs in the specification:
--- Participle, 2nd field: case, not mood
--- Participle, 6th field: field, not person
--- Verb, persons can be denoted both with 'Pnumber' or just 'number'
--- Noun, 10th field can be absent
-
--- No, it wouldn't be simpler to implement this grammar with Parsec or another
--- parser combinator library.
-
-
-module EaglesMatcher where
-
-import qualified Data.Text as T
-import Data.List
-import qualified Data.Map as M
-
-type Forms = M.Map T.Text T.Text
-
-isOpenCat ('A':_) = True
-isOpenCat ('N':_) = True
-isOpenCat ('V':_) = True
-isOpenCat ('D':_) = True
-isOpenCat _ = False
-
-noun forms (c, n) = findForm (matchNoun . T.unpack) forms
- where matchNoun ('N':_:c':n':_) = c == c' && n == n'
- matchNoun _ = False
-
-adj forms d = findForm (matchAdj . T.unpack) forms
- where matchAdj ('A':'N':'S':'M':_:'F':d':_) = d == d
- matchAdj _ = False
-
-verbPres forms (n, p) = findForm (matchPres . T.unpack) forms
- where matchPres ('V':'D':n':_:'P':'P':p':_:'A':_) = n == n' && p == p'
- matchPres ('V':'D':n':_:'F':'P':p':_:'A':_) = n == n' && p == p'
- matchPres ('V':'D':n':_:'P':'P':p':_) = n == n' && p == p'
- matchPres ('V':'D':n':_:'F':'P':p':_) = n == n' && p == p'
- matchPres _ = False
-
-verbPast forms (n, g) = findForm (matchPast . T.unpack) forms
- where matchPast ('V':'D':n':g':'S':_:_:'A':_) = n == n' && g == g'
- matchPast _ = False
-
-verbImp forms = findForm (matchImp . T.unpack) forms
- where matchImp ('V':'M':_) = True
- matchImp _ = False
-
-verbInf forms = findForm (matchInf . T.unpack) forms
- where matchInf ('V':'I':_) = True
- matchInf _ = False
-
-adv forms = findForm (matchAdv . T.unpack) forms
- where matchAdv ('D':d:_) = d == 'P'
- matchAdv _ = False
-
-findForm match forms = case find match (M.keys forms) of
- Just tag -> forms M.! tag
- Nothing -> findForm (\ _ -> True) forms
diff --git a/contrib/eaglesconv/EaglesParser.hs b/contrib/eaglesconv/EaglesParser.hs
deleted file mode 100644
index 6fc64d3b8..000000000
--- a/contrib/eaglesconv/EaglesParser.hs
+++ /dev/null
@@ -1,239 +0,0 @@
--- Copyright (C) 2011 Nikita Frolov
-
--- An early version of the parser that requires somewhat more memory. Kept for
--- nostalgic reasons.
-
-module EaglesParser where
-
-import qualified Data.Text as T
-import Data.List
-import qualified Data.Map as M
-
-type Forms = M.Map Tag T.Text
-
-data Tag = A Case Number Gender Animacy Form Degree Extra Obscene
- | Adv Degree Extra Obscene
- | AdvPron Extra
- | Ord Case Number Gender Animacy
- | AdjPron Case Number Gender Animacy Extra
- | Frag Extra
- | Conj Extra
- | Inter Extra Obscene
- | Num Case Number Gender Animacy Extra
- | Part Extra
- | Prep Extra
- | N Case Number Gender Animacy Name Extra Obscene
- | Pron Case Number Gender Animacy Extra
- | V Mood Number Gender Tense Person Aspect Voice Trans Extra Obscene
- | P Case Number Gender Tense Form Aspect Voice Trans Extra Obscene
- deriving (Show, Ord, Eq)
-
-parseTag :: T.Text -> Tag
-parseTag tag = case (T.unpack tag) of {
- ('A':c:n:g:a:f:cmp:e:o:[]) -> A (readCase c) (readNumber n)
- (readGender g) (readAnimacy a)
- (readForm f) (readDegree cmp)
- (readExtra e) (readObscene o) ;
- ('D':cmp:e:o:[]) -> Adv (readDegree cmp)
- (readExtra e) (readObscene o) ;
- ('P':e:[]) -> AdvPron (readExtra e) ;
- ('Y':c:n:g:a:[]) -> Ord (readCase c) (readNumber n)
- (readGender g) (readAnimacy a) ;
- ('R':c:n:g:a:e:[]) -> AdjPron (readCase c) (readNumber n)
- (readGender g) (readAnimacy a) (readExtra e) ;
- ('M':e:[]) -> Frag (readExtra e) ;
- ('C':e:[]) -> Conj (readExtra e) ;
- ('J':e:o:[]) -> Inter (readExtra e) (readObscene o) ;
- ('Z':c:n:g:a:e:[]) -> Num (readCase c) (readNumber n)
- (readGender g) (readAnimacy a) (readExtra e) ;
- ('T':e:[]) -> Part (readExtra e) ;
- ('B':e:[]) -> Prep (readExtra e) ;
- ('N':_:c:n:g:a:name:e:o:_:[]) -> N (readCase c) (readNumber n)
- (readGender g) (readAnimacy a)
- (readName name)
- (readExtra e) (readObscene o) ;
- ('N':_:c:n:g:a:name:e:o:[]) -> N (readCase c) (readNumber n)
- (readGender g) (readAnimacy a)
- (readName name)
- (readExtra e) (readObscene o) ;
- ('E':c:n:g:a:e:[]) -> Pron (readCase c) (readNumber n)
- (readGender g) (readAnimacy a) (readExtra e) ;
- ('V':m:n:g:t:'P':p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
- (readGender g) (readTense t)
- (readPerson p) (readAspect a)
- (readVoice v) (readTrans tr)
- (readExtra e) (readObscene o) ;
- ('V':m:n:g:t:'0':a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
- (readGender g) (readTense t)
- NP (readAspect a)
- (readVoice v) (readTrans tr)
- (readExtra e) (readObscene o) ;
- ('V':m:n:g:t:p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
- (readGender g) (readTense t)
- (readPerson p) (readAspect a)
- (readVoice v) (readTrans tr)
- (readExtra e) (readObscene o) ;
- ('Q':c:n:g:t:f:a:v:tr:e:o:[]) -> P (readCase c) (readNumber n)
- (readGender g) (readTense t)
- (readForm f) (readAspect a)
- (readVoice v) (readTrans tr)
- (readExtra e) (readObscene o) ;
- _ -> error $ "Parse error: " ++ T.unpack tag }
-
-data Case = Nom | Gen | Dat | Acc | Inst | Prepos | Partit | Loc | Voc | NC
- deriving (Show, Ord, Eq)
-
-readCase 'N' = Nom
-readCase 'G' = Gen
-readCase 'D' = Dat
-readCase 'F' = Acc
-readCase 'C' = Inst
-readCase 'O' = Prepos
-readCase 'P' = Partit
-readCase 'L' = Loc
-readCase 'V' = Voc
-readCase '0' = NC
-
-data Number = Sg | Pl | NN deriving (Show, Ord, Eq)
-
-readNumber 'S' = Sg
-readNumber 'P' = Pl
-readNumber '0' = NN
-
-data Gender = Masc | Fem | Neut | Common | NG deriving (Show, Ord, Eq)
-
-readGender 'F' = Fem
-readGender 'M' = Masc
-readGender 'A' = Neut
-readGender 'C' = Common
-readGender '0' = NG
-
-data Animacy = Animate | Inanimate | NA deriving (Show, Ord, Eq)
-
-readAnimacy 'A' = Animate
-readAnimacy 'I' = Inanimate
-readAnimacy '0' = NA
-
-data Form = Short | Full | NF deriving (Show, Ord, Eq)
-
-readForm 'S' = Short
-readForm 'F' = Full
-readForm '0' = NF
-
-data Degree = Pos | Comp | Super | ND deriving (Show, Ord, Eq)
-
-readDegree 'E' = Super
-readDegree 'C' = Comp
-readDegree 'P' = Pos
-readDegree '0' = ND
-
-data Extra = Introductory | Difficult | Distorted | Predicative
- | Colloquial | Rare | Abbreviation | Obsolete | NE deriving (Show, Ord, Eq)
-
-readExtra 'P' = Introductory
-readExtra 'D' = Difficult
-readExtra 'V' = Distorted
-readExtra 'R' = Predicative
-readExtra 'I' = Colloquial
-readExtra 'A' = Rare
-readExtra 'B' = Abbreviation
-readExtra 'E' = Obsolete
-readExtra '0' = NE
-
-data Obscene = Obscene | NO deriving (Show, Ord, Eq)
-
-readObscene 'H' = Obscene
-readObscene '0' = NO
-
-data Name = Topo | Proper | Patro | Family | NNa deriving (Show, Ord, Eq)
-
-readName 'G' = Topo
-readName 'N' = Proper
-readName 'S' = Patro
-readName 'F' = Family
-readName '0' = NNa
-
-data Mood = Gerund | Inf | Ind | Imp | NM deriving (Show, Ord, Eq)
-
-readMood 'G' = Gerund
-readMood 'I' = Inf
-readMood 'D' = Ind
-readMood 'M' = Imp
-readMood '0' = NM
-
-data Tense = Pres | Fut | Past | NT deriving (Show, Ord, Eq)
-
-readTense 'P' = Pres
-readTense 'F' = Fut
-readTense 'S' = Past
-readTense '0' = NT
-
-data Person = P1 | P2 | P3 | NP deriving (Show, Ord, Eq)
-
-readPerson '1' = P1
-readPerson '2' = P2
-readPerson '3' = P3
-
-data Aspect = Perf | Imperf | NAs deriving (Show, Ord, Eq)
-
-readAspect 'F' = Perf
-readAspect 'N' = Imperf
-readAspect '0' = NAs
-
-data Voice = Act | Pass | NV deriving (Show, Ord, Eq)
-
-readVoice 'A' = Act
-readVoice 'S' = Pass
-readVoice '0' = NV
-
-data Trans = Trans | Intrans | NTr deriving (Show, Ord, Eq)
-
-readTrans 'M' = Trans
-readTrans 'A' = Intrans
-readTrans '0' = NTr
-
-isOpenCat :: Tag -> Bool
-isOpenCat (A _ _ _ _ _ _ _ _) = True
-isOpenCat (N _ _ _ _ _ _ _) = True
-isOpenCat (V _ _ _ _ _ _ _ _ _ _) = True
-isOpenCat (Adv _ _ _) = True
-isOpenCat _ = False
-
-noun :: Forms -> (Case, Number) -> T.Text
-noun forms (c, n) = findForm matchNoun forms
- where matchNoun (N c' n' _ _ _ _ _) = c == c' && n == n'
- matchNoun _ = False
-
-adj :: Forms -> Degree -> T.Text
-adj forms d = findForm matchAdj forms
- where matchAdj (A _ _ _ _ _ d' _ _) = d == d
- matchAdj _ = False
-
-verbPres :: Forms -> (Number, Person) -> T.Text
-verbPres forms (n, p) = findForm matchPres forms
- where matchPres (V Ind n' _ Pres p' _ Act _ _ _) = n == n' && p == p'
- matchPres _ = False
-
-verbPast :: Forms -> (Number, Gender) -> T.Text
-verbPast forms (n, g) = findForm matchPast forms
- where matchPast (V Ind n' g' Past _ _ Act _ _ _) = n == n' && g == g'
- matchPast _ = False
-
-verbImp :: Forms -> T.Text
-verbImp forms = findForm matchImp forms
- where matchImp (V Imp _ _ _ _ _ _ _ _ _) = True
- matchImp _ = False
-
-verbInf :: Forms -> T.Text
-verbInf forms = findForm matchInf forms
- where matchInf (V Inf _ _ _ _ _ _ _ _ _) = True
- matchInf _ = False
-
-adv :: Forms -> T.Text
-adv forms = findForm matchAdv forms
- where matchAdv (Adv d _ _) = d == Pos
- matchAdv _ = False
-
-findForm match forms = case find match (M.keys forms) of
- Just tag -> forms M.! tag
- Nothing -> findForm (\ _ -> True) forms \ No newline at end of file
diff --git a/contrib/eaglesconv/README b/contrib/eaglesconv/README
deleted file mode 100644
index e3c84c61d..000000000
--- a/contrib/eaglesconv/README
+++ /dev/null
@@ -1,24 +0,0 @@
-How to use:
-
-1) Sort the wordlist so it can be split into sublists. It is necessary because
-the converter is quite memory-hungry, and you might not have enough RAM to
-process the whole wordlist at once.
-
-./CollectLemmas dicc.src | sort > lemmas.src
-
-2) Split the sorted wordlist.
-
-split -l 500000 lemmas.src
-
-3) Splitting has probably left forms of some lemmas spread across two
-sublists. Manually edit sublists so all forms for a lemma are present in just
-one sublist.
-
-4) Run the converter.
-
-./run_conv.sh xa*
-
-5) The converter has produced abstract and concrete syntaxes for the
-dictionary. You can try them out with GF:
-
-gf DictRus.gf \ No newline at end of file
diff --git a/contrib/eaglesconv/mkAbstract.sh b/contrib/eaglesconv/mkAbstract.sh
deleted file mode 100644
index d07da18fc..000000000
--- a/contrib/eaglesconv/mkAbstract.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-
-echo "abstract DictRusAbs = Cat ** {
-"
-cat $1 | sed 's/^lin/fun/g;s/=.*$//g;s/\_N/\_N : N\;/g;s/\_PN/\_PN : PN\;/g;s/\_A /\_A : A\;/g;s/\_V/\_V : V\;/g;s/\_Adv/\_Adv : Adv\;/g'
-
-echo "
-}" \ No newline at end of file
diff --git a/contrib/eaglesconv/mkConcrete.sh b/contrib/eaglesconv/mkConcrete.sh
deleted file mode 100644
index 170ab9c5e..000000000
--- a/contrib/eaglesconv/mkConcrete.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh
-
-echo "--# -path=.:../prelude:../abstract:../common
-
-concrete DictRus of DictRusAbs = CatRus **
- open ParadigmsRus, Prelude, StructuralRus, MorphoRus in {
-flags
- optimize=values ;
- coding=utf8 ;
-"
-cat $1
-echo "}"
diff --git a/contrib/eaglesconv/run_conv.sh b/contrib/eaglesconv/run_conv.sh
deleted file mode 100644
index 5ad586834..000000000
--- a/contrib/eaglesconv/run_conv.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-./EaglesConv "$@" +RTS -K256M -RTS > convtmp
-./mkConcrete.sh convtmp > DictRus.gf
-./mkAbstract.sh convtmp > DictRusAbs.gf