diff options
Diffstat (limited to 'contrib/eaglesconv')
| -rw-r--r-- | contrib/eaglesconv/CollectLemmas.hs | 28 | ||||
| -rw-r--r-- | contrib/eaglesconv/EaglesConv.hs | 135 | ||||
| -rw-r--r-- | contrib/eaglesconv/EaglesMatcher.hs | 63 | ||||
| -rw-r--r-- | contrib/eaglesconv/EaglesParser.hs | 239 | ||||
| -rw-r--r-- | contrib/eaglesconv/README | 24 | ||||
| -rw-r--r-- | contrib/eaglesconv/mkAbstract.sh | 8 | ||||
| -rw-r--r-- | contrib/eaglesconv/mkConcrete.sh | 12 | ||||
| -rw-r--r-- | contrib/eaglesconv/run_conv.sh | 4 |
8 files changed, 0 insertions, 513 deletions
diff --git a/contrib/eaglesconv/CollectLemmas.hs b/contrib/eaglesconv/CollectLemmas.hs deleted file mode 100644 index a63e7e1a8..000000000 --- a/contrib/eaglesconv/CollectLemmas.hs +++ /dev/null @@ -1,28 +0,0 @@ --- Copyright (C) 2011 Nikita Frolov - -import qualified Data.Text as T -import qualified Data.Text.IO as UTF8 -import System.IO -import System.Environment -import Control.Monad -import Control.Monad.State - -main :: IO () -main = do - args <- getArgs - forM_ args $ \ f -> do - entries <- UTF8.readFile f >>= (return . T.lines) - forM_ entries $ \ entry -> - do - let ws = T.words entry - form = head ws - tags = toPairs $ tail ws - forM_ tags $ \ (lemma, tag) -> - do - UTF8.putStrLn $ T.concat [lemma, sp, form, sp, tag] - where sp = T.singleton ' ' - - -toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs)) - where stride _ [] = [] - stride n (x:xs) = x : stride n (drop (n-1) xs) diff --git a/contrib/eaglesconv/EaglesConv.hs b/contrib/eaglesconv/EaglesConv.hs deleted file mode 100644 index aa8929496..000000000 --- a/contrib/eaglesconv/EaglesConv.hs +++ /dev/null @@ -1,135 +0,0 @@ --- Copyright (C) 2011 Nikita Frolov - --- No, we can't pipeline parsing and generation, because there is no guarantee --- that we have collected all forms for a lemma before we've scanned the --- complete file. - -import qualified Data.Text as T -import qualified Data.Text.IO as UTF8 -import System.IO -import System.Environment -import Control.Monad -import Control.Monad.State -import qualified Data.Map as M -import Codec.Text.IConv -import qualified Data.ByteString.Lazy as BS -import qualified Data.ByteString.Internal as BSI - -import EaglesMatcher - -type Lemmas = M.Map T.Text Forms - -main :: IO () -main = do - args <- getArgs - forM_ args $ \ f -> do - entries <- UTF8.readFile f >>= (return . T.lines) - lemmas <- return $ execState (collectLemmas entries) (M.empty :: Lemmas) - mapM_ generateLin (M.assocs lemmas) - -collectLemmas entries = do - forM_ entries $ \ entry -> do - let ws = T.words entry - lemma = head ws - tags = toPairs $ tail ws - lemmas <- get - forM_ tags $ \ (form, tag) -> do - let forms = (case M.lookup lemma lemmas of - Just f -> f - Nothing -> M.empty) :: Forms - if isOpenCat . T.unpack $ tag - then put $ M.insert lemma (M.insert tag form forms) lemmas - else return () - -generateLin :: (T.Text, Forms) -> IO () -generateLin (lemma, forms) = do - let lemma' = myVeryOwnCyrillicRomanizationIConvSucks lemma - UTF8.putStr $ T.concat [T.pack "lin ", lemma'] - UTF8.putStr $ case T.unpack . head . M.keys $ forms of - ('N':_:_:_:g:a:'0':_) -> - T.concat $ [T.pack "_N = mkN "] - ++ map (quote . noun forms) [ ('N','S'), ('G','S') - , ('D','S'), ('F','S'), ('C','S'), ('O','S') - , ('L','S'), ('N','P'), ('G','P'), ('D','P') - , ('F','P'), ('C','P'), ('O','P') ] - ++ [showG g, sp, showAni a, ln] - ('N':_:c:n:g:a:_) -> - T.concat $ [T.pack "_PN = mkPN " - , quote $ noun forms ('N', 'S') - , showG g, sp - , showN n, sp, showAni a, ln] - ('A':_) -> - T.concat $ [T.pack "_A = mkA ", quote $ adj forms 'P', - if adj forms 'P' /= adj forms 'C' - then quote $ adj forms 'C' - else T.pack "" - , ln] - ('V':t) -> - let a = case t of - (_:_:_:_:'P':_:a':_) -> a' - (_:_:_:_:_:a':_) -> a' - in - T.concat $ [T.pack "_V = mkV ", showAsp a, sp] - ++ map (quote . verbPres forms) [ ('S','1'), ('S','2') - , ('S','3'), ('P','1') - , ('P','2'), ('P','3')] - ++ [ quote $ verbPast forms ('S', 'M') - , quote $ verbImp forms, quote $ verbInf forms, ln] - ('D':_) -> - T.concat $ [T.pack "_Adv = mkAdv " - , quote . adv $ forms, ln] - putStrLn "" - hFlush stdout - where quote x = T.concat [T.pack "\"", x, T.pack "\" "] - showG 'F' = T.pack "Fem" - showG 'A' = T.pack "Neut" - showG _ = T.pack "Masc" - showAni 'I' = T.pack "Inanimate" - showAni _ = T.pack "Animate" - showN 'P' = T.pack "Pl" - showN _ = T.pack "Sg" - showAsp 'F' = T.pack "Perfective" - showAsp _ = T.pack "Imperfective" - sp = T.singleton ' ' - ln = T.pack " ;" - -toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs)) - where stride _ [] = [] - stride n (x:xs) = x : stride n (drop (n-1) xs) - -myVeryOwnCyrillicRomanizationIConvSucks s = T.pack . concatMap r . T.unpack $ s - where r 'а' = "a" - r 'б' = "b" - r 'в' = "v" - r 'г' = "g" - r 'д' = "d" - r 'е' = "je" - r 'ё' = "jo" - r 'ж' = "zh" - r 'з' = "z" - r 'и' = "i" - r 'й' = "jj" - r 'к' = "k" - r 'л' = "l" - r 'м' = "m" - r 'н' = "n" - r 'о' = "o" - r 'п' = "p" - r 'р' = "r" - r 'с' = "s" - r 'т' = "t" - r 'у' = "u" - r 'ф' = "f" - r 'х' = "kh" - r 'ц' = "c" - r 'ч' = "ch" - r 'ш' = "sh" - r 'щ' = "shc" - r 'ъ' = "yy" - r 'ы' = "y" - r 'ь' = "q" - r 'э' = "e" - r 'ю' = "ju" - r 'я' = "ja" - r '-' = "_" - r o = [o] diff --git a/contrib/eaglesconv/EaglesMatcher.hs b/contrib/eaglesconv/EaglesMatcher.hs deleted file mode 100644 index 27e76706f..000000000 --- a/contrib/eaglesconv/EaglesMatcher.hs +++ /dev/null @@ -1,63 +0,0 @@ --- Copyright (C) 2011 Nikita Frolov - --- The format specification can be found at --- http://devel.cpl.upc.edu/freeling/svn/trunk/doc/tagsets/tagset-ru.html - --- Bugs in the specification: --- Participle, 2nd field: case, not mood --- Participle, 6th field: field, not person --- Verb, persons can be denoted both with 'Pnumber' or just 'number' --- Noun, 10th field can be absent - --- No, it wouldn't be simpler to implement this grammar with Parsec or another --- parser combinator library. - - -module EaglesMatcher where - -import qualified Data.Text as T -import Data.List -import qualified Data.Map as M - -type Forms = M.Map T.Text T.Text - -isOpenCat ('A':_) = True -isOpenCat ('N':_) = True -isOpenCat ('V':_) = True -isOpenCat ('D':_) = True -isOpenCat _ = False - -noun forms (c, n) = findForm (matchNoun . T.unpack) forms - where matchNoun ('N':_:c':n':_) = c == c' && n == n' - matchNoun _ = False - -adj forms d = findForm (matchAdj . T.unpack) forms - where matchAdj ('A':'N':'S':'M':_:'F':d':_) = d == d - matchAdj _ = False - -verbPres forms (n, p) = findForm (matchPres . T.unpack) forms - where matchPres ('V':'D':n':_:'P':'P':p':_:'A':_) = n == n' && p == p' - matchPres ('V':'D':n':_:'F':'P':p':_:'A':_) = n == n' && p == p' - matchPres ('V':'D':n':_:'P':'P':p':_) = n == n' && p == p' - matchPres ('V':'D':n':_:'F':'P':p':_) = n == n' && p == p' - matchPres _ = False - -verbPast forms (n, g) = findForm (matchPast . T.unpack) forms - where matchPast ('V':'D':n':g':'S':_:_:'A':_) = n == n' && g == g' - matchPast _ = False - -verbImp forms = findForm (matchImp . T.unpack) forms - where matchImp ('V':'M':_) = True - matchImp _ = False - -verbInf forms = findForm (matchInf . T.unpack) forms - where matchInf ('V':'I':_) = True - matchInf _ = False - -adv forms = findForm (matchAdv . T.unpack) forms - where matchAdv ('D':d:_) = d == 'P' - matchAdv _ = False - -findForm match forms = case find match (M.keys forms) of - Just tag -> forms M.! tag - Nothing -> findForm (\ _ -> True) forms diff --git a/contrib/eaglesconv/EaglesParser.hs b/contrib/eaglesconv/EaglesParser.hs deleted file mode 100644 index 6fc64d3b8..000000000 --- a/contrib/eaglesconv/EaglesParser.hs +++ /dev/null @@ -1,239 +0,0 @@ --- Copyright (C) 2011 Nikita Frolov - --- An early version of the parser that requires somewhat more memory. Kept for --- nostalgic reasons. - -module EaglesParser where - -import qualified Data.Text as T -import Data.List -import qualified Data.Map as M - -type Forms = M.Map Tag T.Text - -data Tag = A Case Number Gender Animacy Form Degree Extra Obscene - | Adv Degree Extra Obscene - | AdvPron Extra - | Ord Case Number Gender Animacy - | AdjPron Case Number Gender Animacy Extra - | Frag Extra - | Conj Extra - | Inter Extra Obscene - | Num Case Number Gender Animacy Extra - | Part Extra - | Prep Extra - | N Case Number Gender Animacy Name Extra Obscene - | Pron Case Number Gender Animacy Extra - | V Mood Number Gender Tense Person Aspect Voice Trans Extra Obscene - | P Case Number Gender Tense Form Aspect Voice Trans Extra Obscene - deriving (Show, Ord, Eq) - -parseTag :: T.Text -> Tag -parseTag tag = case (T.unpack tag) of { - ('A':c:n:g:a:f:cmp:e:o:[]) -> A (readCase c) (readNumber n) - (readGender g) (readAnimacy a) - (readForm f) (readDegree cmp) - (readExtra e) (readObscene o) ; - ('D':cmp:e:o:[]) -> Adv (readDegree cmp) - (readExtra e) (readObscene o) ; - ('P':e:[]) -> AdvPron (readExtra e) ; - ('Y':c:n:g:a:[]) -> Ord (readCase c) (readNumber n) - (readGender g) (readAnimacy a) ; - ('R':c:n:g:a:e:[]) -> AdjPron (readCase c) (readNumber n) - (readGender g) (readAnimacy a) (readExtra e) ; - ('M':e:[]) -> Frag (readExtra e) ; - ('C':e:[]) -> Conj (readExtra e) ; - ('J':e:o:[]) -> Inter (readExtra e) (readObscene o) ; - ('Z':c:n:g:a:e:[]) -> Num (readCase c) (readNumber n) - (readGender g) (readAnimacy a) (readExtra e) ; - ('T':e:[]) -> Part (readExtra e) ; - ('B':e:[]) -> Prep (readExtra e) ; - ('N':_:c:n:g:a:name:e:o:_:[]) -> N (readCase c) (readNumber n) - (readGender g) (readAnimacy a) - (readName name) - (readExtra e) (readObscene o) ; - ('N':_:c:n:g:a:name:e:o:[]) -> N (readCase c) (readNumber n) - (readGender g) (readAnimacy a) - (readName name) - (readExtra e) (readObscene o) ; - ('E':c:n:g:a:e:[]) -> Pron (readCase c) (readNumber n) - (readGender g) (readAnimacy a) (readExtra e) ; - ('V':m:n:g:t:'P':p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n) - (readGender g) (readTense t) - (readPerson p) (readAspect a) - (readVoice v) (readTrans tr) - (readExtra e) (readObscene o) ; - ('V':m:n:g:t:'0':a:v:tr:e:o:[]) -> V (readMood m) (readNumber n) - (readGender g) (readTense t) - NP (readAspect a) - (readVoice v) (readTrans tr) - (readExtra e) (readObscene o) ; - ('V':m:n:g:t:p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n) - (readGender g) (readTense t) - (readPerson p) (readAspect a) - (readVoice v) (readTrans tr) - (readExtra e) (readObscene o) ; - ('Q':c:n:g:t:f:a:v:tr:e:o:[]) -> P (readCase c) (readNumber n) - (readGender g) (readTense t) - (readForm f) (readAspect a) - (readVoice v) (readTrans tr) - (readExtra e) (readObscene o) ; - _ -> error $ "Parse error: " ++ T.unpack tag } - -data Case = Nom | Gen | Dat | Acc | Inst | Prepos | Partit | Loc | Voc | NC - deriving (Show, Ord, Eq) - -readCase 'N' = Nom -readCase 'G' = Gen -readCase 'D' = Dat -readCase 'F' = Acc -readCase 'C' = Inst -readCase 'O' = Prepos -readCase 'P' = Partit -readCase 'L' = Loc -readCase 'V' = Voc -readCase '0' = NC - -data Number = Sg | Pl | NN deriving (Show, Ord, Eq) - -readNumber 'S' = Sg -readNumber 'P' = Pl -readNumber '0' = NN - -data Gender = Masc | Fem | Neut | Common | NG deriving (Show, Ord, Eq) - -readGender 'F' = Fem -readGender 'M' = Masc -readGender 'A' = Neut -readGender 'C' = Common -readGender '0' = NG - -data Animacy = Animate | Inanimate | NA deriving (Show, Ord, Eq) - -readAnimacy 'A' = Animate -readAnimacy 'I' = Inanimate -readAnimacy '0' = NA - -data Form = Short | Full | NF deriving (Show, Ord, Eq) - -readForm 'S' = Short -readForm 'F' = Full -readForm '0' = NF - -data Degree = Pos | Comp | Super | ND deriving (Show, Ord, Eq) - -readDegree 'E' = Super -readDegree 'C' = Comp -readDegree 'P' = Pos -readDegree '0' = ND - -data Extra = Introductory | Difficult | Distorted | Predicative - | Colloquial | Rare | Abbreviation | Obsolete | NE deriving (Show, Ord, Eq) - -readExtra 'P' = Introductory -readExtra 'D' = Difficult -readExtra 'V' = Distorted -readExtra 'R' = Predicative -readExtra 'I' = Colloquial -readExtra 'A' = Rare -readExtra 'B' = Abbreviation -readExtra 'E' = Obsolete -readExtra '0' = NE - -data Obscene = Obscene | NO deriving (Show, Ord, Eq) - -readObscene 'H' = Obscene -readObscene '0' = NO - -data Name = Topo | Proper | Patro | Family | NNa deriving (Show, Ord, Eq) - -readName 'G' = Topo -readName 'N' = Proper -readName 'S' = Patro -readName 'F' = Family -readName '0' = NNa - -data Mood = Gerund | Inf | Ind | Imp | NM deriving (Show, Ord, Eq) - -readMood 'G' = Gerund -readMood 'I' = Inf -readMood 'D' = Ind -readMood 'M' = Imp -readMood '0' = NM - -data Tense = Pres | Fut | Past | NT deriving (Show, Ord, Eq) - -readTense 'P' = Pres -readTense 'F' = Fut -readTense 'S' = Past -readTense '0' = NT - -data Person = P1 | P2 | P3 | NP deriving (Show, Ord, Eq) - -readPerson '1' = P1 -readPerson '2' = P2 -readPerson '3' = P3 - -data Aspect = Perf | Imperf | NAs deriving (Show, Ord, Eq) - -readAspect 'F' = Perf -readAspect 'N' = Imperf -readAspect '0' = NAs - -data Voice = Act | Pass | NV deriving (Show, Ord, Eq) - -readVoice 'A' = Act -readVoice 'S' = Pass -readVoice '0' = NV - -data Trans = Trans | Intrans | NTr deriving (Show, Ord, Eq) - -readTrans 'M' = Trans -readTrans 'A' = Intrans -readTrans '0' = NTr - -isOpenCat :: Tag -> Bool -isOpenCat (A _ _ _ _ _ _ _ _) = True -isOpenCat (N _ _ _ _ _ _ _) = True -isOpenCat (V _ _ _ _ _ _ _ _ _ _) = True -isOpenCat (Adv _ _ _) = True -isOpenCat _ = False - -noun :: Forms -> (Case, Number) -> T.Text -noun forms (c, n) = findForm matchNoun forms - where matchNoun (N c' n' _ _ _ _ _) = c == c' && n == n' - matchNoun _ = False - -adj :: Forms -> Degree -> T.Text -adj forms d = findForm matchAdj forms - where matchAdj (A _ _ _ _ _ d' _ _) = d == d - matchAdj _ = False - -verbPres :: Forms -> (Number, Person) -> T.Text -verbPres forms (n, p) = findForm matchPres forms - where matchPres (V Ind n' _ Pres p' _ Act _ _ _) = n == n' && p == p' - matchPres _ = False - -verbPast :: Forms -> (Number, Gender) -> T.Text -verbPast forms (n, g) = findForm matchPast forms - where matchPast (V Ind n' g' Past _ _ Act _ _ _) = n == n' && g == g' - matchPast _ = False - -verbImp :: Forms -> T.Text -verbImp forms = findForm matchImp forms - where matchImp (V Imp _ _ _ _ _ _ _ _ _) = True - matchImp _ = False - -verbInf :: Forms -> T.Text -verbInf forms = findForm matchInf forms - where matchInf (V Inf _ _ _ _ _ _ _ _ _) = True - matchInf _ = False - -adv :: Forms -> T.Text -adv forms = findForm matchAdv forms - where matchAdv (Adv d _ _) = d == Pos - matchAdv _ = False - -findForm match forms = case find match (M.keys forms) of - Just tag -> forms M.! tag - Nothing -> findForm (\ _ -> True) forms
\ No newline at end of file diff --git a/contrib/eaglesconv/README b/contrib/eaglesconv/README deleted file mode 100644 index e3c84c61d..000000000 --- a/contrib/eaglesconv/README +++ /dev/null @@ -1,24 +0,0 @@ -How to use: - -1) Sort the wordlist so it can be split into sublists. It is necessary because -the converter is quite memory-hungry, and you might not have enough RAM to -process the whole wordlist at once. - -./CollectLemmas dicc.src | sort > lemmas.src - -2) Split the sorted wordlist. - -split -l 500000 lemmas.src - -3) Splitting has probably left forms of some lemmas spread across two -sublists. Manually edit sublists so all forms for a lemma are present in just -one sublist. - -4) Run the converter. - -./run_conv.sh xa* - -5) The converter has produced abstract and concrete syntaxes for the -dictionary. You can try them out with GF: - -gf DictRus.gf
\ No newline at end of file diff --git a/contrib/eaglesconv/mkAbstract.sh b/contrib/eaglesconv/mkAbstract.sh deleted file mode 100644 index d07da18fc..000000000 --- a/contrib/eaglesconv/mkAbstract.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh - -echo "abstract DictRusAbs = Cat ** { -" -cat $1 | sed 's/^lin/fun/g;s/=.*$//g;s/\_N/\_N : N\;/g;s/\_PN/\_PN : PN\;/g;s/\_A /\_A : A\;/g;s/\_V/\_V : V\;/g;s/\_Adv/\_Adv : Adv\;/g' - -echo " -}"
\ No newline at end of file diff --git a/contrib/eaglesconv/mkConcrete.sh b/contrib/eaglesconv/mkConcrete.sh deleted file mode 100644 index 170ab9c5e..000000000 --- a/contrib/eaglesconv/mkConcrete.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh - -echo "--# -path=.:../prelude:../abstract:../common - -concrete DictRus of DictRusAbs = CatRus ** - open ParadigmsRus, Prelude, StructuralRus, MorphoRus in { -flags - optimize=values ; - coding=utf8 ; -" -cat $1 -echo "}" diff --git a/contrib/eaglesconv/run_conv.sh b/contrib/eaglesconv/run_conv.sh deleted file mode 100644 index 5ad586834..000000000 --- a/contrib/eaglesconv/run_conv.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -./EaglesConv "$@" +RTS -K256M -RTS > convtmp -./mkConcrete.sh convtmp > DictRus.gf -./mkAbstract.sh convtmp > DictRusAbs.gf |
