summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authoraarne <aarne@chalmers.se>2010-11-19 16:50:32 +0000
committeraarne <aarne@chalmers.se>2010-11-19 16:50:32 +0000
commit0bf41793694e8b3101d09e34858eba8ab2c8c5b6 (patch)
tree3fbe077d0c55e8c73214a9b2d69ca44c5bbcc962 /src
parent2acb125f6fd05427a95ebe2a1c784e362512cf59 (diff)
tools/Multi, a script for generating multilingual lexicon grammars with the minimum of effort
Diffstat (limited to 'src')
-rw-r--r--src/tools/Koe.multi12
-rw-r--r--src/tools/Multi.hs137
2 files changed, 149 insertions, 0 deletions
diff --git a/src/tools/Koe.multi b/src/tools/Koe.multi
new file mode 100644
index 000000000..d8fed0e53
--- /dev/null
+++ b/src/tools/Koe.multi
@@ -0,0 +1,12 @@
+-- baseline
+
+> langs Eng Swe Fin
+house ; hus ; talo
+car, automobile ; bil ; auto
+man ; man ; mies
+girl ; flicka, tjej ; tyttö
+technical university ; teknisk högskola ; teknillinen korkeakoulu
+
+
+
+
diff --git a/src/tools/Multi.hs b/src/tools/Multi.hs
new file mode 100644
index 000000000..c45f06b41
--- /dev/null
+++ b/src/tools/Multi.hs
@@ -0,0 +1,137 @@
+module Main where
+
+import List
+import Char
+import System
+
+-- quick way of writing a multilingual lexicon and (with some more work) a grammar
+
+usage = "usage: runghc Multi (-pgf)? file"
+
+{-
+-- This multi-line comment is a possible file in the format.
+-- comments are as in GF, one-liners
+
+-- always start by declaring lang names as follows
+> langs Eng Fin Swe
+
+-- baseline rules: semicolon-separated line-by-line entries update abs and cncs, adding to S
+cheers ; skål ; terveydeksi
+
+-- alternatives within a language are comma-separated
+cheers ; skål ; terveydeksi, kippis
+
+-- more advanced: verbatim abstract rules prefixed by "> abs"
+> abs cat Drink ;
+> abs fun drink : Drink -> S ;
+
+-- verbatim concrete rules prefixed by ">" and comma-separated language list
+> Eng,Swe lin Gin = "gin" ;
+
+-}
+
+
+main = do
+ xx <- getArgs
+ if null xx putStrLn usage else do
+ let (opts,file) = (init xx, last xx)
+ src <- readFile file
+ let multi = getMulti (takeWhile (/='.') file) src
+ absn = absName multi
+ cncns = cncNames multi
+ writeFile (gfFile absn) (absCode multi)
+ mapM_ (uncurry writeFile)
+ [(gfFile cncn, cncCode absn cncn cod) |
+ cncn <- cncNames multi, let cod = [r | (la,r) <- cncRules multi, la == cncn]]
+ putStrLn $ "wrote " ++ unwords (map gfFile (absn:cncns))
+ if elem "-pgf" xx
+ then do
+ system ("gf -make -s -optimize-pgf " ++ unwords (map gfFile cncns))
+ putStrLn $ "wrote " ++ absn ++ ".pgf"
+ else return ()
+
+data Multi = Multi {
+ absName :: String,
+ cncNames :: [String],
+ startCat :: String,
+ absRules :: [String],
+ cncRules :: [(String,String)] -- lang,lin
+ }
+
+emptyMulti :: Multi
+emptyMulti = Multi {
+ absName = "Abs",
+ cncNames = [],
+ startCat = "S",
+ absRules = [],
+ cncRules = []
+ }
+
+absCode :: Multi -> String
+absCode multi = unlines $ header : start ++ (reverse (absRules multi)) ++ ["}"] where
+ header = "abstract " ++ absName multi ++ " = {"
+ start = ["flags startcat = " ++ cat ++ " ;", "cat " ++ cat ++ " ;"]
+ cat = startCat multi
+
+cncCode :: String -> String -> [String] -> String
+cncCode ab cnc rules = unlines $ header : (reverse rules ++ ["}"]) where
+ header = "concrete " ++ cnc ++ " of " ++ ab ++ " = {"
+
+getMulti :: String -> String -> Multi
+getMulti m s = foldl (flip addMulti) (emptyMulti{absName = m}) (lines s)
+
+addMulti :: String -> Multi -> Multi
+addMulti line multi = case line of
+ '-':'-':_ -> multi
+ _ | all isSpace line -> multi
+ '>':s -> case words s of
+ "langs":ws -> let las = [absName multi ++ w | w <- ws] in multi {
+ cncNames = las,
+ cncRules = concat [[(la,"lincat " ++ startCat multi ++ " = Str ;"),
+ (la,"flags coding = utf8 ;")] | la <- las]
+ }
+ "startcat":c:ws -> multi {startCat = c}
+ "abs":ws -> multi {
+ absRules = unwords ws : absRules multi
+ }
+ langs:ws -> multi {
+ cncRules = [(absName multi ++ la, unwords ws) | la <- chop ',' langs] ++ cncRules multi
+ }
+ _ -> let (cat,fun,lins) = getRules (startCat multi) line
+ in multi {
+ absRules = ("fun " ++ fun ++ " : " ++ cat ++ " ;") : absRules multi,
+ cncRules = zip (cncNames multi) lins ++ cncRules multi
+ }
+
+getRules :: String -> String -> (String,String,[String])
+getRules cat line = (cat, fun, map lin rss) where
+ rss = map (map unspace . chop ',') $ chop ';' line
+ fun = map idChar (head (head rss)) ++ "_" ++ cat
+ lin rs = "lin " ++ fun ++ " = " ++ unwords (intersperse "|" (map quote rs)) ++ " ;"
+
+chop :: Eq c => c -> [c] -> [[c]]
+chop c cs = case break (==c) cs of
+ (w,_:cs2) -> w : chop c cs2
+ ([],[]) -> []
+ (w,_) -> [w]
+
+-- remove spaces from beginning and end, leave them in the middle
+unspace :: String -> String
+unspace = unwords . words
+
+quote :: String -> String
+quote r = "\"" ++ r ++ "\""
+
+-- to guarantee that the char can be used in an ident
+idChar :: Char -> Char
+idChar c =
+ if (n > 47 && n < 58) || (n > 64 && n < 91) || (n > 96 && n < 123)
+ then c
+ else '_'
+ where n = fromEnum c
+
+
+gfFile :: FilePath -> FilePath
+gfFile f = f ++ ".gf"
+
+