summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorgdetrez <gdetrez@crans.org>2011-02-10 15:00:06 +0000
committergdetrez <gdetrez@crans.org>2011-02-10 15:00:06 +0000
commit45ecae4b774aee96dcc3e9f2c5f82307982faa08 (patch)
tree21b7ce41a617b24e8b25b92e912789049e7b2def
parentd7ae73f1c7c5b95f1f08dadd314fa7143602b523 (diff)
Adding a basic lexicon-based tokenizer and the associated command in gf shell
-rw-r--r--gf.cabal3
-rw-r--r--src/compiler/GF/Command/Commands.hs30
-rw-r--r--src/runtime/haskell/PGF.hs4
3 files changed, 37 insertions, 0 deletions
diff --git a/gf.cabal b/gf.cabal
index 9d23e0dde..acd1ac2e3 100644
--- a/gf.cabal
+++ b/gf.cabal
@@ -22,6 +22,7 @@ flag interrupt
library
build-depends: base >= 4.2 && <5,
array,
+ fst,
containers,
bytestring,
random,
@@ -42,6 +43,7 @@ library
PGF.Expr
PGF.Type
PGF.Tree
+ PGF.Tokenizer
PGF.Paraphrase
PGF.TypeCheck
PGF.Binary
@@ -72,6 +74,7 @@ executable gf
containers,
bytestring,
filepath,
+ fst,
directory,
random,
old-time,
diff --git a/src/compiler/GF/Command/Commands.hs b/src/compiler/GF/Command/Commands.hs
index 1290666cb..00d8e427a 100644
--- a/src/compiler/GF/Command/Commands.hs
+++ b/src/compiler/GF/Command/Commands.hs
@@ -964,6 +964,13 @@ allCommands env@(pgf, mos) = Map.fromList [
],
flags = [("file","the output filename")]
}),
+  -- Shell command "t" (long name "tokenize"); execution is delegated to
+  -- execToktok, and the only flag it declares is "lang".
+  ("t", emptyCommandInfo {
+     longname = "tokenize",
+     synopsis = "Tokenize string using the vocabulary",
+     exec = execToktok env,
+     options = [],
+     flags = [("lang","The name of the concrete to use")]
+     }),
("ai", emptyCommandInfo {
longname = "abstract_info",
syntax = "ai IDENTIFIER or ai EXPR",
@@ -1251,3 +1258,26 @@ prMorphoAnalysis (w,lps) =
unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps])
+-- | Handler for the shell command registered as "t" (long name "tokenize").
+-- The input is the single string-literal argument; any other argument list
+-- is treated as the empty string.  When a @lang@ flag is found, only the
+-- tokenizer of that language is run; otherwise every tokenizer is run and
+-- the results are concatenated.  The tokens are returned both as string
+-- literal expressions and as text with one token per line.
+execToktok :: PGFEnv -> [Option] -> [Expr] -> IO CommandOutput
+execToktok (pgf, _) opts exprs = return $
+    case requested of
+      Nothing   -> fromTokens (concatMap tokensWith (Map.elems tokenizers))
+      Just lang -> maybe ([], "Unknown language: " ++ show lang)
+                         (fromTokens . tokensWith)
+                         (Map.lookup lang tokenizers)
+  where
+    -- One tokenizer per language of the grammar, keyed by that language.
+    tokenizers = Map.fromList [ (l, mkTokenizer pgf l) | l <- languages pgf ]
+    text = case exprs of
+             [ELit (LStr s)] -> s
+             _               -> ""
+    -- A tokenizer that yields Nothing contributes no tokens.
+    tokensWith tok = maybe [] id (tok text)
+    fromTokens ws = ([ELit (LStr w) | w <- ws], unlines ws)
+    -- The first @lang@ flag carrying a 'VId' value decides the language.
+    -- NOTE(review): a value that 'readLanguage' rejects is indistinguishable
+    -- from a missing flag here, so all tokenizers are run in that case.
+    requested = case [ l | OFlag "lang" (VId l) <- opts ] of
+                  (l:_) -> readLanguage l
+                  []    -> Nothing
diff --git a/src/runtime/haskell/PGF.hs b/src/runtime/haskell/PGF.hs
index 42ef8aaff..8530d9a71 100644
--- a/src/runtime/haskell/PGF.hs
+++ b/src/runtime/haskell/PGF.hs
@@ -109,6 +109,9 @@ module PGF(
-- ** Morphological Analysis
Lemma, Analysis, Morpho,
lookupMorpho, buildMorpho, fullFormLexicon,
+
+ -- ** Tokenizing
+ mkTokenizer,
-- ** Visualizations
graphvizAbstractTree,
@@ -141,6 +144,7 @@ import PGF.Expr (Tree)
import PGF.Morphology
import PGF.Data
import PGF.Binary
+import PGF.Tokenizer
import qualified PGF.Forest as Forest
import qualified PGF.Parse as Parse