summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authoraarne <aarne@chalmers.se>2011-02-06 16:19:24 +0000
committeraarne <aarne@chalmers.se>2011-02-06 16:19:24 +0000
commit7445e5638713276b471a0c8d5a53940704e8239d (patch)
tree2797d77952099e99da827824afdbf2c72b59b7b7 /src
parent878fefc6f5d2c706faafb218a99432e896109d1f (diff)
a simple clitic analysis command 'ca'
Diffstat (limited to 'src')
-rw-r--r--src/compiler/GF/Command/Commands.hs37
-rw-r--r--src/compiler/GF/Text/Clitics.hs40
-rw-r--r--src/runtime/haskell/PGF/Morphology.hs5
3 files changed, 81 insertions, 1 deletions
diff --git a/src/compiler/GF/Command/Commands.hs b/src/compiler/GF/Command/Commands.hs
index 22485e94b..d4e8e406b 100644
--- a/src/compiler/GF/Command/Commands.hs
+++ b/src/compiler/GF/Command/Commands.hs
@@ -33,6 +33,7 @@ import GF.Data.ErrM ----
import GF.Command.Abstract
import GF.Command.Messages
import GF.Text.Lexing
+import GF.Text.Clitics
import GF.Text.Transliterations
import GF.Quiz
@@ -200,6 +201,38 @@ allCommands env@(pgf, mos) = Map.fromList [
]
}),
+ ("ca", emptyCommandInfo {
+ longname = "clitic_analyse",
+ synopsis = "print the analyses of all words into stems and clitics",
+ explanation = unlines [
+ "Analyses all words into all possible combinations of stem + clitics.",
+ "The analysis is returned in the format stem &+ clitic1 &+ clitic2 ...",
+ "which is hence the inverse of 'pt -bind'. The list of clitics is give",
+ "by the flag '-clitics'. The list of stems is given as the list of words",
+ "of the language given by the '-lang' flag."
+ ],
+ exec = \opts -> case opts of
+ _ | isOpt "raw" opts ->
+ return . fromString .
+ unlines . map (unwords . map (concat . intersperse "+")) .
+ map (getClitics (isInMorpho (optMorpho opts)) (optClitics opts)) .
+ concatMap words . toStrings
+ _ ->
+ return . fromStrings .
+ getCliticsText (isInMorpho (optMorpho opts)) (optClitics opts) .
+ concatMap words . toStrings,
+ flags = [
+ ("clitics","the list of possible clitics (comma-separated, no spaces)"),
+ ("lang", "the language of analysis")
+ ],
+ options = [
+ ("raw", "analyse each word separately (not suitable input for parser)")
+ ],
+ examples = [
+ "ca -lang=Fin -clitics=ko,ni \"nukkuuko minun vaimoni\" | p -- to parse Finnish"
+ ]
+ }),
+
("cc", emptyCommandInfo {
longname = "compute_concrete",
syntax = "cc (-all | -table | -unqual)? TERM",
@@ -1113,6 +1146,10 @@ allCommands env@(pgf, mos) = Map.fromList [
optMorpho opts = morpho (error "no morpho") id (head (optLangs opts))
+ optClitics opts = case valStrOpts "clitics" "" opts of
+ "" -> []
+ cs -> map reverse $ chunks ',' cs
+
mexp xs = case xs of
t:_ -> Just t
_ -> Nothing
diff --git a/src/compiler/GF/Text/Clitics.hs b/src/compiler/GF/Text/Clitics.hs
new file mode 100644
index 000000000..849deb94e
--- /dev/null
+++ b/src/compiler/GF/Text/Clitics.hs
@@ -0,0 +1,40 @@
+module GF.Text.Clitics (getClitics,getCliticsText) where
+
+import Data.List
+
+-- AR 6/2/2011
+-- Analyse word as stem+clitic whenever
+-- (1) clitic is in clitic list
+-- (2) either
+-- (a) stem is in Lexicon
+-- (b) stem can be analysed as stem0+clitic0
+--
+-- Examples:
+-- Italian amarmi = amar+mi
+-- Finnish autossanikohan = autossa+ni+kohan
+--
+-- The analysis gives all results, including the case where the whole word is in Lexicon.
+--
+-- The clitics in the list are expected to be reversed.
+
+getClitics :: (String -> Bool) -> [String] -> String -> [[String]]
+getClitics isLex rclitics = map (reverse . map reverse) . clits . reverse where
+ clits rword = ifLex rword [rclit:more |
+ rclit <- rclitics, stem <- splits rclit rword, more <- clits stem]
+ splits c = maybe [] return . stripPrefix c
+
+ ifLex w ws = if isLex (reverse w) then [w] : ws else ws
+
+
+getCliticsText :: (String -> Bool) -> [String] -> [String] -> [String]
+getCliticsText isLex rclitics =
+ map unwords . sequence . map (map render . getClitics isLex rclitics)
+ where
+ render = unwords . intersperse "&+"
+
+
+-- example
+
+getClitics1 = getClitics exlex1 exclits1
+exlex1 = flip elem ["auto", "naise", "rahan","maa","maahan","maahankaan"]
+exclits1 = map reverse ["ni","ko","han","pas","nsa","kin","kaan"]
diff --git a/src/runtime/haskell/PGF/Morphology.hs b/src/runtime/haskell/PGF/Morphology.hs
index d5a2d28bc..70fa70458 100644
--- a/src/runtime/haskell/PGF/Morphology.hs
+++ b/src/runtime/haskell/PGF/Morphology.hs
@@ -1,5 +1,5 @@
module PGF.Morphology(Lemma,Analysis,Morpho,
- buildMorpho,
+ buildMorpho,isInMorpho,
lookupMorpho,fullFormLexicon,
morphoMissing,missingWordMsg) where
@@ -42,6 +42,9 @@ collectWords pinfo = Map.fromListWith (++)
lookupMorpho :: Morpho -> String -> [(Lemma,Analysis)]
lookupMorpho (Morpho mo) s = maybe [] id $ Map.lookup s mo
+isInMorpho :: Morpho -> String -> Bool
+isInMorpho (Morpho mo) s = maybe False (const True) $ Map.lookup s mo
+
fullFormLexicon :: Morpho -> [(String,[(Lemma,Analysis)])]
fullFormLexicon (Morpho mo) = Map.toList mo