diff options
| author | aarne <aarne@chalmers.se> | 2011-02-06 16:19:24 +0000 |
|---|---|---|
| committer | aarne <aarne@chalmers.se> | 2011-02-06 16:19:24 +0000 |
| commit | 7445e5638713276b471a0c8d5a53940704e8239d (patch) | |
| tree | 2797d77952099e99da827824afdbf2c72b59b7b7 /src/compiler/GF/Text | |
| parent | 878fefc6f5d2c706faafb218a99432e896109d1f (diff) | |
a simple clitic analysis command 'ca'
Diffstat (limited to 'src/compiler/GF/Text')
| -rw-r--r-- | src/compiler/GF/Text/Clitics.hs | 40 |
1 files changed, 40 insertions, 0 deletions
diff --git a/src/compiler/GF/Text/Clitics.hs b/src/compiler/GF/Text/Clitics.hs new file mode 100644 index 000000000..849deb94e --- /dev/null +++ b/src/compiler/GF/Text/Clitics.hs @@ -0,0 +1,40 @@ +module GF.Text.Clitics (getClitics,getCliticsText) where + +import Data.List + +-- AR 6/2/2011 +-- Analyse word as stem+clitic whenever +-- (1) clitic is in clitic list +-- (2) either +-- (a) stem is in Lexicon +-- (b) stem can be analysed as stem0+clitic0 +-- +-- Examples: +-- Italian amarmi = amar+mi +-- Finnish autossanikohan = autossa+ni+kohan +-- +-- The analysis gives all results, including the case where the whole word is in Lexicon. +-- +-- The clitics in the list are expected to be reversed. + +getClitics :: (String -> Bool) -> [String] -> String -> [[String]] +getClitics isLex rclitics = map (reverse . map reverse) . clits . reverse where + clits rword = ifLex rword [rclit:more | + rclit <- rclitics, stem <- splits rclit rword, more <- clits stem] + splits c = maybe [] return . stripPrefix c + + ifLex w ws = if isLex (reverse w) then [w] : ws else ws + + +getCliticsText :: (String -> Bool) -> [String] -> [String] -> [String] +getCliticsText isLex rclitics = + map unwords . sequence . map (map render . getClitics isLex rclitics) + where + render = unwords . intersperse "&+" + + +-- example + +getClitics1 = getClitics exlex1 exclits1 +exlex1 = flip elem ["auto", "naise", "rahan","maa","maahan","maahankaan"] +exclits1 = map reverse ["ni","ko","han","pas","nsa","kin","kaan"] |
