diff options
| author | aarne <aarne@cs.chalmers.se> | 2008-05-21 09:26:44 +0000 |
|---|---|---|
| committer | aarne <aarne@cs.chalmers.se> | 2008-05-21 09:26:44 +0000 |
| commit | 055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch) | |
| tree | 0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/Text/Text.hs | |
| parent | 915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff) | |
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
Diffstat (limited to 'src-3.0/GF/Text/Text.hs')
| -rw-r--r-- | src-3.0/GF/Text/Text.hs | 149 |
1 files changed, 149 insertions, 0 deletions
diff --git a/src-3.0/GF/Text/Text.hs b/src-3.0/GF/Text/Text.hs new file mode 100644 index 000000000..b55355c20 --- /dev/null +++ b/src-3.0/GF/Text/Text.hs @@ -0,0 +1,149 @@ +---------------------------------------------------------------------- +-- | +-- Module : Text +-- Maintainer : AR +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/06/23 14:32:44 $ +-- > CVS $Author: aarne $ +-- > CVS $Revision: 1.10 $ +-- +-- elementary text postprocessing. AR 21\/11\/2001. +-- +-- This is very primitive indeed. The functions should work on +-- token lists and not on strings. AR 5\/12\/2002 +-- +-- XML hack 14\/8\/2004; not in use yet +----------------------------------------------------------------------------- + +module GF.Text.Text (untokWithXML, + exceptXML, + formatAsTextLit, + formatAsCodeLit, + formatAsText, + formatAsHTML, + formatAsLatex, + formatAsCode, + performBinds, + performBindsFinnish, + unStringLit, + concatRemSpace + ) where + +import GF.Data.Operations +import Data.Char + +-- | does not apply untokenizer within XML tags --- heuristic "< " +-- this function is applied from top level... +untokWithXML :: (String -> String) -> String -> String +untokWithXML unt s = case s of + '<':cs@(c:_) | isAlpha c -> '<':beg ++ ">" ++ unto (drop 1 rest) where + (beg,rest) = span (/='>') cs + '<':cs -> '<':unto cs --- + [] -> [] + _ -> unt beg ++ unto rest where + (beg,rest) = span (/='<') s + where + unto = untokWithXML unt + +-- | ... whereas this one is embedded on a branch +exceptXML :: (String -> String) -> String -> String +exceptXML unt s = '<':beg ++ ">" ++ unt (drop 1 rest) where + (beg,rest) = span (/='>') s + +formatAsTextLit :: String -> String +formatAsTextLit = formatAsText . unwords . map unStringLit . words +--- hope that there will be deforestation... + +formatAsCodeLit :: String -> String +formatAsCodeLit = formatAsCode . unwords . map unStringLit . words + +formatAsText,formatAsHTML,formatAsLatex :: String -> String +formatAsText = formatAsTextGen (const False) (=="&-") +formatAsHTML = formatAsTextGen (\s -> take 1 s == "<" || last s == '>') (const False) +formatAsLatex = formatAsTextGen ((=="\\") . take 1) (const False) + +formatAsTextGen :: (String -> Bool) -> (String -> Bool) -> String -> String +formatAsTextGen tag para = unwords . format . cap . words where + format ws = case ws of + w : ww | capit w -> format $ (cap ww) + w : c : ww | major c -> format $ (w ++ c) :(cap ww) + w : c : ww | minor c -> format $ (w ++ c) : ww + p : c : ww | openp p -> format $ (p ++ c) :ww + p : c : ww | spanish p -> format $ (p ++ concat (cap [c])) :ww + c : ww | para c -> "\n\n" : format ww + w : ww -> w : format ww + [] -> [] + cap (p:ww) | tag p = p : cap ww + cap ((c:cs):ww) = (toUpper c : cs) : ww + cap [] = [] + capit = (=="&|") + major = flip elem (map singleton ".!?") + minor = flip elem (map singleton ",:;)") + openp = all (flip elem "(") + spanish = all (flip elem "\161\191") + +formatAsCode :: String -> String +formatAsCode = rend 0 . words where + -- render from BNF Converter + rend i ss = case ss of + "[" :ts -> cons "[" $ rend i ts + "(" :ts -> cons "(" $ rend i ts + "{" :ts -> cons "{" $ new (i+1) $ rend (i+1) ts + "}" : ";":ts -> new (i-1) $ space "}" $ cons ";" $ new (i-1) $ rend (i-1) ts + "}" :ts -> new (i-1) $ cons "}" $ new (i-1) $ rend (i-1) ts + ";" :ts -> cons ";" $ new i $ rend i ts + t : "," :ts -> cons t $ space "," $ rend i ts + t : ")" :ts -> cons t $ cons ")" $ rend i ts + t : "]" :ts -> cons t $ cons "]" $ rend i ts + t :ts -> space t $ rend i ts + _ -> "" + cons s t = s ++ t + new i s = '\n' : replicate (2*i) ' ' ++ dropWhile isSpace s + space t s = if null s then t else t ++ " " ++ s + +performBinds :: String -> String +performBinds = performBindsOpt (\x y -> y) + + +-- The function defines an effect of the former on the latter part, +-- such as in vowel harmony. It is triggered by the binder token "&*" + +performBindsOpt :: (String -> String -> String) -> String -> String +performBindsOpt harm = unwords . format . words where + format ws = case ws of + w : "&+" : u : ws -> format ((w ++ u) : ws) + w : "&*" : u : ws -> format ((w ++ harm w u) : ws) + w : ws -> w : format ws + [] -> [] + +-- unlexer for Finnish particles +-- Notice: left associativity crucial for "tie &* ko &* han" --> "tieköhän" + +performBindsFinnish :: String -> String +performBindsFinnish = performBindsOpt vowelHarmony where + vowelHarmony w p = if any (flip elem "aouAOU") w then p else map toFront p + toFront c = case c of + 'A' -> '\196' + 'O' -> '\214' + 'a' -> '\228' + 'o' -> '\246' + _ -> c + +unStringLit :: String -> String +unStringLit s = case s of + c : cs | strlim c && strlim (last cs) -> init cs + _ -> s + where + strlim = (=='\'') + +concatRemSpace :: String -> String +concatRemSpace = concat . words +{- +concatRemSpace s = case s of + '<':cs -> exceptXML concatRemSpace cs + c : cs | isSpace c -> concatRemSpace cs + c :cs -> c : concatRemSpace cs + _ -> s +-} |
