summaryrefslogtreecommitdiff
path: root/src-3.0/GF/Text/Text.hs
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
committeraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
commit055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch)
tree0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/Text/Text.hs
parent915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff)
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
Diffstat (limited to 'src-3.0/GF/Text/Text.hs')
-rw-r--r--src-3.0/GF/Text/Text.hs149
1 files changed, 149 insertions, 0 deletions
diff --git a/src-3.0/GF/Text/Text.hs b/src-3.0/GF/Text/Text.hs
new file mode 100644
index 000000000..b55355c20
--- /dev/null
+++ b/src-3.0/GF/Text/Text.hs
@@ -0,0 +1,149 @@
+----------------------------------------------------------------------
+-- |
+-- Module : Text
+-- Maintainer : AR
+-- Stability : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/06/23 14:32:44 $
+-- > CVS $Author: aarne $
+-- > CVS $Revision: 1.10 $
+--
+-- elementary text postprocessing. AR 21\/11\/2001.
+--
+-- This is very primitive indeed. The functions should work on
+-- token lists and not on strings. AR 5\/12\/2002
+--
+-- XML hack 14\/8\/2004; not in use yet
+-----------------------------------------------------------------------------
+
+module GF.Text.Text (untokWithXML,
+ exceptXML,
+ formatAsTextLit,
+ formatAsCodeLit,
+ formatAsText,
+ formatAsHTML,
+ formatAsLatex,
+ formatAsCode,
+ performBinds,
+ performBindsFinnish,
+ unStringLit,
+ concatRemSpace
+ ) where
+
+import GF.Data.Operations
+import Data.Char
+
+-- | does not apply untokenizer within XML tags --- heuristic "< "
+-- this function is applied from top level...
+untokWithXML :: (String -> String) -> String -> String
+untokWithXML unt s = case s of
+ '<':cs@(c:_) | isAlpha c -> '<':beg ++ ">" ++ unto (drop 1 rest) where
+ (beg,rest) = span (/='>') cs
+ '<':cs -> '<':unto cs ---
+ [] -> []
+ _ -> unt beg ++ unto rest where
+ (beg,rest) = span (/='<') s
+ where
+ unto = untokWithXML unt
+
+-- | ... whereas this one is embedded on a branch
+exceptXML :: (String -> String) -> String -> String
+exceptXML unt s = '<':beg ++ ">" ++ unt (drop 1 rest) where
+ (beg,rest) = span (/='>') s
+
+formatAsTextLit :: String -> String
+formatAsTextLit = formatAsText . unwords . map unStringLit . words
+--- hope that there will be deforestation...
+
+formatAsCodeLit :: String -> String
+formatAsCodeLit = formatAsCode . unwords . map unStringLit . words
+
+formatAsText,formatAsHTML,formatAsLatex :: String -> String
+formatAsText = formatAsTextGen (const False) (=="&-")
+formatAsHTML = formatAsTextGen (\s -> take 1 s == "<" || last s == '>') (const False)
+formatAsLatex = formatAsTextGen ((=="\\") . take 1) (const False)
+
+formatAsTextGen :: (String -> Bool) -> (String -> Bool) -> String -> String
+formatAsTextGen tag para = unwords . format . cap . words where
+ format ws = case ws of
+ w : ww | capit w -> format $ (cap ww)
+ w : c : ww | major c -> format $ (w ++ c) :(cap ww)
+ w : c : ww | minor c -> format $ (w ++ c) : ww
+ p : c : ww | openp p -> format $ (p ++ c) :ww
+ p : c : ww | spanish p -> format $ (p ++ concat (cap [c])) :ww
+ c : ww | para c -> "\n\n" : format ww
+ w : ww -> w : format ww
+ [] -> []
+ cap (p:ww) | tag p = p : cap ww
+ cap ((c:cs):ww) = (toUpper c : cs) : ww
+ cap [] = []
+ capit = (=="&|")
+ major = flip elem (map singleton ".!?")
+ minor = flip elem (map singleton ",:;)")
+ openp = all (flip elem "(")
+ spanish = all (flip elem "\161\191")
+
+formatAsCode :: String -> String
+formatAsCode = rend 0 . words where
+ -- render from BNF Converter
+ rend i ss = case ss of
+ "[" :ts -> cons "[" $ rend i ts
+ "(" :ts -> cons "(" $ rend i ts
+ "{" :ts -> cons "{" $ new (i+1) $ rend (i+1) ts
+ "}" : ";":ts -> new (i-1) $ space "}" $ cons ";" $ new (i-1) $ rend (i-1) ts
+ "}" :ts -> new (i-1) $ cons "}" $ new (i-1) $ rend (i-1) ts
+ ";" :ts -> cons ";" $ new i $ rend i ts
+ t : "," :ts -> cons t $ space "," $ rend i ts
+ t : ")" :ts -> cons t $ cons ")" $ rend i ts
+ t : "]" :ts -> cons t $ cons "]" $ rend i ts
+ t :ts -> space t $ rend i ts
+ _ -> ""
+ cons s t = s ++ t
+ new i s = '\n' : replicate (2*i) ' ' ++ dropWhile isSpace s
+ space t s = if null s then t else t ++ " " ++ s
+
+performBinds :: String -> String
+performBinds = performBindsOpt (\x y -> y)
+
+
+-- The function defines an effect of the former on the latter part,
+-- such as in vowel harmony. It is triggered by the binder token "&*"
+
+performBindsOpt :: (String -> String -> String) -> String -> String
+performBindsOpt harm = unwords . format . words where
+ format ws = case ws of
+ w : "&+" : u : ws -> format ((w ++ u) : ws)
+ w : "&*" : u : ws -> format ((w ++ harm w u) : ws)
+ w : ws -> w : format ws
+ [] -> []
+
+-- unlexer for Finnish particles
+-- Notice: left associativity crucial for "tie &* ko &* han" --> "tieköhän"
+
+performBindsFinnish :: String -> String
+performBindsFinnish = performBindsOpt vowelHarmony where
+ vowelHarmony w p = if any (flip elem "aouAOU") w then p else map toFront p
+ toFront c = case c of
+ 'A' -> '\196'
+ 'O' -> '\214'
+ 'a' -> '\228'
+ 'o' -> '\246'
+ _ -> c
+
+unStringLit :: String -> String
+unStringLit s = case s of
+ c : cs | strlim c && strlim (last cs) -> init cs
+ _ -> s
+ where
+ strlim = (=='\'')
+
+concatRemSpace :: String -> String
+concatRemSpace = concat . words
+{-
+concatRemSpace s = case s of
+ '<':cs -> exceptXML concatRemSpace cs
+ c : cs | isSpace c -> concatRemSpace cs
+ c :cs -> c : concatRemSpace cs
+ _ -> s
+-}