GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3

author: aarne <aarne@cs.chalmers.se> 2008-05-21 09:26:44 +0000
committer: aarne <aarne@cs.chalmers.se> 2008-05-21 09:26:44 +0000
commit: 055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch)
tree: 0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/Text/Text.hs
parent: 915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff)
1 files changed, 149 insertions, 0 deletions
diff --git a/src-3.0/GF/Text/Text.hs b/src-3.0/GF/Text/Text.hs
new file mode 100644
index 000000000..b55355c20
--- /dev/null
+++ b/src-3.0/GF/Text/Text.hs
@@ -0,0 +1,149 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : Text
+-- Maintainer  : AR
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/06/23 14:32:44 $ 
+-- > CVS $Author: aarne $
+-- > CVS $Revision: 1.10 $
+--
+-- elementary text postprocessing. AR 21\/11\/2001.
+--
+-- This is very primitive indeed. The functions should work on
+-- token lists and not on strings. AR 5\/12\/2002
+--
+-- XML hack 14\/8\/2004; not in use yet
+-----------------------------------------------------------------------------
+
+module GF.Text.Text (untokWithXML,
+	     exceptXML,
+	     formatAsTextLit,
+	     formatAsCodeLit,
+	     formatAsText,
+	     formatAsHTML,
+	     formatAsLatex,
+	     formatAsCode,
+	     performBinds,
+	     performBindsFinnish,
+	     unStringLit,
+	     concatRemSpace
+	    ) where
+
+import GF.Data.Operations
+import Data.Char
+
+-- | does not apply untokenizer within XML tags --- heuristic "< "
+-- this function is applied from top level...
+untokWithXML :: (String -> String) -> String -> String
+untokWithXML unt s = case s of
+  '<':cs@(c:_) | isAlpha c -> '<':beg ++ ">" ++ unto (drop 1 rest) where 
+                  (beg,rest) = span (/='>') cs
+  '<':cs -> '<':unto cs ---
+  [] -> []
+  _ -> unt beg ++ unto rest where
+               (beg,rest) = span (/='<') s
+ where
+   unto = untokWithXML unt
+
+-- | ... whereas this one is embedded on a branch
+exceptXML :: (String -> String) -> String -> String
+exceptXML unt s = '<':beg ++ ">" ++ unt (drop 1 rest) where 
+  (beg,rest) = span (/='>') s
+
+formatAsTextLit :: String -> String
+formatAsTextLit = formatAsText . unwords . map unStringLit . words 
+--- hope that there will be deforestation...
+
+formatAsCodeLit :: String -> String
+formatAsCodeLit = formatAsCode . unwords . map unStringLit . words 
+
+formatAsText,formatAsHTML,formatAsLatex :: String -> String
+formatAsText  = formatAsTextGen (const False) (=="&-") 
+formatAsHTML  = formatAsTextGen (\s -> take 1 s == "<" || last s == '>') (const False) 
+formatAsLatex = formatAsTextGen ((=="\\") . take 1)  (const False) 
+
+formatAsTextGen :: (String -> Bool) -> (String -> Bool) -> String -> String
+formatAsTextGen tag para = unwords . format . cap . words where
+  format ws = case ws of
+    w :     ww | capit w -> format $ (cap ww)
+    w : c : ww | major c -> format $ (w ++ c) :(cap ww)
+    w : c : ww | minor c -> format $ (w ++ c) :    ww
+    p : c : ww | openp p -> format $ (p ++ c) :ww
+    p : c : ww | spanish p -> format $ (p ++ concat (cap [c])) :ww
+    c     : ww | para  c -> "\n\n"        : format ww
+    w     : ww           -> w             : format ww
+    [] -> []
+  cap (p:ww) | tag p = p : cap ww
+  cap ((c:cs):ww) = (toUpper c : cs) : ww
+  cap [] = []
+  capit = (=="&|")
+  major = flip elem (map singleton ".!?") 
+  minor = flip elem (map singleton ",:;)")
+  openp = all (flip elem "(")
+  spanish = all (flip elem "\161\191")
+
+formatAsCode :: String -> String
+formatAsCode = rend 0 . words where
+  -- render from BNF Converter
+  rend i ss = case ss of
+    "["      :ts -> cons "["  $ rend i ts
+    "("      :ts -> cons "("  $ rend i ts
+    "{"      :ts -> cons "{"  $ new (i+1) $ rend (i+1) ts
+    "}" : ";":ts -> new (i-1) $ space "}" $ cons ";" $ new (i-1) $ rend (i-1) ts
+    "}"      :ts -> new (i-1) $ cons "}" $ new (i-1) $ rend (i-1) ts
+    ";"      :ts -> cons ";"  $ new i $ rend i ts
+    t  : "," :ts -> cons t    $ space "," $ rend i ts
+    t  : ")" :ts -> cons t    $ cons ")"  $ rend i ts
+    t  : "]" :ts -> cons t    $ cons "]"  $ rend i ts
+    t        :ts -> space t   $ rend i ts
+    _            -> ""
+  cons s t  = s ++ t
+  new i s   = '\n' : replicate (2*i) ' ' ++ dropWhile isSpace s
+  space t s = if null s then t else t ++ " " ++ s
+
+performBinds :: String -> String
+performBinds = performBindsOpt (\x y -> y) 
+
+
+-- The function defines an effect of the former on the latter part,
+-- such as in vowel harmony. It is triggered by the binder token "&*"
+
+performBindsOpt :: (String -> String -> String) -> String -> String
+performBindsOpt harm = unwords . format . words where
+  format ws = case ws of
+    w : "&+" : u : ws -> format ((w ++        u) : ws)
+    w : "&*" : u : ws -> format ((w ++ harm w u) : ws)
+    w : ws            -> w : format ws
+    []                -> []
+
+-- unlexer for Finnish particles
+-- Notice: left associativity crucial for "tie &* ko &* han" --> "tieköhän"
+
+performBindsFinnish :: String -> String
+performBindsFinnish = performBindsOpt vowelHarmony where
+  vowelHarmony w p = if any (flip elem "aouAOU") w then p else map toFront p
+  toFront c = case c of
+    'A' -> '\196'
+    'O' -> '\214'
+    'a' -> '\228'
+    'o' -> '\246'
+    _ -> c
+
+unStringLit :: String -> String
+unStringLit s = case s of
+  c : cs | strlim c && strlim (last cs) -> init cs
+  _ -> s
+ where
+   strlim = (=='\'')
+
+concatRemSpace :: String -> String
+concatRemSpace = concat . words
+{-
+concatRemSpace s = case s of
+  '<':cs -> exceptXML concatRemSpace cs
+  c : cs | isSpace c -> concatRemSpace cs
+  c :cs -> c : concatRemSpace cs
+  _ -> s
+-}
author	aarne <aarne@cs.chalmers.se>	2008-05-21 09:26:44 +0000
committer	aarne <aarne@cs.chalmers.se>	2008-05-21 09:26:44 +0000
commit	055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch)
tree	0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/Text/Text.hs
parent	915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff)