summaryrefslogtreecommitdiff
path: root/src-3.0/GF/UseGrammar/Tokenize.hs
diff options
context:
space:
mode:
authoraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
committeraarne <aarne@cs.chalmers.se>2008-05-21 09:26:44 +0000
commit055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch)
tree0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/UseGrammar/Tokenize.hs
parent915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff)
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
Diffstat (limited to 'src-3.0/GF/UseGrammar/Tokenize.hs')
-rw-r--r--src-3.0/GF/UseGrammar/Tokenize.hs222
1 files changed, 222 insertions, 0 deletions
diff --git a/src-3.0/GF/UseGrammar/Tokenize.hs b/src-3.0/GF/UseGrammar/Tokenize.hs
new file mode 100644
index 000000000..9f1ab5449
--- /dev/null
+++ b/src-3.0/GF/UseGrammar/Tokenize.hs
@@ -0,0 +1,222 @@
+----------------------------------------------------------------------
+-- |
+-- Module : Tokenize
+-- Maintainer : AR
+-- Stability : (stable)
+-- Portability : (portable)
+--
+-- > CVS $Date: 2005/09/29 13:20:08 $
+-- > CVS $Author: aarne $
+-- > CVS $Revision: 1.14 $
+--
+-- lexers = tokenizers, to prepare input for GF grammars. AR 4\/1\/2002.
+-- an entry for each is included in 'Custom.customTokenizer'
+-----------------------------------------------------------------------------
+
+module GF.UseGrammar.Tokenize ( tokWords,
+ tokLits,
+ tokVars,
+ lexHaskell,
+ lexHaskellLiteral,
+ lexHaskellVar,
+ lexText,
+ lexTextVar,
+ lexC2M, lexC2M',
+ lexTextLiteral,
+ lexIgnore,
+ wordsLits
+ ) where
+
+import GF.Data.Operations
+---- import UseGrammar (isLiteral,identC)
+import GF.CF.CFIdent
+
+import Data.Char
+
+-- lexers = tokenizers, to prepare input for GF grammars. AR 4/1/2002
+-- an entry for each is included in Custom.customTokenizer
+
+-- | just words
+tokWords :: String -> [CFTok]
+tokWords = map tS . words
+
+tokLits :: String -> [CFTok]
+tokLits = map mkCFTok . mergeStr . wordsLits where
+ mergeStr ss = case ss of
+ w@(c:cs):rest | elem c "\'\"" && c /= last w -> getStr [w] rest
+ w :rest -> w : mergeStr rest
+ [] -> []
+ getStr v ss = case ss of
+ w@(_:_):rest | elem (last w) "\'\"" -> (unwords (reverse (w:v))) : mergeStr rest
+ w :rest -> getStr (w:v) rest
+ [] -> reverse v
+
+tokVars :: String -> [CFTok]
+tokVars = map mkCFTokVar . wordsLits
+
+isFloat s = case s of
+ c:cs | isDigit c -> isFloat cs
+ '.':cs@(_:_) -> all isDigit cs
+ _ -> False
+
+isString s = case s of
+ c:cs@(_:_) -> (c == '\'' && d == '\'') || (c == '"' && d == '"') where d = last cs
+ _ -> False
+
+
+mkCFTok :: String -> CFTok
+mkCFTok s = case s of
+ '"' :cs@(_:_) | last cs == '"' -> tL $ init cs
+ '\'':cs@(_:_) | last cs == '\'' -> tL $ init cs --- 's Gravenhage
+ _:_ | isFloat s -> tF s
+ _:_ | all isDigit s -> tI s
+ _ -> tS s
+
+mkCFTokVar :: String -> CFTok
+mkCFTokVar s = case s of
+ '?':_:_ -> tM s --- "?" --- compat with prCF
+ 'x':'_':_ -> tV s
+ 'x':[] -> tV s
+ '$':xs@(_:_) -> if last s == '$' then tV (init xs) else tS s
+ _ -> tS s
+
+mkTokVars :: (String -> [CFTok]) -> String -> [CFTok]
+mkTokVars tok = map tv . tok where
+ tv (TS s) = mkCFTokVar s
+ tv t = t
+
+mkLit :: String -> CFTok
+mkLit s
+ | isFloat s = tF s
+ | all isDigit s = tI s
+ | otherwise = tL s
+
+-- obsolete
+mkTL :: String -> CFTok
+mkTL s
+ | isFloat s = tF s
+ | all isDigit s = tI s
+ | otherwise = tL ("'" ++ s ++ "'")
+
+
+-- | Haskell lexer, usable for much code
+lexHaskell :: String -> [CFTok]
+lexHaskell ss = case lex ss of
+ [(w@(_:_),ws)] -> tS w : lexHaskell ws
+ _ -> []
+
+-- | somewhat shaky text lexer
+lexText :: String -> [CFTok]
+lexText = uncap . lx where
+
+ lx s = case s of
+ '?':'?':cs -> tS "??" : lx cs
+ p : cs | isMPunct p -> tS [p] : uncap (lx cs)
+ p : cs | isPunct p -> tS [p] : lx cs
+ s : cs | isSpace s -> lx cs
+ _ : _ -> getWord s
+ _ -> []
+
+ getWord s = tS w : lx ws where (w,ws) = span isNotSpec s
+ isMPunct c = elem c ".!?"
+ isPunct c = elem c ",:;()\""
+ isNotSpec c = not (isMPunct c || isPunct c || isSpace c)
+ uncap (TS (c:cs) : ws) = tC (c:cs) : ws
+ uncap s = s
+
+-- | lexer for C--, a mini variant of C
+lexC2M :: String -> [CFTok]
+lexC2M = lexC2M' False
+
+lexC2M' :: Bool -> String -> [CFTok]
+lexC2M' isHigherOrder s = case s of
+ '#':cs -> lexC $ dropWhile (/='\n') cs
+ '/':'*':cs -> lexC $ dropComment cs
+ c:cs | isSpace c -> lexC cs
+ c:cs | isAlpha c -> getId s
+ c:cs | isDigit c -> getLit s
+ c:d:cs | isSymb [c,d] -> tS [c,d] : lexC cs
+ c:cs | isSymb [c] -> tS [c] : lexC cs
+ _ -> [] --- covers end of file and unknown characters
+ where
+ lexC = lexC2M' isHigherOrder
+ getId s = mkT i : lexC cs where (i,cs) = span isIdChar s
+ getLit s = tI i : lexC cs where (i,cs) = span isDigit s ---- Float!
+ isIdChar c = isAlpha c || isDigit c || elem c "'_"
+ isSymb = reservedAnsiCSymbol
+ dropComment s = case s of
+ '*':'/':cs -> cs
+ _:cs -> dropComment cs
+ _ -> []
+ mkT i = if (isRes i) then (tS i) else
+ if isHigherOrder then (tV i) else (tL ("'" ++ i ++ "'"))
+ isRes = reservedAnsiC
+
+
+reservedAnsiCSymbol s = case lookupTree show s ansiCtree of
+ Ok True -> True
+ _ -> False
+
+reservedAnsiC s = case lookupTree show s ansiCtree of
+ Ok False -> True
+ _ -> False
+
+-- | for an efficient lexer: precompile this!
+ansiCtree = buildTree $ [(s,True) | s <- reservedAnsiCSymbols] ++
+ [(s,False) | s <- reservedAnsiCWords]
+
+reservedAnsiCSymbols = words $
+ "<<= >>= << >> ++ -- == <= >= *= += -= %= /= &= ^= |= " ++
+ "^ { } = , ; + * - ( ) < > & % ! ~"
+
+reservedAnsiCWords = words $
+ "auto break case char const continue default " ++
+ "do double else enum extern float for goto if int " ++
+ "long register return short signed sizeof static struct switch typedef " ++
+ "union unsigned void volatile while " ++
+ "main printin putchar" --- these are not ansi-C
+
+-- | turn unknown tokens into string literals; not recursively for literals 123, 'foo'
+unknown2string :: (String -> Bool) -> [CFTok] -> [CFTok]
+unknown2string isKnown = map mkOne where
+ mkOne t@(TS s)
+ | isKnown s = t
+ | isFloat s = tF s
+ | all isDigit s = tI s
+ | otherwise = tL s
+ mkOne t@(TC s) = if isKnown s then t else mkLit s
+ mkOne t = t
+
+unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok]
+unknown2var isKnown = map mkOne where
+ mkOne t@(TS "??") = if isKnown "??" then t else tM "??"
+ mkOne t@(TS s)
+ | isKnown s = t
+ | isFloat s = tF s
+ | isString s = tL (init (tail s))
+ | all isDigit s = tI s
+ | otherwise = tV s
+ mkOne t@(TC s) = if isKnown s then t else tV s
+ mkOne t = t
+
+lexTextLiteral, lexHaskellLiteral, lexHaskellVar :: (String -> Bool) -> String -> [CFTok]
+
+lexTextLiteral isKnown = unknown2string (eitherUpper isKnown) . lexText
+lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell
+
+lexHaskellVar isKnown = unknown2var isKnown . lexHaskell
+lexTextVar isKnown = unknown2var (eitherUpper isKnown) . lexText
+
+
+eitherUpper isKnown w@(c:cs) = isKnown (toLower c : cs) || isKnown (toUpper c : cs)
+eitherUpper isKnown w = isKnown w
+
+-- ignore unknown tokens (e.g. keyword spotting)
+
+lexIgnore :: (String -> Bool) -> [CFTok] -> [CFTok]
+lexIgnore isKnown = concatMap mkOne where
+ mkOne t@(TS s)
+ | isKnown s = [t]
+ | otherwise = []
+ mkOne t = [t]
+