diff options
| author | aarne <aarne@cs.chalmers.se> | 2008-05-21 09:26:44 +0000 |
|---|---|---|
| committer | aarne <aarne@cs.chalmers.se> | 2008-05-21 09:26:44 +0000 |
| commit | 055c0d0d5a5bb0dc75904fe53df7f2e4f5732a8f (patch) | |
| tree | 0e63fb68c69c8f6ad0f78893c63420f0a3600e1c /src-3.0/GF/UseGrammar/Tokenize.hs | |
| parent | 915a1de71783ab8446b1af9e72c7ba7dfbc12d3f (diff) | |
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
Diffstat (limited to 'src-3.0/GF/UseGrammar/Tokenize.hs')
| -rw-r--r-- | src-3.0/GF/UseGrammar/Tokenize.hs | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/src-3.0/GF/UseGrammar/Tokenize.hs b/src-3.0/GF/UseGrammar/Tokenize.hs new file mode 100644 index 000000000..9f1ab5449 --- /dev/null +++ b/src-3.0/GF/UseGrammar/Tokenize.hs @@ -0,0 +1,222 @@ +---------------------------------------------------------------------- +-- | +-- Module : Tokenize +-- Maintainer : AR +-- Stability : (stable) +-- Portability : (portable) +-- +-- > CVS $Date: 2005/09/29 13:20:08 $ +-- > CVS $Author: aarne $ +-- > CVS $Revision: 1.14 $ +-- +-- lexers = tokenizers, to prepare input for GF grammars. AR 4\/1\/2002. +-- an entry for each is included in 'Custom.customTokenizer' +----------------------------------------------------------------------------- + +module GF.UseGrammar.Tokenize ( tokWords, + tokLits, + tokVars, + lexHaskell, + lexHaskellLiteral, + lexHaskellVar, + lexText, + lexTextVar, + lexC2M, lexC2M', + lexTextLiteral, + lexIgnore, + wordsLits + ) where + +import GF.Data.Operations +---- import UseGrammar (isLiteral,identC) +import GF.CF.CFIdent + +import Data.Char + +-- lexers = tokenizers, to prepare input for GF grammars. AR 4/1/2002 +-- an entry for each is included in Custom.customTokenizer + +-- | just words +tokWords :: String -> [CFTok] +tokWords = map tS . words + +tokLits :: String -> [CFTok] +tokLits = map mkCFTok . mergeStr . wordsLits where + mergeStr ss = case ss of + w@(c:cs):rest | elem c "\'\"" && c /= last w -> getStr [w] rest + w :rest -> w : mergeStr rest + [] -> [] + getStr v ss = case ss of + w@(_:_):rest | elem (last w) "\'\"" -> (unwords (reverse (w:v))) : mergeStr rest + w :rest -> getStr (w:v) rest + [] -> reverse v + +tokVars :: String -> [CFTok] +tokVars = map mkCFTokVar . wordsLits + +isFloat s = case s of + c:cs | isDigit c -> isFloat cs + '.':cs@(_:_) -> all isDigit cs + _ -> False + +isString s = case s of + c:cs@(_:_) -> (c == '\'' && d == '\'') || (c == '"' && d == '"') where d = last cs + _ -> False + + +mkCFTok :: String -> CFTok +mkCFTok s = case s of + '"' :cs@(_:_) | last cs == '"' -> tL $ init cs + '\'':cs@(_:_) | last cs == '\'' -> tL $ init cs --- 's Gravenhage + _:_ | isFloat s -> tF s + _:_ | all isDigit s -> tI s + _ -> tS s + +mkCFTokVar :: String -> CFTok +mkCFTokVar s = case s of + '?':_:_ -> tM s --- "?" --- compat with prCF + 'x':'_':_ -> tV s + 'x':[] -> tV s + '$':xs@(_:_) -> if last s == '$' then tV (init xs) else tS s + _ -> tS s + +mkTokVars :: (String -> [CFTok]) -> String -> [CFTok] +mkTokVars tok = map tv . tok where + tv (TS s) = mkCFTokVar s + tv t = t + +mkLit :: String -> CFTok +mkLit s + | isFloat s = tF s + | all isDigit s = tI s + | otherwise = tL s + +-- obsolete +mkTL :: String -> CFTok +mkTL s + | isFloat s = tF s + | all isDigit s = tI s + | otherwise = tL ("'" ++ s ++ "'") + + +-- | Haskell lexer, usable for much code +lexHaskell :: String -> [CFTok] +lexHaskell ss = case lex ss of + [(w@(_:_),ws)] -> tS w : lexHaskell ws + _ -> [] + +-- | somewhat shaky text lexer +lexText :: String -> [CFTok] +lexText = uncap . lx where + + lx s = case s of + '?':'?':cs -> tS "??" : lx cs + p : cs | isMPunct p -> tS [p] : uncap (lx cs) + p : cs | isPunct p -> tS [p] : lx cs + s : cs | isSpace s -> lx cs + _ : _ -> getWord s + _ -> [] + + getWord s = tS w : lx ws where (w,ws) = span isNotSpec s + isMPunct c = elem c ".!?" + isPunct c = elem c ",:;()\"" + isNotSpec c = not (isMPunct c || isPunct c || isSpace c) + uncap (TS (c:cs) : ws) = tC (c:cs) : ws + uncap s = s + +-- | lexer for C--, a mini variant of C +lexC2M :: String -> [CFTok] +lexC2M = lexC2M' False + +lexC2M' :: Bool -> String -> [CFTok] +lexC2M' isHigherOrder s = case s of + '#':cs -> lexC $ dropWhile (/='\n') cs + '/':'*':cs -> lexC $ dropComment cs + c:cs | isSpace c -> lexC cs + c:cs | isAlpha c -> getId s + c:cs | isDigit c -> getLit s + c:d:cs | isSymb [c,d] -> tS [c,d] : lexC cs + c:cs | isSymb [c] -> tS [c] : lexC cs + _ -> [] --- covers end of file and unknown characters + where + lexC = lexC2M' isHigherOrder + getId s = mkT i : lexC cs where (i,cs) = span isIdChar s + getLit s = tI i : lexC cs where (i,cs) = span isDigit s ---- Float! + isIdChar c = isAlpha c || isDigit c || elem c "'_" + isSymb = reservedAnsiCSymbol + dropComment s = case s of + '*':'/':cs -> cs + _:cs -> dropComment cs + _ -> [] + mkT i = if (isRes i) then (tS i) else + if isHigherOrder then (tV i) else (tL ("'" ++ i ++ "'")) + isRes = reservedAnsiC + + +reservedAnsiCSymbol s = case lookupTree show s ansiCtree of + Ok True -> True + _ -> False + +reservedAnsiC s = case lookupTree show s ansiCtree of + Ok False -> True + _ -> False + +-- | for an efficient lexer: precompile this! +ansiCtree = buildTree $ [(s,True) | s <- reservedAnsiCSymbols] ++ + [(s,False) | s <- reservedAnsiCWords] + +reservedAnsiCSymbols = words $ + "<<= >>= << >> ++ -- == <= >= *= += -= %= /= &= ^= |= " ++ + "^ { } = , ; + * - ( ) < > & % ! ~" + +reservedAnsiCWords = words $ + "auto break case char const continue default " ++ + "do double else enum extern float for goto if int " ++ + "long register return short signed sizeof static struct switch typedef " ++ + "union unsigned void volatile while " ++ + "main printin putchar" --- these are not ansi-C + +-- | turn unknown tokens into string literals; not recursively for literals 123, 'foo' +unknown2string :: (String -> Bool) -> [CFTok] -> [CFTok] +unknown2string isKnown = map mkOne where + mkOne t@(TS s) + | isKnown s = t + | isFloat s = tF s + | all isDigit s = tI s + | otherwise = tL s + mkOne t@(TC s) = if isKnown s then t else mkLit s + mkOne t = t + +unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok] +unknown2var isKnown = map mkOne where + mkOne t@(TS "??") = if isKnown "??" then t else tM "??" + mkOne t@(TS s) + | isKnown s = t + | isFloat s = tF s + | isString s = tL (init (tail s)) + | all isDigit s = tI s + | otherwise = tV s + mkOne t@(TC s) = if isKnown s then t else tV s + mkOne t = t + +lexTextLiteral, lexHaskellLiteral, lexHaskellVar :: (String -> Bool) -> String -> [CFTok] + +lexTextLiteral isKnown = unknown2string (eitherUpper isKnown) . lexText +lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell + +lexHaskellVar isKnown = unknown2var isKnown . lexHaskell +lexTextVar isKnown = unknown2var (eitherUpper isKnown) . lexText + + +eitherUpper isKnown w@(c:cs) = isKnown (toLower c : cs) || isKnown (toUpper c : cs) +eitherUpper isKnown w = isKnown w + +-- ignore unknown tokens (e.g. keyword spotting) + +lexIgnore :: (String -> Bool) -> [CFTok] -> [CFTok] +lexIgnore isKnown = concatMap mkOne where + mkOne t@(TS s) + | isKnown s = [t] + | otherwise = [] + mkOne t = [t] + |
