diff options
Diffstat (limited to 'src/GF/UseGrammar/Tokenize.hs')
| -rw-r--r-- | src/GF/UseGrammar/Tokenize.hs | 130 |
1 files changed, 130 insertions, 0 deletions
diff --git a/src/GF/UseGrammar/Tokenize.hs b/src/GF/UseGrammar/Tokenize.hs new file mode 100644 index 000000000..dd0879931 --- /dev/null +++ b/src/GF/UseGrammar/Tokenize.hs @@ -0,0 +1,130 @@ +module Tokenize where + +import Operations +---- import UseGrammar (isLiteral,identC) +import CFIdent + +import Char + +-- lexers = tokenizers, to prepare input for GF grammars. AR 4/1/2002 +-- an entry for each is included in Custom.customTokenizer + +-- just words + +tokWords :: String -> [CFTok] +tokWords = map tS . words + +tokLits :: String -> [CFTok] +tokLits = map mkCFTok . words + +tokVars :: String -> [CFTok] +tokVars = map mkCFTokVar . words + +mkCFTok :: String -> CFTok +mkCFTok s = tS s ---- if (isLiteral s) then (mkLit s) else (tS s) + +mkCFTokVar :: String -> CFTok +mkCFTokVar s = case s of + '?':_:_ -> tM s + 'x':'_':_ -> tV s + 'x':[] -> tV s + '$':xs@(_:_) -> if last s == '$' then tV (init xs) else tS s + _ -> tS s + +mkLit :: String -> CFTok +mkLit s = if (all isDigit s) then (tI s) else (tL s) + +mkTL :: String -> CFTok +mkTL s = if (all isDigit s) then (tI s) else (tL ("'" ++ s ++ "'")) + + +-- Haskell lexer, usable for much code + +lexHaskell :: String -> [CFTok] +lexHaskell ss = case lex ss of + [(w@(_:_),ws)] -> tS w : lexHaskell ws + _ -> [] + +-- somewhat shaky text lexer + +lexText :: String -> [CFTok] +lexText = uncap . lx where + + lx s = case s of + p : cs | isMPunct p -> tS [p] : uncap (lx cs) + p : cs | isPunct p -> tS [p] : lx cs + s : cs | isSpace s -> lx cs + _ : _ -> getWord s + _ -> [] + + getWord s = tS w : lx ws where (w,ws) = span isNotSpec s + isMPunct c = elem c ".!?" + isPunct c = elem c ",:;()\"" + isNotSpec c = not (isMPunct c || isPunct c || isSpace c) + uncap (TS (c:cs) : ws) = tC (c:cs) : ws + uncap s = s + +-- lexer for C--, a mini variant of C + +lexC2M :: String -> [CFTok] +lexC2M = lexC2M' False + +lexC2M' :: Bool -> String -> [CFTok] +lexC2M' isHigherOrder s = case s of + '#':cs -> lexC $ dropWhile (/='\n') cs + '/':'*':cs -> lexC $ dropComment cs + c:cs | isSpace c -> lexC cs + c:cs | isAlpha c -> getId s + c:cs | isDigit c -> getLit s + c:d:cs | isSymb [c,d] -> tS [c,d] : lexC cs + c:cs | isSymb [c] -> tS [c] : lexC cs + _ -> [] --- covers end of file and unknown characters + where + lexC = lexC2M' isHigherOrder + getId s = mkT i : lexC cs where (i,cs) = span isIdChar s + getLit s = tI i : lexC cs where (i,cs) = span isDigit s + isIdChar c = isAlpha c || isDigit c || elem c "'_" + isSymb = reservedAnsiCSymbol + dropComment s = case s of + '*':'/':cs -> cs + _:cs -> dropComment cs + _ -> [] + mkT i = if (isRes i) then (tS i) else + if isHigherOrder then (tV i) else (tL ("'" ++ i ++ "'")) + isRes = reservedAnsiC + + +reservedAnsiCSymbol s = case lookupTree show s ansiCtree of + Ok True -> True + _ -> False + +reservedAnsiC s = case lookupTree show s ansiCtree of + Ok False -> True + _ -> False + +-- for an efficient lexer: precompile this! +ansiCtree = buildTree $ [(s,True) | s <- reservedAnsiCSymbols] ++ + [(s,False) | s <- reservedAnsiCWords] + +reservedAnsiCSymbols = words $ + "<<= >>= << >> ++ -- == <= >= *= += -= %= /= &= ^= |= " ++ + "^ { } = , ; + * - ( ) < > & % ! ~" + +reservedAnsiCWords = words $ + "auto break case char const continue default " ++ + "do double else enum extern float for goto if int " ++ + "long register return short signed sizeof static struct switch typedef " ++ + "union unsigned void volatile while " ++ + "main printin putchar" --- these are not ansi-C + +-- turn unknown tokens into string literals; not recursively for literals 123, 'foo' + +unknown2string :: (String -> Bool) -> [CFTok] -> [CFTok] +unknown2string isKnown = map mkOne where + mkOne t@(TS s) = if isKnown s then t else mkTL s + mkOne t@(TC s) = if isKnown s then t else mkTL s + mkOne t = t + +lexTextLiteral isKnown = unknown2string isKnown . lexText +lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell + |
