summaryrefslogtreecommitdiff
path: root/src/GF/UseGrammar/Tokenize.hs
diff options
context:
space:
mode:
authoraarne <unknown>2003-09-22 13:16:55 +0000
committeraarne <unknown>2003-09-22 13:16:55 +0000
commitb1402e8bd6a68a891b00a214d6cf184d66defe19 (patch)
tree90372ac4e53dce91cf949dbf8e93be06f1d9e8bd /src/GF/UseGrammar/Tokenize.hs
Founding the newly structured GF2.0 cvs archive.
Diffstat (limited to 'src/GF/UseGrammar/Tokenize.hs')
-rw-r--r--src/GF/UseGrammar/Tokenize.hs130
1 files changed, 130 insertions, 0 deletions
diff --git a/src/GF/UseGrammar/Tokenize.hs b/src/GF/UseGrammar/Tokenize.hs
new file mode 100644
index 000000000..dd0879931
--- /dev/null
+++ b/src/GF/UseGrammar/Tokenize.hs
@@ -0,0 +1,130 @@
+module Tokenize where
+
+import Operations
+---- import UseGrammar (isLiteral,identC)
+import CFIdent
+
+import Char
+
+-- lexers = tokenizers, to prepare input for GF grammars. AR 4/1/2002
+-- an entry for each is included in Custom.customTokenizer
+
+-- just words
+
+tokWords :: String -> [CFTok]
+tokWords = map tS . words
+
+tokLits :: String -> [CFTok]
+tokLits = map mkCFTok . words
+
+tokVars :: String -> [CFTok]
+tokVars = map mkCFTokVar . words
+
+mkCFTok :: String -> CFTok
+mkCFTok s = tS s ---- if (isLiteral s) then (mkLit s) else (tS s)
+
+mkCFTokVar :: String -> CFTok
+mkCFTokVar s = case s of
+ '?':_:_ -> tM s
+ 'x':'_':_ -> tV s
+ 'x':[] -> tV s
+ '$':xs@(_:_) -> if last s == '$' then tV (init xs) else tS s
+ _ -> tS s
+
+mkLit :: String -> CFTok
+mkLit s = if (all isDigit s) then (tI s) else (tL s)
+
+mkTL :: String -> CFTok
+mkTL s = if (all isDigit s) then (tI s) else (tL ("'" ++ s ++ "'"))
+
+
+-- Haskell lexer, usable for much code
+
+lexHaskell :: String -> [CFTok]
+lexHaskell ss = case lex ss of
+ [(w@(_:_),ws)] -> tS w : lexHaskell ws
+ _ -> []
+
+-- somewhat shaky text lexer
+
+lexText :: String -> [CFTok]
+lexText = uncap . lx where
+
+ lx s = case s of
+ p : cs | isMPunct p -> tS [p] : uncap (lx cs)
+ p : cs | isPunct p -> tS [p] : lx cs
+ s : cs | isSpace s -> lx cs
+ _ : _ -> getWord s
+ _ -> []
+
+ getWord s = tS w : lx ws where (w,ws) = span isNotSpec s
+ isMPunct c = elem c ".!?"
+ isPunct c = elem c ",:;()\""
+ isNotSpec c = not (isMPunct c || isPunct c || isSpace c)
+ uncap (TS (c:cs) : ws) = tC (c:cs) : ws
+ uncap s = s
+
+-- lexer for C--, a mini variant of C
+
+lexC2M :: String -> [CFTok]
+lexC2M = lexC2M' False
+
+lexC2M' :: Bool -> String -> [CFTok]
+lexC2M' isHigherOrder s = case s of
+ '#':cs -> lexC $ dropWhile (/='\n') cs
+ '/':'*':cs -> lexC $ dropComment cs
+ c:cs | isSpace c -> lexC cs
+ c:cs | isAlpha c -> getId s
+ c:cs | isDigit c -> getLit s
+ c:d:cs | isSymb [c,d] -> tS [c,d] : lexC cs
+ c:cs | isSymb [c] -> tS [c] : lexC cs
+ _ -> [] --- covers end of file and unknown characters
+ where
+ lexC = lexC2M' isHigherOrder
+ getId s = mkT i : lexC cs where (i,cs) = span isIdChar s
+ getLit s = tI i : lexC cs where (i,cs) = span isDigit s
+ isIdChar c = isAlpha c || isDigit c || elem c "'_"
+ isSymb = reservedAnsiCSymbol
+ dropComment s = case s of
+ '*':'/':cs -> cs
+ _:cs -> dropComment cs
+ _ -> []
+ mkT i = if (isRes i) then (tS i) else
+ if isHigherOrder then (tV i) else (tL ("'" ++ i ++ "'"))
+ isRes = reservedAnsiC
+
+
+reservedAnsiCSymbol s = case lookupTree show s ansiCtree of
+ Ok True -> True
+ _ -> False
+
+reservedAnsiC s = case lookupTree show s ansiCtree of
+ Ok False -> True
+ _ -> False
+
+-- for an efficient lexer: precompile this!
+ansiCtree = buildTree $ [(s,True) | s <- reservedAnsiCSymbols] ++
+ [(s,False) | s <- reservedAnsiCWords]
+
+reservedAnsiCSymbols = words $
+ "<<= >>= << >> ++ -- == <= >= *= += -= %= /= &= ^= |= " ++
+ "^ { } = , ; + * - ( ) < > & % ! ~"
+
+reservedAnsiCWords = words $
+ "auto break case char const continue default " ++
+ "do double else enum extern float for goto if int " ++
+ "long register return short signed sizeof static struct switch typedef " ++
+ "union unsigned void volatile while " ++
+ "main printin putchar" --- these are not ansi-C
+
+-- turn unknown tokens into string literals; not recursively for literals 123, 'foo'
+
+unknown2string :: (String -> Bool) -> [CFTok] -> [CFTok]
+unknown2string isKnown = map mkOne where
+ mkOne t@(TS s) = if isKnown s then t else mkTL s
+ mkOne t@(TC s) = if isKnown s then t else mkTL s
+ mkOne t = t
+
+lexTextLiteral isKnown = unknown2string isKnown . lexText
+lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell
+