summaryrefslogtreecommitdiff
path: root/src/GF/UseGrammar/Tokenize.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/GF/UseGrammar/Tokenize.hs')
-rw-r--r--src/GF/UseGrammar/Tokenize.hs222
1 files changed, 0 insertions, 222 deletions
diff --git a/src/GF/UseGrammar/Tokenize.hs b/src/GF/UseGrammar/Tokenize.hs
deleted file mode 100644
index 9f1ab5449..000000000
--- a/src/GF/UseGrammar/Tokenize.hs
+++ /dev/null
@@ -1,222 +0,0 @@
-----------------------------------------------------------------------
--- |
--- Module : Tokenize
--- Maintainer : AR
--- Stability : (stable)
--- Portability : (portable)
---
--- > CVS $Date: 2005/09/29 13:20:08 $
--- > CVS $Author: aarne $
--- > CVS $Revision: 1.14 $
---
--- lexers = tokenizers, to prepare input for GF grammars. AR 4\/1\/2002.
--- an entry for each is included in 'Custom.customTokenizer'
------------------------------------------------------------------------------
-
-module GF.UseGrammar.Tokenize ( tokWords,
- tokLits,
- tokVars,
- lexHaskell,
- lexHaskellLiteral,
- lexHaskellVar,
- lexText,
- lexTextVar,
- lexC2M, lexC2M',
- lexTextLiteral,
- lexIgnore,
- wordsLits
- ) where
-
-import GF.Data.Operations
----- import UseGrammar (isLiteral,identC)
-import GF.CF.CFIdent
-
-import Data.Char
-
--- lexers = tokenizers, to prepare input for GF grammars. AR 4/1/2002
--- an entry for each is included in Custom.customTokenizer
-
--- | just words
-tokWords :: String -> [CFTok]
-tokWords = map tS . words
-
-tokLits :: String -> [CFTok]
-tokLits = map mkCFTok . mergeStr . wordsLits where
- mergeStr ss = case ss of
- w@(c:cs):rest | elem c "\'\"" && c /= last w -> getStr [w] rest
- w :rest -> w : mergeStr rest
- [] -> []
- getStr v ss = case ss of
- w@(_:_):rest | elem (last w) "\'\"" -> (unwords (reverse (w:v))) : mergeStr rest
- w :rest -> getStr (w:v) rest
- [] -> reverse v
-
-tokVars :: String -> [CFTok]
-tokVars = map mkCFTokVar . wordsLits
-
-isFloat s = case s of
- c:cs | isDigit c -> isFloat cs
- '.':cs@(_:_) -> all isDigit cs
- _ -> False
-
-isString s = case s of
- c:cs@(_:_) -> (c == '\'' && d == '\'') || (c == '"' && d == '"') where d = last cs
- _ -> False
-
-
-mkCFTok :: String -> CFTok
-mkCFTok s = case s of
- '"' :cs@(_:_) | last cs == '"' -> tL $ init cs
- '\'':cs@(_:_) | last cs == '\'' -> tL $ init cs --- 's Gravenhage
- _:_ | isFloat s -> tF s
- _:_ | all isDigit s -> tI s
- _ -> tS s
-
-mkCFTokVar :: String -> CFTok
-mkCFTokVar s = case s of
- '?':_:_ -> tM s --- "?" --- compat with prCF
- 'x':'_':_ -> tV s
- 'x':[] -> tV s
- '$':xs@(_:_) -> if last s == '$' then tV (init xs) else tS s
- _ -> tS s
-
-mkTokVars :: (String -> [CFTok]) -> String -> [CFTok]
-mkTokVars tok = map tv . tok where
- tv (TS s) = mkCFTokVar s
- tv t = t
-
-mkLit :: String -> CFTok
-mkLit s
- | isFloat s = tF s
- | all isDigit s = tI s
- | otherwise = tL s
-
--- obsolete
-mkTL :: String -> CFTok
-mkTL s
- | isFloat s = tF s
- | all isDigit s = tI s
- | otherwise = tL ("'" ++ s ++ "'")
-
-
--- | Haskell lexer, usable for much code
-lexHaskell :: String -> [CFTok]
-lexHaskell ss = case lex ss of
- [(w@(_:_),ws)] -> tS w : lexHaskell ws
- _ -> []
-
--- | somewhat shaky text lexer
-lexText :: String -> [CFTok]
-lexText = uncap . lx where
-
- lx s = case s of
- '?':'?':cs -> tS "??" : lx cs
- p : cs | isMPunct p -> tS [p] : uncap (lx cs)
- p : cs | isPunct p -> tS [p] : lx cs
- s : cs | isSpace s -> lx cs
- _ : _ -> getWord s
- _ -> []
-
- getWord s = tS w : lx ws where (w,ws) = span isNotSpec s
- isMPunct c = elem c ".!?"
- isPunct c = elem c ",:;()\""
- isNotSpec c = not (isMPunct c || isPunct c || isSpace c)
- uncap (TS (c:cs) : ws) = tC (c:cs) : ws
- uncap s = s
-
--- | lexer for C--, a mini variant of C
-lexC2M :: String -> [CFTok]
-lexC2M = lexC2M' False
-
-lexC2M' :: Bool -> String -> [CFTok]
-lexC2M' isHigherOrder s = case s of
- '#':cs -> lexC $ dropWhile (/='\n') cs
- '/':'*':cs -> lexC $ dropComment cs
- c:cs | isSpace c -> lexC cs
- c:cs | isAlpha c -> getId s
- c:cs | isDigit c -> getLit s
- c:d:cs | isSymb [c,d] -> tS [c,d] : lexC cs
- c:cs | isSymb [c] -> tS [c] : lexC cs
- _ -> [] --- covers end of file and unknown characters
- where
- lexC = lexC2M' isHigherOrder
- getId s = mkT i : lexC cs where (i,cs) = span isIdChar s
- getLit s = tI i : lexC cs where (i,cs) = span isDigit s ---- Float!
- isIdChar c = isAlpha c || isDigit c || elem c "'_"
- isSymb = reservedAnsiCSymbol
- dropComment s = case s of
- '*':'/':cs -> cs
- _:cs -> dropComment cs
- _ -> []
- mkT i = if (isRes i) then (tS i) else
- if isHigherOrder then (tV i) else (tL ("'" ++ i ++ "'"))
- isRes = reservedAnsiC
-
-
-reservedAnsiCSymbol s = case lookupTree show s ansiCtree of
- Ok True -> True
- _ -> False
-
-reservedAnsiC s = case lookupTree show s ansiCtree of
- Ok False -> True
- _ -> False
-
--- | for an efficient lexer: precompile this!
-ansiCtree = buildTree $ [(s,True) | s <- reservedAnsiCSymbols] ++
- [(s,False) | s <- reservedAnsiCWords]
-
-reservedAnsiCSymbols = words $
- "<<= >>= << >> ++ -- == <= >= *= += -= %= /= &= ^= |= " ++
- "^ { } = , ; + * - ( ) < > & % ! ~"
-
-reservedAnsiCWords = words $
- "auto break case char const continue default " ++
- "do double else enum extern float for goto if int " ++
- "long register return short signed sizeof static struct switch typedef " ++
- "union unsigned void volatile while " ++
- "main printin putchar" --- these are not ansi-C
-
--- | turn unknown tokens into string literals; not recursively for literals 123, 'foo'
-unknown2string :: (String -> Bool) -> [CFTok] -> [CFTok]
-unknown2string isKnown = map mkOne where
- mkOne t@(TS s)
- | isKnown s = t
- | isFloat s = tF s
- | all isDigit s = tI s
- | otherwise = tL s
- mkOne t@(TC s) = if isKnown s then t else mkLit s
- mkOne t = t
-
-unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok]
-unknown2var isKnown = map mkOne where
- mkOne t@(TS "??") = if isKnown "??" then t else tM "??"
- mkOne t@(TS s)
- | isKnown s = t
- | isFloat s = tF s
- | isString s = tL (init (tail s))
- | all isDigit s = tI s
- | otherwise = tV s
- mkOne t@(TC s) = if isKnown s then t else tV s
- mkOne t = t
-
-lexTextLiteral, lexHaskellLiteral, lexHaskellVar :: (String -> Bool) -> String -> [CFTok]
-
-lexTextLiteral isKnown = unknown2string (eitherUpper isKnown) . lexText
-lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell
-
-lexHaskellVar isKnown = unknown2var isKnown . lexHaskell
-lexTextVar isKnown = unknown2var (eitherUpper isKnown) . lexText
-
-
-eitherUpper isKnown w@(c:cs) = isKnown (toLower c : cs) || isKnown (toUpper c : cs)
-eitherUpper isKnown w = isKnown w
-
--- ignore unknown tokens (e.g. keyword spotting)
-
-lexIgnore :: (String -> Bool) -> [CFTok] -> [CFTok]
-lexIgnore isKnown = concatMap mkOne where
- mkOne t@(TS s)
- | isKnown s = [t]
- | otherwise = []
- mkOne t = [t]
-