diff options
| author | hallgren <hallgren@chalmers.se> | 2013-11-25 21:12:11 +0000 |
|---|---|---|
| committer | hallgren <hallgren@chalmers.se> | 2013-11-25 21:12:11 +0000 |
| commit | 9d7fdf7c9a525a3b5659a566f76d26d151dcd664 (patch) | |
| tree | 9ea97377d9938fc382c2036fa4c8fef9c33e33d8 /src/compiler/GF/Grammar/Lexer.x | |
| parent | 3210a506484864430504ed1caf2f547bb674e701 (diff) | |
Change how GF deals with character encodings in grammar files
1. The default encoding is changed from Latin-1 to UTF-8.
2. Alternate encodings should be specified as "--# -coding=enc", the old
"flags coding=enc" declarations have no effect but are still checked for
consistency.
3. A transitional warning is generated for files that contain non-ASCII
characters without specifying a character encoding:
"Warning: default encoding has changed from Latin-1 to UTF-8"
4. Conversion to Unicode is now done *before* lexing. This makes it possible
to allow arbitrary Unicode characters in identifiers. But identifiers are
still stored as ByteStrings, so they are limited to Latin-1 characters
for now.
5. Lexer.hs is no longer part of the repository. We now generate the lexer
from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were
needed. These bugs might already be fixed in newer versions of alex, but
we should be compatible with what is shipped in the Haskell Platform.
Diffstat (limited to 'src/compiler/GF/Grammar/Lexer.x')
| -rw-r--r-- | src/compiler/GF/Grammar/Lexer.x | 286 |
1 files changed, 286 insertions, 0 deletions
diff --git a/src/compiler/GF/Grammar/Lexer.x b/src/compiler/GF/Grammar/Lexer.x new file mode 100644 index 000000000..60c51f814 --- /dev/null +++ b/src/compiler/GF/Grammar/Lexer.x @@ -0,0 +1,286 @@ +-- -*- haskell -*- +{ +module GF.Grammar.Lexer + ( Token(..), Posn(..) + , P, runP, lexer, getPosn, failLoc + , isReservedWord + ) where + +import GF.Infra.Ident +--import GF.Data.Operations +import qualified Data.ByteString.Char8 as BS +import qualified Data.ByteString as WBS +import qualified Data.ByteString.Internal as BS(w2c) +import qualified Data.ByteString.UTF8 as UTF8 +import qualified Data.Map as Map +import Data.Word(Word8) +--import Debug.Trace(trace) +} + + +$l = [a-zA-Z\192 - \255] # [\215 \247] +$c = [A-Z\192-\221] # [\215] +$s = [a-z\222-\255] # [\247] +$d = [0-9] -- digit +$i = [$l $d _ '] -- identifier character +$u = [.\n] -- universal: any character + +@rsyms = -- symbols and non-identifier-like reserved words + \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ + +:- +"--" [.]* ; -- Toss single line comments +"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ; + +$white+ ; +@rsyms { tok (res (T_Ident . identS)) } +\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) } +(\_ | $l)($l | $d | \_ | \')* { tok (res (T_Ident . identS)) } + +\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) } + +(\-)? $d+ { tok (T_Integer . read . unpack) } +(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) } + +{ +--unpack = BS.unpack +unpack = id + +tok :: (String->Token) -> Posn -> String -> Token +tok f p s = f s + +data Token + = T_exclmark + | T_patt + | T_int_label + | T_oparen + | T_cparen + | T_tilde + | T_star + | T_starstar + | T_plus + | T_plusplus + | T_comma + | T_minus + | T_rarrow + | T_dot + | T_alt + | T_colon + | T_semicolon + | T_less + | T_equal + | T_big_rarrow + | T_great + | T_questmark + | T_obrack + | T_lam + | T_lamlam + | T_cbrack + | T_ocurly + | T_bar + | T_ccurly + | T_underscore + | T_at + | T_PType + | T_Str + | T_Strs + | T_Tok + | T_Type + | T_abstract + | T_case + | T_cat + | T_concrete + | T_data + | T_def + | T_flags + | T_fn + | T_fun + | T_in + | T_incomplete + | T_instance + | T_interface + | T_let + | T_lin + | T_lincat + | T_lindef + | T_linref + | T_of + | T_open + | T_oper + | T_param + | T_pattern + | T_pre + | T_printname + | T_resource + | T_strs + | T_table + | T_transfer + | T_variants + | T_where + | T_with + | T_String String -- string literals + | T_Integer Int -- integer literals + | T_Double Double -- double precision float literals + | T_Ident Ident + | T_EOF +-- deriving Show -- debug + +res = eitherResIdent +eitherResIdent :: (String -> Token) -> String -> Token +eitherResIdent tv s = + case Map.lookup s resWords of + Just t -> t + Nothing -> tv s + +isReservedWord :: BS.ByteString -> Bool +isReservedWord s = Map.member (BS.unpack s) resWords + +resWords = Map.fromList + [ b "!" T_exclmark + , b "#" T_patt + , b "$" T_int_label + , b "(" T_oparen + , b ")" T_cparen + , b "~" T_tilde + , b "*" T_star + , b "**" T_starstar + , b "+" T_plus + , b "++" T_plusplus + , b "," T_comma + , b "-" T_minus + , b "->" T_rarrow + , b "." T_dot + , b "/" T_alt + , b ":" T_colon + , b ";" T_semicolon + , b "<" T_less + , b "=" T_equal + , b "=>" T_big_rarrow + , b ">" T_great + , b "?" T_questmark + , b "[" T_obrack + , b "]" T_cbrack + , b "\\" T_lam + , b "\\\\" T_lamlam + , b "{" T_ocurly + , b "}" T_ccurly + , b "|" T_bar + , b "_" T_underscore + , b "@" T_at + , b "PType" T_PType + , b "Str" T_Str + , b "Strs" T_Strs + , b "Tok" T_Tok + , b "Type" T_Type + , b "abstract" T_abstract + , b "case" T_case + , b "cat" T_cat + , b "concrete" T_concrete + , b "data" T_data + , b "def" T_def + , b "flags" T_flags + , b "fn" T_fn + , b "fun" T_fun + , b "in" T_in + , b "incomplete" T_incomplete + , b "instance" T_instance + , b "interface" T_interface + , b "let" T_let + , b "lin" T_lin + , b "lincat" T_lincat + , b "lindef" T_lindef + , b "linref" T_linref + , b "of" T_of + , b "open" T_open + , b "oper" T_oper + , b "param" T_param + , b "pattern" T_pattern + , b "pre" T_pre + , b "printname" T_printname + , b "resource" T_resource + , b "strs" T_strs + , b "table" T_table + , b "transfer" T_transfer + , b "variants" T_variants + , b "where" T_where + , b "with" T_with + ] + where b s t = (s, t) + +unescapeInitTail :: String -> String +unescapeInitTail = unesc . tail where + unesc s = case s of + '\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs + '\\':'n':cs -> '\n' : unesc cs + '\\':'t':cs -> '\t' : unesc cs + '\"':[] -> [] + '\'':[] -> [] + c:cs -> c : unesc cs + _ -> [] + +------------------------------------------------------------------- +-- Alex wrapper code. +-- A modified "posn" wrapper. +------------------------------------------------------------------- + +data Posn = Pn {-# UNPACK #-} !Int + {-# UNPACK #-} !Int + +alexMove :: Posn -> Char -> Posn +alexMove (Pn l c) '\n' = Pn (l+1) 1 +alexMove (Pn l c) _ = Pn l (c+1) + +alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) +alexGetByte (AI p _ s) = + case WBS.uncons s of + Nothing -> Nothing + Just (w,s) -> + let p' = alexMove p c + c = BS.w2c w + in p' `seq` Just (w, (AI p' c s)) + +alexInputPrevChar :: AlexInput -> Char +alexInputPrevChar (AI p c s) = c + +data AlexInput = AI {-# UNPACK #-} !Posn -- current position, + {-# UNPACK #-} !Char -- previous char + {-# UNPACK #-} !BS.ByteString -- current input string + +data ParseResult a + = POk a + | PFailed Posn -- The position of the error + String -- The error message + +newtype P a = P { unP :: AlexInput -> ParseResult a } + +instance Monad P where + return a = a `seq` (P $ \s -> POk a) + (P m) >>= k = P $ \ s -> case m s of + POk a -> unP (k a) s + PFailed posn err -> PFailed posn err + fail msg = P $ \(AI posn _ _) -> PFailed posn msg + +runP :: P a -> BS.ByteString -> Either (Posn,String) a +runP (P f) txt = + case f (AI (Pn 1 0) ' ' txt) of + POk x -> Right x + PFailed pos msg -> Left (pos,msg) + +failLoc :: Posn -> String -> P a +failLoc pos msg = P $ \_ -> PFailed pos msg + +lexer :: (Token -> P a) -> P a +lexer cont = P go + where + --cont' t = trace (show t) (cont t) + go inp@(AI pos _ str) = + case alexScan inp 0 of + AlexEOF -> unP (cont T_EOF) inp + AlexError (AI pos _ _) -> PFailed pos "lexical error" + AlexSkip inp' len -> {-trace (show len) $-} go inp' + AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp' + +getPosn :: P Posn +getPosn = P $ \inp@(AI pos _ _) -> POk pos + +} |
