diff options
| author | hallgren <hallgren@chalmers.se> | 2013-11-25 21:12:11 +0000 |
|---|---|---|
| committer | hallgren <hallgren@chalmers.se> | 2013-11-25 21:12:11 +0000 |
| commit | 9d7fdf7c9a525a3b5659a566f76d26d151dcd664 (patch) | |
| tree | 9ea97377d9938fc382c2036fa4c8fef9c33e33d8 /src/compiler/GF/Grammar/lexer | |
| parent | 3210a506484864430504ed1caf2f547bb674e701 (diff) | |
Change how GF deals with character encodings in grammar files
1. The default encoding is changed from Latin-1 to UTF-8.
2. Alternate encodings should be specified as "--# -coding=enc", the old
"flags coding=enc" declarations have no effect but are still checked for
consistency.
3. A transitional warning is generated for files that contain non-ASCII
characters without specifying a character encoding:
"Warning: default encoding has changed from Latin-1 to UTF-8"
4. Conversion to Unicode is now done *before* lexing. This makes it possible
to allow arbitrary Unicode characters in identifiers. But identifiers are
still stored as ByteStrings, so they are limited to Latin-1 characters
for now.
5. Lexer.hs is no longer part of the repository. We now generate the lexer
from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were
needed. These bugs might already be fixed in newer versions of alex, but
we should be compatible with what is shipped in the Haskell Platform.
Diffstat (limited to 'src/compiler/GF/Grammar/lexer')
| -rw-r--r-- | src/compiler/GF/Grammar/lexer/Lexer.x | 273 |
1 files changed, 0 insertions, 273 deletions
diff --git a/src/compiler/GF/Grammar/lexer/Lexer.x b/src/compiler/GF/Grammar/lexer/Lexer.x deleted file mode 100644 index 460e7f452..000000000 --- a/src/compiler/GF/Grammar/lexer/Lexer.x +++ /dev/null @@ -1,273 +0,0 @@ -{ -module GF.Grammar.Lexer - ( Token(..), Posn(..) - , P, runP, lexer, getPosn, failLoc - , isReservedWord - ) where - -import GF.Infra.Ident -import qualified Data.ByteString.Char8 as BS -import qualified Data.Map as Map - -} - - -$l = [a-zA-Z\192 - \255] # [\215 \247] -$c = [A-Z\192-\221] # [\215] -$s = [a-z\222-\255] # [\247] -$d = [0-9] -- digit -$i = [$l $d _ '] -- identifier character -$u = [\0-\255] -- universal: any character - -@rsyms = -- symbols and non-identifier-like reserved words - \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ - -:- -"--" [.]* ; -- Toss single line comments -"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ; - -$white+ ; -@rsyms { tok (eitherResIdent (T_Ident . identC . rawIdentC)) } -\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . BS.unpack)) } -(\_ | $l)($l | $d | \_ | \')* { tok (eitherResIdent (T_Ident . identC . rawIdentC)) } - -\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . BS.unpack) } - -(\-)? $d+ { tok (T_Integer . read . BS.unpack) } -(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . BS.unpack) } - -{ - -tok f p s = f s - -data Token - = T_exclmark - | T_patt - | T_int_label - | T_oparen - | T_cparen - | T_tilde - | T_star - | T_starstar - | T_plus - | T_plusplus - | T_comma - | T_minus - | T_rarrow - | T_dot - | T_alt - | T_colon - | T_semicolon - | T_less - | T_equal - | T_big_rarrow - | T_great - | T_questmark - | T_obrack - | T_lam - | T_lamlam - | T_cbrack - | T_ocurly - | T_bar - | T_ccurly - | T_underscore - | T_at - | T_PType - | T_Str - | T_Strs - | T_Tok - | T_Type - | T_abstract - | T_case - | T_cat - | T_concrete - | T_data - | T_def - | T_flags - | T_fn - | T_fun - | T_in - | T_incomplete - | T_instance - | T_interface - | T_let - | T_lin - | T_lincat - | T_lindef - | T_linref - | T_of - | T_open - | T_oper - | T_param - | T_pattern - | T_pre - | T_printname - | T_resource - | T_strs - | T_table - | T_transfer - | T_variants - | T_where - | T_with - | T_String String -- string literals - | T_Integer Int -- integer literals - | T_Double Double -- double precision float literals - | T_Ident Ident - | T_EOF - -eitherResIdent :: (BS.ByteString -> Token) -> BS.ByteString -> Token -eitherResIdent tv s = - case Map.lookup s resWords of - Just t -> t - Nothing -> tv s - -isReservedWord :: BS.ByteString -> Bool -isReservedWord s = Map.member s resWords - -resWords = Map.fromList - [ b "!" T_exclmark - , b "#" T_patt - , b "$" T_int_label - , b "(" T_oparen - , b ")" T_cparen - , b "~" T_tilde - , b "*" T_star - , b "**" T_starstar - , b "+" T_plus - , b "++" T_plusplus - , b "," T_comma - , b "-" T_minus - , b "->" T_rarrow - , b "." T_dot - , b "/" T_alt - , b ":" T_colon - , b ";" T_semicolon - , b "<" T_less - , b "=" T_equal - , b "=>" T_big_rarrow - , b ">" T_great - , b "?" T_questmark - , b "[" T_obrack - , b "]" T_cbrack - , b "\\" T_lam - , b "\\\\" T_lamlam - , b "{" T_ocurly - , b "}" T_ccurly - , b "|" T_bar - , b "_" T_underscore - , b "@" T_at - , b "PType" T_PType - , b "Str" T_Str - , b "Strs" T_Strs - , b "Tok" T_Tok - , b "Type" T_Type - , b "abstract" T_abstract - , b "case" T_case - , b "cat" T_cat - , b "concrete" T_concrete - , b "data" T_data - , b "def" T_def - , b "flags" T_flags - , b "fn" T_fn - , b "fun" T_fun - , b "in" T_in - , b "incomplete" T_incomplete - , b "instance" T_instance - , b "interface" T_interface - , b "let" T_let - , b "lin" T_lin - , b "lincat" T_lincat - , b "lindef" T_lindef - , b "linref" T_linref - , b "of" T_of - , b "open" T_open - , b "oper" T_oper - , b "param" T_param - , b "pattern" T_pattern - , b "pre" T_pre - , b "printname" T_printname - , b "resource" T_resource - , b "strs" T_strs - , b "table" T_table - , b "transfer" T_transfer - , b "variants" T_variants - , b "where" T_where - , b "with" T_with - ] - where b s t = (BS.pack s, t) - -unescapeInitTail :: String -> String -unescapeInitTail = unesc . tail where - unesc s = case s of - '\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs - '\\':'n':cs -> '\n' : unesc cs - '\\':'t':cs -> '\t' : unesc cs - '"':[] -> [] - '\'':[] -> [] - c:cs -> c : unesc cs - _ -> [] - -------------------------------------------------------------------- --- Alex wrapper code. --- A modified "posn" wrapper. -------------------------------------------------------------------- - -data Posn = Pn {-# UNPACK #-} !Int - {-# UNPACK #-} !Int - -alexMove :: Posn -> Char -> Posn -alexMove (Pn l c) '\n' = Pn (l+1) 1 -alexMove (Pn l c) _ = Pn l (c+1) - -alexGetChar :: AlexInput -> Maybe (Char,AlexInput) -alexGetChar (AI p _ s) = - case BS.uncons s of - Nothing -> Nothing - Just (c,s) -> - let p' = alexMove p c - in p' `seq` Just (c, (AI p' c s)) - -alexInputPrevChar :: AlexInput -> Char -alexInputPrevChar (AI p c s) = c - -data AlexInput = AI {-# UNPACK #-} !Posn -- current position, - {-# UNPACK #-} !Char -- previous char - {-# UNPACK #-} !BS.ByteString -- current input string - -data ParseResult a - = POk a - | PFailed Posn -- The position of the error - String -- The error message - -newtype P a = P { unP :: AlexInput -> ParseResult a } - -instance Monad P where - return a = a `seq` (P $ \s -> POk a) - (P m) >>= k = P $ \ s -> case m s of - POk a -> unP (k a) s - PFailed posn err -> PFailed posn err - fail msg = P $ \(AI posn _ _) -> PFailed posn msg - -runP :: P a -> BS.ByteString -> Either (Posn,String) a -runP (P f) txt = - case f (AI (Pn 1 0) ' ' txt) of - POk x -> Right x - PFailed pos msg -> Left (pos,msg) - -failLoc :: Posn -> String -> P a -failLoc pos msg = P $ \_ -> PFailed pos msg - -lexer :: (Token -> P a) -> P a -lexer cont = P go - where - go inp@(AI pos _ str) = - case alexScan inp 0 of - AlexEOF -> unP (cont T_EOF) inp - AlexError (AI pos _ _) -> PFailed pos "lexical error" - AlexSkip inp' len -> go inp' - AlexToken inp' len act -> unP (cont (act pos (BS.take len str))) inp' - -getPosn :: P Posn -getPosn = P $ \inp@(AI pos _ _) -> POk pos - -} |
