summaryrefslogtreecommitdiff
path: root/src/compiler/GF/Grammar/lexer
diff options
context:
space:
mode:
authorhallgren <hallgren@chalmers.se>2013-11-25 21:12:11 +0000
committerhallgren <hallgren@chalmers.se>2013-11-25 21:12:11 +0000
commit9d7fdf7c9a525a3b5659a566f76d26d151dcd664 (patch)
tree9ea97377d9938fc382c2036fa4c8fef9c33e33d8 /src/compiler/GF/Grammar/lexer
parent3210a506484864430504ed1caf2f547bb674e701 (diff)
Change how GF deals with character encodings in grammar files
1. The default encoding is changed from Latin-1 to UTF-8. 2. Alternate encodings should be specified as "--# -coding=enc", the old "flags coding=enc" declarations have no effect but are still checked for consistency. 3. A transitional warning is generated for files that contain non-ASCII characters without specifying a character encoding: "Warning: default encoding has changed from Latin-1 to UTF-8" 4. Conversion to Unicode is now done *before* lexing. This makes it possible to allow arbitrary Unicode characters in identifiers. But identifiers are still stored as ByteStrings, so they are limited to Latin-1 characters for now. 5. Lexer.hs is no longer part of the repository. We now generate the lexer from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were needed. These bugs might already be fixed in newer versions of alex, but we should be compatible with what is shipped in the Haskell Platform.
Diffstat (limited to 'src/compiler/GF/Grammar/lexer')
-rw-r--r--src/compiler/GF/Grammar/lexer/Lexer.x273
1 files changed, 0 insertions, 273 deletions
diff --git a/src/compiler/GF/Grammar/lexer/Lexer.x b/src/compiler/GF/Grammar/lexer/Lexer.x
deleted file mode 100644
index 460e7f452..000000000
--- a/src/compiler/GF/Grammar/lexer/Lexer.x
+++ /dev/null
@@ -1,273 +0,0 @@
-{
-module GF.Grammar.Lexer
- ( Token(..), Posn(..)
- , P, runP, lexer, getPosn, failLoc
- , isReservedWord
- ) where
-
-import GF.Infra.Ident
-import qualified Data.ByteString.Char8 as BS
-import qualified Data.Map as Map
-
-}
-
-
-$l = [a-zA-Z\192 - \255] # [\215 \247]
-$c = [A-Z\192-\221] # [\215]
-$s = [a-z\222-\255] # [\247]
-$d = [0-9] -- digit
-$i = [$l $d _ '] -- identifier character
-$u = [\0-\255] -- universal: any character
-
-@rsyms = -- symbols and non-identifier-like reserved words
- \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/
-
-:-
-"--" [.]* ; -- Toss single line comments
-"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ;
-
-$white+ ;
-@rsyms { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
-\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . BS.unpack)) }
-(\_ | $l)($l | $d | \_ | \')* { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
-
-\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . BS.unpack) }
-
-(\-)? $d+ { tok (T_Integer . read . BS.unpack) }
-(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . BS.unpack) }
-
-{
-
-tok f p s = f s
-
-data Token
- = T_exclmark
- | T_patt
- | T_int_label
- | T_oparen
- | T_cparen
- | T_tilde
- | T_star
- | T_starstar
- | T_plus
- | T_plusplus
- | T_comma
- | T_minus
- | T_rarrow
- | T_dot
- | T_alt
- | T_colon
- | T_semicolon
- | T_less
- | T_equal
- | T_big_rarrow
- | T_great
- | T_questmark
- | T_obrack
- | T_lam
- | T_lamlam
- | T_cbrack
- | T_ocurly
- | T_bar
- | T_ccurly
- | T_underscore
- | T_at
- | T_PType
- | T_Str
- | T_Strs
- | T_Tok
- | T_Type
- | T_abstract
- | T_case
- | T_cat
- | T_concrete
- | T_data
- | T_def
- | T_flags
- | T_fn
- | T_fun
- | T_in
- | T_incomplete
- | T_instance
- | T_interface
- | T_let
- | T_lin
- | T_lincat
- | T_lindef
- | T_linref
- | T_of
- | T_open
- | T_oper
- | T_param
- | T_pattern
- | T_pre
- | T_printname
- | T_resource
- | T_strs
- | T_table
- | T_transfer
- | T_variants
- | T_where
- | T_with
- | T_String String -- string literals
- | T_Integer Int -- integer literals
- | T_Double Double -- double precision float literals
- | T_Ident Ident
- | T_EOF
-
-eitherResIdent :: (BS.ByteString -> Token) -> BS.ByteString -> Token
-eitherResIdent tv s =
- case Map.lookup s resWords of
- Just t -> t
- Nothing -> tv s
-
-isReservedWord :: BS.ByteString -> Bool
-isReservedWord s = Map.member s resWords
-
-resWords = Map.fromList
- [ b "!" T_exclmark
- , b "#" T_patt
- , b "$" T_int_label
- , b "(" T_oparen
- , b ")" T_cparen
- , b "~" T_tilde
- , b "*" T_star
- , b "**" T_starstar
- , b "+" T_plus
- , b "++" T_plusplus
- , b "," T_comma
- , b "-" T_minus
- , b "->" T_rarrow
- , b "." T_dot
- , b "/" T_alt
- , b ":" T_colon
- , b ";" T_semicolon
- , b "<" T_less
- , b "=" T_equal
- , b "=>" T_big_rarrow
- , b ">" T_great
- , b "?" T_questmark
- , b "[" T_obrack
- , b "]" T_cbrack
- , b "\\" T_lam
- , b "\\\\" T_lamlam
- , b "{" T_ocurly
- , b "}" T_ccurly
- , b "|" T_bar
- , b "_" T_underscore
- , b "@" T_at
- , b "PType" T_PType
- , b "Str" T_Str
- , b "Strs" T_Strs
- , b "Tok" T_Tok
- , b "Type" T_Type
- , b "abstract" T_abstract
- , b "case" T_case
- , b "cat" T_cat
- , b "concrete" T_concrete
- , b "data" T_data
- , b "def" T_def
- , b "flags" T_flags
- , b "fn" T_fn
- , b "fun" T_fun
- , b "in" T_in
- , b "incomplete" T_incomplete
- , b "instance" T_instance
- , b "interface" T_interface
- , b "let" T_let
- , b "lin" T_lin
- , b "lincat" T_lincat
- , b "lindef" T_lindef
- , b "linref" T_linref
- , b "of" T_of
- , b "open" T_open
- , b "oper" T_oper
- , b "param" T_param
- , b "pattern" T_pattern
- , b "pre" T_pre
- , b "printname" T_printname
- , b "resource" T_resource
- , b "strs" T_strs
- , b "table" T_table
- , b "transfer" T_transfer
- , b "variants" T_variants
- , b "where" T_where
- , b "with" T_with
- ]
- where b s t = (BS.pack s, t)
-
-unescapeInitTail :: String -> String
-unescapeInitTail = unesc . tail where
- unesc s = case s of
- '\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs
- '\\':'n':cs -> '\n' : unesc cs
- '\\':'t':cs -> '\t' : unesc cs
- '"':[] -> []
- '\'':[] -> []
- c:cs -> c : unesc cs
- _ -> []
-
--------------------------------------------------------------------
--- Alex wrapper code.
--- A modified "posn" wrapper.
--------------------------------------------------------------------
-
-data Posn = Pn {-# UNPACK #-} !Int
- {-# UNPACK #-} !Int
-
-alexMove :: Posn -> Char -> Posn
-alexMove (Pn l c) '\n' = Pn (l+1) 1
-alexMove (Pn l c) _ = Pn l (c+1)
-
-alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
-alexGetChar (AI p _ s) =
- case BS.uncons s of
- Nothing -> Nothing
- Just (c,s) ->
- let p' = alexMove p c
- in p' `seq` Just (c, (AI p' c s))
-
-alexInputPrevChar :: AlexInput -> Char
-alexInputPrevChar (AI p c s) = c
-
-data AlexInput = AI {-# UNPACK #-} !Posn -- current position,
- {-# UNPACK #-} !Char -- previous char
- {-# UNPACK #-} !BS.ByteString -- current input string
-
-data ParseResult a
- = POk a
- | PFailed Posn -- The position of the error
- String -- The error message
-
-newtype P a = P { unP :: AlexInput -> ParseResult a }
-
-instance Monad P where
- return a = a `seq` (P $ \s -> POk a)
- (P m) >>= k = P $ \ s -> case m s of
- POk a -> unP (k a) s
- PFailed posn err -> PFailed posn err
- fail msg = P $ \(AI posn _ _) -> PFailed posn msg
-
-runP :: P a -> BS.ByteString -> Either (Posn,String) a
-runP (P f) txt =
- case f (AI (Pn 1 0) ' ' txt) of
- POk x -> Right x
- PFailed pos msg -> Left (pos,msg)
-
-failLoc :: Posn -> String -> P a
-failLoc pos msg = P $ \_ -> PFailed pos msg
-
-lexer :: (Token -> P a) -> P a
-lexer cont = P go
- where
- go inp@(AI pos _ str) =
- case alexScan inp 0 of
- AlexEOF -> unP (cont T_EOF) inp
- AlexError (AI pos _ _) -> PFailed pos "lexical error"
- AlexSkip inp' len -> go inp'
- AlexToken inp' len act -> unP (cont (act pos (BS.take len str))) inp'
-
-getPosn :: P Posn
-getPosn = P $ \inp@(AI pos _ _) -> POk pos
-
-}