Change how GF deals with character encodings in grammar files

1. The default encoding is changed from Latin-1 to UTF-8. 2. Alternate encodings should be specified as "--# -coding=enc"; the old "flags coding=enc" declarations have no effect but are still checked for consistency. 3. A transitional warning is generated for files that contain non-ASCII characters without specifying a character encoding: "Warning: default encoding has changed from Latin-1 to UTF-8". 4. Conversion to Unicode is now done *before* lexing. This makes it possible to allow arbitrary Unicode characters in identifiers, but identifiers are still stored as ByteStrings, so they are limited to Latin-1 characters for now. 5. Lexer.hs is no longer part of the repository. We now generate the lexer from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were needed. These bugs might already be fixed in newer versions of alex, but we should be compatible with what is shipped in the Haskell Platform.
author: hallgren <hallgren@chalmers.se> 2013-11-25 21:12:11 +0000
committer: hallgren <hallgren@chalmers.se> 2013-11-25 21:12:11 +0000
commit: 9d7fdf7c9a525a3b5659a566f76d26d151dcd664 (patch)
tree: 9ea97377d9938fc382c2036fa4c8fef9c33e33d8 /src/compiler/GF/Grammar/lexer
parent: 3210a506484864430504ed1caf2f547bb674e701 (diff)
1 files changed, 0 insertions, 273 deletions
diff --git a/src/compiler/GF/Grammar/lexer/Lexer.x b/src/compiler/GF/Grammar/lexer/Lexer.x
deleted file mode 100644
index 460e7f452..000000000
--- a/src/compiler/GF/Grammar/lexer/Lexer.x
+++ /dev/null
@@ -1,273 +0,0 @@
-{
-module GF.Grammar.Lexer
-         ( Token(..), Posn(..)
-         , P, runP, lexer, getPosn, failLoc
-         , isReservedWord
-         ) where
-
-import GF.Infra.Ident
-import qualified Data.ByteString.Char8 as BS
-import qualified Data.Map as Map
-
-}
-
-
-$l = [a-zA-Z\192 - \255] # [\215 \247]
-$c = [A-Z\192-\221] # [\215]
-$s = [a-z\222-\255] # [\247]
-$d = [0-9]                -- digit
-$i = [$l $d _ ']          -- identifier character
-$u = [\0-\255]            -- universal: any character
-
-@rsyms =    -- symbols and non-identifier-like reserved words
-   \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/
-
-:-
-"--" [.]* ; -- Toss single line comments
-"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ; 
-
-$white+ ;
-@rsyms                          { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
-\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . BS.unpack)) }
-(\_ | $l)($l | $d | \_ | \')*   { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
-
-\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . BS.unpack) }
-
-(\-)? $d+                       { tok (T_Integer . read . BS.unpack) }
-(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double  . read . BS.unpack) }
-
-{
-
-tok f p s = f s
-
-data Token
- = T_exclmark
- | T_patt
- | T_int_label
- | T_oparen
- | T_cparen
- | T_tilde
- | T_star
- | T_starstar
- | T_plus
- | T_plusplus
- | T_comma
- | T_minus
- | T_rarrow
- | T_dot
- | T_alt
- | T_colon
- | T_semicolon
- | T_less
- | T_equal
- | T_big_rarrow
- | T_great
- | T_questmark
- | T_obrack
- | T_lam
- | T_lamlam
- | T_cbrack
- | T_ocurly
- | T_bar
- | T_ccurly
- | T_underscore
- | T_at
- | T_PType
- | T_Str
- | T_Strs
- | T_Tok
- | T_Type
- | T_abstract
- | T_case
- | T_cat
- | T_concrete
- | T_data
- | T_def
- | T_flags
- | T_fn
- | T_fun
- | T_in
- | T_incomplete
- | T_instance
- | T_interface
- | T_let
- | T_lin
- | T_lincat
- | T_lindef
- | T_linref
- | T_of
- | T_open
- | T_oper
- | T_param
- | T_pattern
- | T_pre
- | T_printname
- | T_resource
- | T_strs
- | T_table
- | T_transfer
- | T_variants
- | T_where
- | T_with
- | T_String  String          -- string literals
- | T_Integer Int             -- integer literals
- | T_Double  Double          -- double precision float literals
- | T_Ident   Ident
- | T_EOF
-
-eitherResIdent :: (BS.ByteString -> Token) -> BS.ByteString -> Token
-eitherResIdent tv s = 
-  case Map.lookup s resWords of
-    Just t  -> t
-    Nothing -> tv s
-
-isReservedWord :: BS.ByteString -> Bool
-isReservedWord s = Map.member s resWords
-
-resWords = Map.fromList
- [ b "!"  T_exclmark
- , b "#"  T_patt
- , b "$"  T_int_label
- , b "("  T_oparen
- , b ")"  T_cparen
- , b "~"  T_tilde
- , b "*"  T_star
- , b "**" T_starstar
- , b "+"  T_plus
- , b "++" T_plusplus
- , b ","  T_comma
- , b "-"  T_minus
- , b "->" T_rarrow
- , b "."  T_dot
- , b "/"  T_alt
- , b ":"  T_colon
- , b ";"  T_semicolon
- , b "<"  T_less
- , b "="  T_equal
- , b "=>" T_big_rarrow
- , b ">"  T_great
- , b "?"  T_questmark
- , b "["  T_obrack
- , b "]"  T_cbrack
- , b "\\" T_lam
- , b "\\\\" T_lamlam
- , b "{"  T_ocurly
- , b "}"  T_ccurly
- , b "|"  T_bar
- , b "_"  T_underscore
- , b "@"  T_at
- , b "PType"      T_PType
- , b "Str"        T_Str
- , b "Strs"       T_Strs
- , b "Tok"        T_Tok
- , b "Type"       T_Type
- , b "abstract"   T_abstract
- , b "case"       T_case
- , b "cat"        T_cat
- , b "concrete"   T_concrete
- , b "data"       T_data
- , b "def"        T_def
- , b "flags"      T_flags
- , b "fn"         T_fn
- , b "fun"        T_fun
- , b "in"         T_in
- , b "incomplete" T_incomplete
- , b "instance"   T_instance
- , b "interface"  T_interface
- , b "let"        T_let
- , b "lin"        T_lin
- , b "lincat"     T_lincat
- , b "lindef"     T_lindef
- , b "linref"     T_linref
- , b "of"         T_of
- , b "open"       T_open
- , b "oper"       T_oper
- , b "param"      T_param
- , b "pattern"    T_pattern
- , b "pre"        T_pre
- , b "printname"  T_printname
- , b "resource"   T_resource
- , b "strs"       T_strs
- , b "table"      T_table
- , b "transfer"   T_transfer
- , b "variants"   T_variants
- , b "where"      T_where
- , b "with"       T_with
- ]
- where b s t = (BS.pack s, t)
-
-unescapeInitTail :: String -> String
-unescapeInitTail = unesc . tail where
-  unesc s = case s of
-    '\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs
-    '\\':'n':cs  -> '\n' : unesc cs
-    '\\':'t':cs  -> '\t' : unesc cs
-    '"':[]    -> []
-    '\'':[]    -> []
-    c:cs      -> c : unesc cs
-    _         -> []
-
--------------------------------------------------------------------
--- Alex wrapper code.
--- A modified "posn" wrapper.
--------------------------------------------------------------------
-
-data Posn = Pn {-# UNPACK #-} !Int
-               {-# UNPACK #-} !Int
-
-alexMove :: Posn -> Char -> Posn
-alexMove (Pn l c) '\n' = Pn (l+1) 1
-alexMove (Pn l c) _    = Pn l     (c+1)
-
-alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
-alexGetChar (AI p _ s) =
-  case BS.uncons s of
-    Nothing  -> Nothing
-    Just (c,s) ->
-             let p' = alexMove p c
-              in p' `seq` Just (c, (AI p' c s))
-
-alexInputPrevChar :: AlexInput -> Char
-alexInputPrevChar (AI p c s) = c
-
-data AlexInput = AI {-# UNPACK #-} !Posn            -- current position,
-                    {-# UNPACK #-} !Char            -- previous char
-                    {-# UNPACK #-} !BS.ByteString   -- current input string
-
-data ParseResult a
-  = POk a
-  | PFailed Posn	-- The position of the error
-            String      -- The error message
-
-newtype P a = P { unP :: AlexInput -> ParseResult a }
-
-instance Monad P where
-  return a    = a `seq` (P $ \s -> POk a)
-  (P m) >>= k = P $ \ s -> case m s of
-                             POk a         -> unP (k a) s
-                             PFailed posn err -> PFailed posn err
-  fail msg    = P $ \(AI posn _ _) -> PFailed posn msg
-
-runP :: P a -> BS.ByteString -> Either (Posn,String) a
-runP (P f) txt =
-  case f (AI (Pn 1 0) ' ' txt) of
-    POk x           -> Right x
-    PFailed pos msg -> Left  (pos,msg)
-
-failLoc :: Posn -> String -> P a
-failLoc pos msg = P $ \_ -> PFailed pos msg
-
-lexer :: (Token -> P a) -> P a
-lexer cont = P go
-  where
-    go inp@(AI pos _ str) =
-      case alexScan inp 0 of
-        AlexEOF                -> unP (cont T_EOF) inp
-        AlexError (AI pos _ _) -> PFailed pos "lexical error"
-        AlexSkip  inp' len     -> go inp'
-        AlexToken inp' len act -> unP (cont (act pos (BS.take len str))) inp'
-
-getPosn :: P Posn
-getPosn = P $ \inp@(AI pos _ _) -> POk pos
-
-}
author	hallgren <hallgren@chalmers.se>	2013-11-25 21:12:11 +0000
committer	hallgren <hallgren@chalmers.se>	2013-11-25 21:12:11 +0000
commit	9d7fdf7c9a525a3b5659a566f76d26d151dcd664 (patch)
tree	9ea97377d9938fc382c2036fa4c8fef9c33e33d8 /src/compiler/GF/Grammar/lexer
parent	3210a506484864430504ed1caf2f547bb674e701 (diff)