summaryrefslogtreecommitdiff
path: root/src/compiler/GF/Grammar
diff options
context:
space:
mode:
author: hallgren <hallgren@chalmers.se> 2013-11-26 16:12:03 +0000
committer: hallgren <hallgren@chalmers.se> 2013-11-26 16:12:03 +0000
commit: 3f57151cc346be0fbf0726d4953f3529ea45e7f4 (patch)
tree: 6106269ff6ea7abb0c27d15cafdd818bb91b6bf7 /src/compiler/GF/Grammar
parent: 9d7fdf7c9a525a3b5659a566f76d26d151dcd664 (diff)
Represent identifiers as UTF-8-encoded ByteStrings
This was a fairly simple change thanks to previous work on making the Ident type abstract and the fact that PGF.CId already uses UTF-8-encoded ByteStrings. One potential pitfall is that Data.ByteString.UTF8 uses the same type for ByteStrings as Data.ByteString. I renamed ident2bs to ident2utf8 and bsCId to utf8CId, to make it clearer that they work with UTF-8-encoded ByteStrings. Since both the compiler input and identifiers are now UTF-8-encoded ByteStrings, the lexer now creates identifiers without copying any characters. ***END OF DESCRIPTION*** Place the long patch description above the ***END OF DESCRIPTION*** marker. The first line of this file will be the patch name. This patch contains the following changes: M ./src/compiler/GF/Compile/CheckGrammar.hs -3 +3 M ./src/compiler/GF/Compile/GrammarToPGF.hs -2 +2 M ./src/compiler/GF/Grammar/Binary.hs -5 +1 M ./src/compiler/GF/Grammar/Lexer.x -11 +13 M ./src/compiler/GF/Infra/Ident.hs -19 +36 M ./src/runtime/haskell/PGF.hs -1 +1 M ./src/runtime/haskell/PGF/CId.hs -2 +3
Diffstat (limited to 'src/compiler/GF/Grammar')
-rw-r--r-- src/compiler/GF/Grammar/Binary.hs 6
-rw-r--r-- src/compiler/GF/Grammar/Lexer.x 24
2 files changed, 14 insertions, 16 deletions
diff --git a/src/compiler/GF/Grammar/Binary.hs b/src/compiler/GF/Grammar/Binary.hs
index 6cd3832c0..7b4540ce5 100644
--- a/src/compiler/GF/Grammar/Binary.hs
+++ b/src/compiler/GF/Grammar/Binary.hs
@@ -30,7 +30,7 @@ import PGF.Data(Literal(..))
gfoVersion = "GF03"
instance Binary Ident where
- put id = put (ident2bs id)
+ put id = put (ident2utf8 id)
get = do bs <- get
if bs == BS.pack "_"
then return identW
@@ -295,10 +295,6 @@ instance Binary Label where
1 -> fmap LVar get
_ -> decodingError
-instance Binary RawIdent where
- put = put . rawId2bs
- get = fmap rawIdentC get
-
--putGFOVersion = mapM_ (putWord8 . fromIntegral . ord) gfoVersion
--getGFOVersion = replicateM (length gfoVersion) (fmap (chr . fromIntegral) getWord8)
--putGFOVersion = put gfoVersion
diff --git a/src/compiler/GF/Grammar/Lexer.x b/src/compiler/GF/Grammar/Lexer.x
index 60c51f814..c4f7159a2 100644
--- a/src/compiler/GF/Grammar/Lexer.x
+++ b/src/compiler/GF/Grammar/Lexer.x
@@ -33,9 +33,9 @@ $u = [.\n] -- universal: any character
"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ;
$white+ ;
-@rsyms { tok (res (T_Ident . identS)) }
-\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . unpack)) }
-(\_ | $l)($l | $d | \_ | \')* { tok (res (T_Ident . identS)) }
+@rsyms { tok ident }
+\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (res T_Ident . identS . unescapeInitTail . unpack) }
+(\_ | $l)($l | $d | \_ | \')* { tok ident }
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . unpack) }
@@ -43,10 +43,12 @@ $white+ ;
(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) }
{
---unpack = BS.unpack
-unpack = id
+unpack = UTF8.toString
+--unpack = id
-tok :: (String->Token) -> Posn -> String -> Token
+ident = res T_Ident . identC . rawIdentC
+
+--tok :: (String->Token) -> Posn -> String -> Token
tok f p s = f s
data Token
@@ -126,14 +128,14 @@ data Token
-- deriving Show -- debug
res = eitherResIdent
-eitherResIdent :: (String -> Token) -> String -> Token
+eitherResIdent :: (Ident -> Token) -> Ident -> Token
eitherResIdent tv s =
case Map.lookup s resWords of
Just t -> t
Nothing -> tv s
-isReservedWord :: BS.ByteString -> Bool
-isReservedWord s = Map.member (BS.unpack s) resWords
+isReservedWord :: Ident -> Bool
+isReservedWord ident = Map.member ident resWords
resWords = Map.fromList
[ b "!" T_exclmark
@@ -205,7 +207,7 @@ resWords = Map.fromList
, b "where" T_where
, b "with" T_with
]
- where b s t = (s, t)
+ where b s t = (identS s, t)
unescapeInitTail :: String -> String
unescapeInitTail = unesc . tail where
@@ -278,7 +280,7 @@ lexer cont = P go
AlexEOF -> unP (cont T_EOF) inp
AlexError (AI pos _ _) -> PFailed pos "lexical error"
AlexSkip inp' len -> {-trace (show len) $-} go inp'
- AlexToken inp' len act -> unP (cont (act pos (UTF8.toString (UTF8.take len str)))) inp'
+ AlexToken inp' len act -> unP (cont (act pos ({-UTF8.toString-} (UTF8.take len str)))) inp'
getPosn :: P Posn
getPosn = P $ \inp@(AI pos _ _) -> POk pos