diff options
| author | hallgren <hallgren@chalmers.se> | 2013-11-26 16:12:03 +0000 |
|---|---|---|
| committer | hallgren <hallgren@chalmers.se> | 2013-11-26 16:12:03 +0000 |
| commit | 3f57151cc346be0fbf0726d4953f3529ea45e7f4 (patch) | |
| tree | 6106269ff6ea7abb0c27d15cafdd818bb91b6bf7 /src/compiler/GF/Infra | |
| parent | 9d7fdf7c9a525a3b5659a566f76d26d151dcd664 (diff) | |
Represent identifiers as UTF-8-encoded ByteStrings
This was a fairly simple change thanks to previous work on making the Ident
type abstract and the fact that PGF.CId already uses UTF-8-encoded
ByteStrings.
One potential pitfall is that Data.ByteString.UTF8 uses the same type for
ByteStrings as Data.ByteString, so the type checker cannot distinguish
UTF-8-encoded ByteStrings from other ones. I renamed ident2bs to ident2utf8
and bsCId to utf8CId, to make it clearer that they work with UTF-8-encoded
ByteStrings.
Since both the compiler input and identifiers are now UTF-8-encoded
ByteStrings, the lexer now creates identifiers without copying any characters.
***END OF DESCRIPTION***
Place the long patch description above the ***END OF DESCRIPTION*** marker.
The first line of this file will be the patch name.
This patch contains the following changes:
M ./src/compiler/GF/Compile/CheckGrammar.hs -3 +3
M ./src/compiler/GF/Compile/GrammarToPGF.hs -2 +2
M ./src/compiler/GF/Grammar/Binary.hs -5 +1
M ./src/compiler/GF/Grammar/Lexer.x -11 +13
M ./src/compiler/GF/Infra/Ident.hs -19 +36
M ./src/runtime/haskell/PGF.hs -1 +1
M ./src/runtime/haskell/PGF/CId.hs -2 +3
Diffstat (limited to 'src/compiler/GF/Infra')
| -rw-r--r-- | src/compiler/GF/Infra/Ident.hs | 55 |
1 files changed, 36 insertions, 19 deletions
diff --git a/src/compiler/GF/Infra/Ident.hs b/src/compiler/GF/Infra/Ident.hs index 4792852dd..a5874b744 100644 --- a/src/compiler/GF/Infra/Ident.hs +++ b/src/compiler/GF/Infra/Ident.hs @@ -13,20 +13,24 @@ ----------------------------------------------------------------------------- module GF.Infra.Ident (-- * Identifiers - Ident, ident2bs, showIdent, ppIdent, prefixIdent, + Ident, ident2utf8, showIdent, ppIdent, prefixIdent, identS, identC, identV, identA, identAV, identW, argIdent, isArgIdent, getArgIndex, varStr, varX, isWildIdent, varIndex, -- * Raw Identifiers RawIdent, rawIdentS, rawIdentC, ident2raw, prefixRawIdent, - isPrefixOf, showRawIdent, rawId2bs{-, + isPrefixOf, showRawIdent{-, -- * Refreshing identifiers IdState, initIdStateN, initIdState, lookVar, refVar, refVarPlus-} ) where -import qualified Data.ByteString.Char8 as BS +import qualified Data.ByteString.UTF8 as UTF8 +import qualified Data.ByteString.Char8 as BS(append,isPrefixOf) + -- Limit use of BS functions to the ones that work correctly on + -- UTF-8-encoded bytestrings! import Data.Char(isDigit) +import Data.Binary(Binary(..)) import Text.PrettyPrint(Doc,text) @@ -41,31 +45,41 @@ data Ident = | IA {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int -- ^ /INTERNAL/ argument of cat at position | IAV {-# UNPACK #-} !RawIdent {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ /INTERNAL/ argument of cat with bindings at position -- - deriving (Eq, Ord, Show, Read) -newtype RawIdent = Id { rawId2bs :: BS.ByteString } +-- | Identifiers are stored as UTF-8-encoded bytestrings. +newtype RawIdent = Id { rawId2utf8 :: UTF8.ByteString } deriving (Eq, Ord, Show, Read) -rawIdentS = Id . BS.pack +pack = UTF8.fromString +unpack = UTF8.toString + +rawIdentS = Id . pack rawIdentC = Id -showRawIdent = BS.unpack . rawId2bs +showRawIdent = unpack . 
rawId2utf8 prefixRawIdent (Id x) (Id y) = Id (BS.append x y) isPrefixOf (Id x) (Id y) = BS.isPrefixOf x y -ident2bs :: Ident -> BS.ByteString -ident2bs i = case i of +instance Binary RawIdent where + put = put . rawId2utf8 + get = fmap rawIdentC get + + +-- | This function should be used with care, since the returned ByteString is +-- UTF-8-encoded. +ident2utf8 :: Ident -> UTF8.ByteString +ident2utf8 i = case i of IC (Id s) -> s - IV (Id s) n -> BS.append s (BS.pack ('_':show n)) - IA (Id s) j -> BS.append s (BS.pack ('_':show j)) - IAV (Id s) b j -> BS.append s (BS.pack ('_':show b ++ '_':show j)) - IW -> BS.pack "_" + IV (Id s) n -> BS.append s (pack ('_':show n)) + IA (Id s) j -> BS.append s (pack ('_':show j)) + IAV (Id s) b j -> BS.append s (pack ('_':show b ++ '_':show j)) + IW -> pack "_" -ident2raw = Id . ident2bs +ident2raw = Id . ident2utf8 showIdent :: Ident -> String -showIdent i = BS.unpack $! ident2bs i +showIdent i = unpack $! ident2utf8 i ppIdent :: Ident -> Doc ppIdent = text . showIdent @@ -83,7 +97,7 @@ identW :: Ident prefixIdent :: String -> Ident -> Ident -prefixIdent pref = identC . Id . BS.append (BS.pack pref) . ident2bs +prefixIdent pref = identC . Id . BS.append (pack pref) . ident2utf8 -- normal identifier -- ident s = IC s @@ -99,8 +113,11 @@ isArgIdent _ = False getArgIndex (IA _ i) = Just i getArgIndex (IAV _ _ i) = Just i -getArgIndex (IC (Id s)) - | isDigit (BS.last s) = (Just . read . BS.unpack . snd . BS.spanEnd isDigit) s +getArgIndex (IC (Id bs)) + | isDigit c = + -- (Just . read . unpack . snd . BS.spanEnd isDigit) bs -- not ok with UTF-8 + (Just . read . reverse . takeWhile isDigit) s + where s@(c:_) = reverse (unpack bs) getArgIndex x = Nothing -- | used in lin defaults @@ -117,7 +134,7 @@ isWildIdent x = case x of IC s | s == wild -> True _ -> False -wild = Id (BS.pack "_") +wild = Id (pack "_") varIndex :: Ident -> Int varIndex (IV _ n) = n |
