summaryrefslogtreecommitdiff
path: root/src/compiler/GF/Compile
diff options
context:
space:
mode:
authorhallgren <hallgren@chalmers.se>2013-11-25 21:12:11 +0000
committerhallgren <hallgren@chalmers.se>2013-11-25 21:12:11 +0000
commit9d7fdf7c9a525a3b5659a566f76d26d151dcd664 (patch)
tree9ea97377d9938fc382c2036fa4c8fef9c33e33d8 /src/compiler/GF/Compile
parent3210a506484864430504ed1caf2f547bb674e701 (diff)
Change how GF deals with character encodings in grammar files
1. The default encoding is changed from Latin-1 to UTF-8. 2. Alternate encodings should be specified as "--# -coding=enc", the old "flags coding=enc" declarations have no effect but are still checked for consistency. 3. A transitional warning is generated for files that contain non-ASCII characters without specifying a character encoding: "Warning: default encoding has changed from Latin-1 to UTF-8" 4. Conversion to Unicode is now done *before* lexing. This makes it possible to allow arbitrary Unicode characters in identifiers. But identifiers are still stored as ByteStrings, so they are limited to Latin-1 characters for now. 5. Lexer.hs is no longer part of the repository. We now generate the lexer from Lexer.x with alex>=3. Some workarounds for bugs in alex-3.0 were needed. These bugs might already be fixed in newer versions of alex, but we should be compatible with what is shipped in the Haskell Platform.
Diffstat (limited to 'src/compiler/GF/Compile')
-rw-r--r--src/compiler/GF/Compile/GetGrammar.hs59
-rw-r--r--src/compiler/GF/Compile/ReadFiles.hs69
2 files changed, 94 insertions, 34 deletions
diff --git a/src/compiler/GF/Compile/GetGrammar.hs b/src/compiler/GF/Compile/GetGrammar.hs
index 2f40d0242..10a857bf9 100644
--- a/src/compiler/GF/Compile/GetGrammar.hs
+++ b/src/compiler/GF/Compile/GetGrammar.hs
@@ -18,40 +18,61 @@ import Prelude hiding (catch)
import GF.Data.Operations
-import GF.System.Catch
+--import GF.System.Catch
import GF.Infra.UseIO
-import GF.Infra.Option(Options,optPreprocessors,addOptions,optEncoding,flag,renameEncoding)
+import GF.Infra.Option(Options,optPreprocessors,addOptions,renameEncoding,optEncoding,flag,defaultEncoding)
import GF.Grammar.Lexer
import GF.Grammar.Parser
import GF.Grammar.Grammar
-import GF.Compile.Coding
+--import GF.Compile.Coding
+import GF.Compile.ReadFiles(parseSource,lift)
+--import GF.Text.Coding(decodeUnicodeIO)
import qualified Data.ByteString.Char8 as BS
-import Control.Monad (foldM)
+import Data.Char(isAscii)
+import Control.Monad (foldM,when,unless)
import System.Cmd (system)
-import System.IO(mkTextEncoding)
+--import System.IO(mkTextEncoding) --,utf8
import System.Directory(removeFile)
getSourceModule :: Options -> FilePath -> IOE SourceModule
-getSourceModule opts file0 = ioe $
- do tmp <- foldM runPreprocessor (Source file0) (flag optPreprocessors opts)
- content <- keepTemp tmp
- case runP pModDef content of
- Left (Pn l c,msg) -> do file <- writeTemp tmp
+getSourceModule opts file0 =
+ errIn file0 $
+ do tmp <- lift $ foldM runPreprocessor (Source file0) (flag optPreprocessors opts)
+ raw <- lift $ keepTemp tmp
+ --ePutStrLn $ "1 "++file0
+ (optCoding,parsed) <- parseSource opts pModDef raw
+ case parsed of
+ Left (Pn l c,msg) -> do file <- lift $ writeTemp tmp
let location = file++":"++show l++":"++show c
- return (Bad (location++":\n "++msg))
- Right (i,mi00) ->
- do removeTemp tmp
- let mi0 =mi00 {mflags=mflags mi00 `addOptions` opts, msrc=file0}
- mi <- transcodeModule (i,mi0)
- return (Ok mi)
- `catch` (return . Bad . show)
-
+ raise (location++":\n "++msg)
+ Right (i,mi0) ->
+ do lift $ removeTemp tmp
+ let mi =mi0 {mflags=mflags mi0 `addOptions` opts, msrc=file0}
+ optCoding' = renameEncoding `fmap` flag optEncoding (mflags mi0)
+ case (optCoding,optCoding') of
+ (Nothing,Nothing) ->
+ unless (BS.all isAscii raw) $
+ ePutStrLn $ file0++":\n Warning: default encoding has changed from Latin-1 to UTF-8"
+ (_,Just coding') ->
+ when (coding/=coding') $
+ raise $ "Encoding mismatch: "++coding++" /= "++coding'
+ where coding = maybe defaultEncoding renameEncoding optCoding
+ _ -> return ()
+ --lift $ transcodeModule' (i,mi) -- old lexer
+ return (i,mi) -- new lexer
+
+{-
transcodeModule sm00 =
- do enc <- mkTextEncoding (renameEncoding (flag optEncoding (mflags (snd sm00))))
+ do enc <- mkTextEncoding (getEncoding (mflags (snd sm00)))
let sm = decodeStringsInModule enc sm00
return sm
+transcodeModule' sm00 =
+ do let enc = utf8
+ let sm = decodeStringsInModule enc sm00
+ return sm
+-}
runPreprocessor :: Temporary -> String -> IO Temporary
runPreprocessor tmp0 p =
diff --git a/src/compiler/GF/Compile/ReadFiles.hs b/src/compiler/GF/Compile/ReadFiles.hs
index 5e65dcba6..70b0d6ee6 100644
--- a/src/compiler/GF/Compile/ReadFiles.hs
+++ b/src/compiler/GF/Compile/ReadFiles.hs
@@ -21,7 +21,8 @@
module GF.Compile.ReadFiles
( getAllFiles,ModName,ModEnv,importsOfModule,
gfoFile,gfFile,isGFO,gf2gfo,
- getOptionsFromFile) where
+ parseSource,lift,
+ getOptionsFromFile,getPragmas) where
import Prelude hiding (catch)
import GF.System.Catch
@@ -34,6 +35,10 @@ import GF.Grammar.Parser
import GF.Grammar.Grammar
import GF.Grammar.Binary
+import System.IO(mkTextEncoding)
+import qualified Data.ByteString.UTF8 as UTF8
+import GF.Text.Coding(decodeUnicodeIO)
+
import Control.Monad
import Data.Maybe(isJust)
import qualified Data.ByteString.Char8 as BS
@@ -50,7 +55,7 @@ type ModEnv = Map.Map ModName (UTCTime,[ModName])
-- | Returns a list of all files to be compiled in topological order i.e.
-- the low level (leaf) modules are first.
-getAllFiles :: (MonadIO m,ErrorMonad m) => Options -> [InitPath] -> ModEnv -> FileName -> m [FullPath]
+--getAllFiles :: (MonadIO m,ErrorMonad m) => Options -> [InitPath] -> ModEnv -> FileName -> m [FullPath]
getAllFiles opts ps env file = do
-- read module headers from all files recursively
ds <- liftM reverse $ get [] [] (justModuleName file)
@@ -117,14 +122,10 @@ getAllFiles opts ps env file = do
Just mo -> return (st,importsOfModule mo)
Nothing
| isGFO file -> raise (file ++ " is compiled with different GF version and I can't find the source file")
- | otherwise -> do s <- liftIO $ BS.readFile file
- case runP pModHeader s of
- Left (Pn l c,msg) -> raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
- Right mo -> return (CSComp,importsOfModule mo)
- CSComp -> do s <- liftIO $ BS.readFile file
- case runP pModHeader s of
- Left (Pn l c,msg) -> raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
- Right mo -> return (st,importsOfModule mo)
+ | otherwise -> do mo <- parseModHeader opts file
+ return (CSComp,importsOfModule mo)
+ CSComp -> do mo <- parseModHeader opts file
+ return (st,importsOfModule mo)
testErr (mname == name)
("module name" +++ mname +++ "differs from file name" +++ name)
return (name,st,t,isJust gfTime,imps,dropFileName file)
@@ -209,17 +210,55 @@ importsOfModule (m,mi) = (modName m,depModInfo mi [])
modName = showIdent
+
+parseModHeader opts file =
+ do --ePutStrLn file
+ (_,parsed) <- parseSource opts pModHeader =<< lift (BS.readFile file)
+ case parsed of
+ Right mo -> return mo
+ Left (Pn l c,msg) ->
+ raise (file ++ ":" ++ show l ++ ":" ++ show c ++ ": " ++ msg)
+
+parseSource opts p raw =
+ do (coding,utf8) <- toUTF8 opts raw
+ return (coding,runP p utf8)
+
+toUTF8 opts0 raw =
+ do opts <- getPragmas raw
+ let given = flag optEncoding opts -- explicitly given encoding
+ coding = getEncoding $ opts0 `addOptions` opts
+ utf8 <- if coding=="UTF-8"
+ then return raw
+ else lift $ do --ePutStrLn $ "toUTF8 from "++coding
+ enc <- mkTextEncoding coding
+ -- decodeUnicodeIO uses a lot of stack space,
+ -- so we need to split the file into smaller pieces
+ ls <- mapM (decodeUnicodeIO enc) (BS.lines raw)
+ return $ UTF8.fromString (unlines ls)
+ return (given,utf8)
+
+--lift io = ioe (fmap Ok io `catch` (return . Bad . show))
+lift io = liftIO io
+
-- | options can be passed to the compiler by comments in @--#@, in the main file
getOptionsFromFile :: (MonadIO m,ErrorMonad m) => FilePath -> m Options
getOptionsFromFile file = do
s <- either (\_ -> raise $ "File " ++ file ++ " does not exist") return =<<
liftIO (try $ BS.readFile file)
- let ls = filter (BS.isPrefixOf (BS.pack "--#")) $ BS.lines s
- fs = map (BS.unpack . BS.unwords . BS.words . BS.drop 3) ls
- parseModuleOptions fs
+ opts <- getPragmas s
+ -- The coding flag should not be inherited by other files
+ return (addOptions opts (modifyFlags $ \ f -> f{optEncoding=Nothing}))
+
+
+getPragmas :: (ErrorMonad m) => BS.ByteString -> m Options
+getPragmas = parseModuleOptions .
+ map (BS.unpack . BS.unwords . BS.words . BS.drop 3) .
+ filter (BS.isPrefixOf (BS.pack "--#")) . BS.lines
-getFilePath :: MonadIO m => [FilePath] -> String -> m (Maybe FilePath)
-getFilePath paths file = liftIO $ get paths
+--getFilePath :: MonadIO m => [FilePath] -> String -> m (Maybe FilePath)
+getFilePath paths file =
+ liftIO $ do --ePutStrLn $ "getFilePath "++show paths++" "++show file
+ get paths
where
get [] = return Nothing
get (p:ps) = do