summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorkrasimir <krasimir@chalmers.se>2016-05-11 07:11:45 +0000
committerkrasimir <krasimir@chalmers.se>2016-05-11 07:11:45 +0000
commit9abc6aaddeb892015015f9f02d5a790701f1c8a2 (patch)
tree1be6993dc30c96508ac38b008dd94e87adccdacb
parent403e080273176098a4b9eabfad03e78ccaed4b7a (diff)
many of the uses of peekCString and withCString in the Haskell binding were incorrect since they encode the string in the system locale while the C runtime is always using UTF8
-rw-r--r--src/runtime/haskell-bind/PGF2.hsc120
-rw-r--r--src/runtime/haskell-bind/PGF2/FFI.hs39
2 files changed, 99 insertions, 60 deletions
diff --git a/src/runtime/haskell-bind/PGF2.hsc b/src/runtime/haskell-bind/PGF2.hsc
index 96677d3bd..af310b17f 100644
--- a/src/runtime/haskell-bind/PGF2.hsc
+++ b/src/runtime/haskell-bind/PGF2.hsc
@@ -99,7 +99,7 @@ languages p =
getLanguages :: IORef (Map.Map String Concr) -> MapItorCallback
getLanguages ref itor key value exn = do
langs <- readIORef ref
- name <- peekCString (castPtr key)
+ name <- peekUtf8CString (castPtr key)
concr <- fmap (\ptr -> Concr ptr p) $ peek (castPtr value)
writeIORef ref $! Map.insert name concr langs
@@ -108,18 +108,18 @@ generateAll p cat =
unsafePerformIO $
do genPl <- gu_new_pool
exprPl <- gu_new_pool
- enum <- withCString cat $ \cat -> do
- exn <- gu_new_exn genPl
- pgf_generate_all (pgf p) cat exn genPl exprPl
+ cat <- newUtf8CString cat genPl
+ exn <- gu_new_exn genPl
+ enum <- pgf_generate_all (pgf p) cat exn genPl exprPl
genFPl <- newForeignPtr gu_pool_finalizer genPl
exprFPl <- newForeignPtr gu_pool_finalizer exprPl
fromPgfExprEnum enum genFPl (p,exprFPl)
abstractName :: PGF -> AbsName
-abstractName p = unsafePerformIO (peekCString =<< pgf_abstract_name (pgf p))
+abstractName p = unsafePerformIO (peekUtf8CString =<< pgf_abstract_name (pgf p))
startCat :: PGF -> Cat
-startCat p = unsafePerformIO (peekCString =<< pgf_start_cat (pgf p))
+startCat p = unsafePerformIO (peekUtf8CString =<< pgf_start_cat (pgf p))
loadConcr :: Concr -> FilePath -> IO ()
loadConcr c fpath =
@@ -146,12 +146,13 @@ unloadConcr c = pgf_concrete_unload (concr c)
functionType :: PGF -> CId -> Type
functionType p fn =
unsafePerformIO $
- withCString fn $ \c_fn -> do
+ withGuPool $ \tmpPl -> do
+ c_fn <- newUtf8CString fn tmpPl
c_type <- pgf_function_type (pgf p) c_fn
peekType c_type
where
peekType c_type = do
- cid <- (#peek PgfType, cid) c_type >>= peekCString
+ cid <- (#peek PgfType, cid) c_type >>= peekUtf8CString
c_hypos <- (#peek PgfType, hypos) c_type
n_hypos <- (#peek GuSeq, len) c_hypos
hs <- peekHypos (c_hypos `plusPtr` (#offset GuSeq, data)) 0 n_hypos
@@ -161,7 +162,7 @@ functionType p fn =
peekHypos :: Ptr a -> Int -> Int -> IO [Hypo]
peekHypos c_hypo i n
- | i < n = do cid <- (#peek PgfHypo, cid) c_hypo >>= peekCString
+ | i < n = do cid <- (#peek PgfHypo, cid) c_hypo >>= peekUtf8CString
ty <- (#peek PgfHypo, type) c_hypo >>= peekType
bt <- fmap toBindType ((#peek PgfHypo, bind_type) c_hypo)
hs <- peekHypos (plusPtr c_hypo (#size PgfHypo)) (i+1) n
@@ -190,7 +191,7 @@ graphvizAbstractTree p e =
exn <- gu_new_exn tmpPl
pgf_graphviz_abstract_tree (pgf p) (expr e) out exn
s <- gu_string_buf_freeze sb tmpPl
- peekCString s
+ peekUtf8CString s
graphvizParseTree :: Concr -> Expr -> String
@@ -201,7 +202,7 @@ graphvizParseTree c e =
exn <- gu_new_exn tmpPl
pgf_graphviz_parse_tree (concr c) (expr e) out exn
s <- gu_string_buf_freeze sb tmpPl
- peekCString s
+ peekUtf8CString s
-----------------------------------------------------------------------------
-- Functions using Concr
@@ -210,15 +211,17 @@ graphvizParseTree c e =
type MorphoAnalysis = (Fun,String,Float)
lookupMorpho :: Concr -> String -> [MorphoAnalysis]
-lookupMorpho (Concr concr master) sent = unsafePerformIO $
- do ref <- newIORef []
- allocaBytes (#size PgfMorphoCallback) $ \cback ->
- do fptr <- wrapLookupMorphoCallback (getAnalysis ref)
- (#poke PgfMorphoCallback, callback) cback fptr
- withCString sent $ \c_sent ->
- pgf_lookup_morpho concr c_sent cback nullPtr
- freeHaskellFunPtr fptr
- readIORef ref
+lookupMorpho (Concr concr master) sent =
+ unsafePerformIO $
+ withGuPool $ \tmpPl -> do
+ ref <- newIORef []
+ cback <- gu_malloc tmpPl (#size PgfMorphoCallback)
+ fptr <- wrapLookupMorphoCallback (getAnalysis ref)
+ (#poke PgfMorphoCallback, callback) cback fptr
+ c_sent <- newUtf8CString sent tmpPl
+ pgf_lookup_morpho concr c_sent cback nullPtr
+ freeHaskellFunPtr fptr
+ readIORef ref
fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])]
fullFormLexicon lang =
@@ -237,7 +240,7 @@ fullFormLexicon lang =
if ffEntry == nullPtr
then do finalizeForeignPtr fpl
return []
- else do tok <- peekCString =<< pgf_fullform_get_string ffEntry
+ else do tok <- peekUtf8CString =<< pgf_fullform_get_string ffEntry
ref <- newIORef []
allocaBytes (#size PgfMorphoCallback) $ \cback ->
do fptr <- wrapLookupMorphoCallback (getAnalysis ref)
@@ -250,8 +253,8 @@ fullFormLexicon lang =
getAnalysis :: IORef [MorphoAnalysis] -> LookupMorphoCallback
getAnalysis ref self c_lemma c_anal prob exn = do
ans <- readIORef ref
- lemma <- peekCString c_lemma
- anal <- peekCString c_anal
+ lemma <- peekUtf8CString c_lemma
+ anal <- peekUtf8CString c_anal
writeIORef ref ((lemma, anal, prob):ans)
parse :: Concr -> Cat -> String -> Either String [(Expr,Float)]
@@ -274,26 +277,26 @@ parseWithHeuristics :: Concr -- ^ the language with which we parse
-> Either String [(Expr,Float)]
parseWithHeuristics lang cat sent heuristic callbacks =
unsafePerformIO $
- do parsePl <- gu_new_pool
- exprPl <- gu_new_pool
+ do exprPl <- gu_new_pool
+ parsePl <- gu_new_pool
exn <- gu_new_exn parsePl
- enum <- withCString cat $ \cat ->
- withCString sent $ \sent -> do
- callbacks_map <- mkCallbacksMap (concr lang) callbacks parsePl
- pgf_parse_with_heuristics (concr lang) cat sent heuristic callbacks_map exn parsePl exprPl
+ cat <- newUtf8CString cat parsePl
+ sent <- newUtf8CString sent parsePl
+ callbacks_map <- mkCallbacksMap (concr lang) callbacks parsePl
+ enum <- pgf_parse_with_heuristics (concr lang) cat sent heuristic callbacks_map exn parsePl exprPl
failed <- gu_exn_is_raised exn
if failed
then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError
if is_parse_error
then do c_tok <- (#peek GuExn, data.data) exn
- tok <- peekCString c_tok
+ tok <- peekUtf8CString c_tok
gu_pool_free parsePl
gu_pool_free exprPl
return (Left tok)
else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
- msg <- peekCString c_msg
+ msg <- peekUtf8CString c_msg
gu_pool_free parsePl
gu_pool_free exprPl
throwIO (PGFError msg)
@@ -308,15 +311,15 @@ parseWithHeuristics lang cat sent heuristic callbacks =
mkCallbacksMap :: Ptr PgfConcr -> [(String, Int -> String -> Int -> Maybe (Expr,Float,Int))] -> Ptr GuPool -> IO (Ptr PgfCallbacksMap)
mkCallbacksMap concr callbacks pool = do
callbacks_map <- pgf_new_callbacks_map concr pool
- forM_ callbacks $ \(cat,match) ->
- withCString cat $ \ccat -> do
- match <- wrapLiteralMatchCallback (match_callback match)
- predict <- wrapLiteralPredictCallback predict_callback
- hspgf_callbacks_map_add_literal concr callbacks_map ccat match predict pool
+ forM_ callbacks $ \(cat,match) -> do
+ ccat <- newUtf8CString cat pool
+ match <- wrapLiteralMatchCallback (match_callback match)
+ predict <- wrapLiteralPredictCallback predict_callback
+ hspgf_callbacks_map_add_literal concr callbacks_map ccat match predict pool
return callbacks_map
where
match_callback match _ clin_idx csentence poffset out_pool = do
- sentence <- peekCString csentence
+ sentence <- peekUtf8CString csentence
coffset <- peek poffset
case match (fromIntegral clin_idx) sentence (fromIntegral coffset) of
Nothing -> return nullPtr
@@ -361,26 +364,26 @@ parseWithOracle lang cat sent (predict,complete,literal) =
do parsePl <- gu_new_pool
exprPl <- gu_new_pool
exn <- gu_new_exn parsePl
- enum <- withCString cat $ \cat ->
- withCString sent $ \sent -> do
- predictPtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) predict
- completePtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) complete
- literalPtr <- maybe (return nullFunPtr) (wrapOracleLiteralCallback . oracleLiteralWrapper) literal
- cback <- hspgf_new_oracle_callback sent predictPtr completePtr literalPtr parsePl
- pgf_parse_with_oracle (concr lang) cat sent cback exn parsePl exprPl
+ cat <- newUtf8CString cat parsePl
+ sent <- newUtf8CString sent parsePl
+ predictPtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) predict
+ completePtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) complete
+ literalPtr <- maybe (return nullFunPtr) (wrapOracleLiteralCallback . oracleLiteralWrapper) literal
+ cback <- hspgf_new_oracle_callback sent predictPtr completePtr literalPtr parsePl
+ enum <- pgf_parse_with_oracle (concr lang) cat sent cback exn parsePl exprPl
failed <- gu_exn_is_raised exn
if failed
then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError
if is_parse_error
then do c_tok <- (#peek GuExn, data.data) exn
- tok <- peekCString c_tok
+ tok <- peekUtf8CString c_tok
gu_pool_free parsePl
gu_pool_free exprPl
return (Left tok)
else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
- msg <- peekCString c_msg
+ msg <- peekUtf8CString c_msg
gu_pool_free parsePl
gu_pool_free exprPl
throwIO (PGFError msg)
@@ -393,13 +396,13 @@ parseWithOracle lang cat sent (predict,complete,literal) =
return (Right exprs)
where
oracleWrapper oracle catPtr lblPtr offset = do
- cat <- peekCString catPtr
- lbl <- peekCString lblPtr
+ cat <- peekUtf8CString catPtr
+ lbl <- peekUtf8CString lblPtr
return (oracle cat lbl (fromIntegral offset))
oracleLiteralWrapper oracle catPtr lblPtr poffset out_pool = do
- cat <- peekCString catPtr
- lbl <- peekCString lblPtr
+ cat <- peekUtf8CString catPtr
+ lbl <- peekUtf8CString lblPtr
offset <- peek poffset
case oracle cat lbl (fromIntegral offset) of
Just (e,prob,offset) ->
@@ -425,7 +428,8 @@ parseWithOracle lang cat sent (predict,complete,literal) =
hasLinearization :: Concr -> Fun -> Bool
hasLinearization lang id = unsafePerformIO $
- withCString id (pgf_has_linearization (concr lang))
+ withGuPool $ \pl ->
+ newUtf8CString id pl >>= pgf_has_linearization (concr lang)
linearize :: Concr -> Expr -> String
linearize lang e = unsafePerformIO $
@@ -441,11 +445,11 @@ linearize lang e = unsafePerformIO $
else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
- msg <- peekCString c_msg
+ msg <- peekUtf8CString c_msg
throwIO (PGFError msg)
else throwIO (PGFError "The abstract tree cannot be linearized")
else do lin <- gu_string_buf_freeze sb pl
- peekCString lin
+ peekUtf8CString lin
linearizeAll :: Concr -> Expr -> [String]
linearizeAll lang e = unsafePerformIO $
@@ -473,7 +477,7 @@ linearizeAll lang e = unsafePerformIO $
then collect cts exn pl
else throwExn exn pl
else do lin <- gu_string_buf_freeze sb tmpPl
- s <- peekCString lin
+ s <- peekUtf8CString lin
ss <- unsafeInterleaveIO (collect cts exn pl)
return (s:ss)
@@ -481,7 +485,7 @@ linearizeAll lang e = unsafePerformIO $
is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
- msg <- peekCString c_msg
+ msg <- peekUtf8CString c_msg
gu_pool_free pl
throwIO (PGFError msg)
else do gu_pool_free pl
@@ -500,7 +504,7 @@ alignWords lang e = unsafePerformIO $
else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
- msg <- peekCString c_msg
+ msg <- peekUtf8CString c_msg
throwIO (PGFError msg)
else throwIO (PGFError "The abstract tree cannot be linearized")
else do len <- (#peek GuSeq, len) seq
@@ -510,7 +514,7 @@ alignWords lang e = unsafePerformIO $
peekAlignmentPhrase :: Ptr () -> IO (String, [Int])
peekAlignmentPhrase ptr = do
c_phrase <- (#peek PgfAlignmentPhrase, phrase) ptr
- phrase <- peekCString c_phrase
+ phrase <- peekUtf8CString c_phrase
n_fids <- (#peek PgfAlignmentPhrase, n_fids) ptr
(fids :: [CInt]) <- peekArray (fromIntegral (n_fids :: CInt)) (ptr `plusPtr` (#offset PgfAlignmentPhrase, fids))
return (phrase, map fromIntegral fids)
@@ -532,7 +536,7 @@ functions p =
getFunctions :: IORef [String] -> MapItorCallback
getFunctions ref itor key value exn = do
names <- readIORef ref
- name <- peekCString (castPtr key)
+ name <- peekUtf8CString (castPtr key)
writeIORef ref $! (name : names)
categories :: PGF -> [Cat]
diff --git a/src/runtime/haskell-bind/PGF2/FFI.hs b/src/runtime/haskell-bind/PGF2/FFI.hs
index 3ba5858bc..1e3abec64 100644
--- a/src/runtime/haskell-bind/PGF2/FFI.hs
+++ b/src/runtime/haskell-bind/PGF2/FFI.hs
@@ -2,12 +2,13 @@
module PGF2.FFI where
+import Foreign ( alloca, poke )
import Foreign.C
---import Foreign.C.String
import Foreign.Ptr
import Foreign.ForeignPtr
import Control.Exception
import GHC.Ptr
+import Data.Int(Int32)
data PGF = PGF {pgf :: Ptr PgfPGF, pgfMaster :: ForeignPtr GuPool}
data Concr = Concr {concr :: Ptr PgfConcr, concrMaster :: PGF}
@@ -72,10 +73,16 @@ foreign import ccall "gu/file.h gu_file_in"
foreign import ccall "gu/enum.h gu_enum_next"
gu_enum_next :: Ptr a -> Ptr (Ptr b) -> Ptr GuPool -> IO ()
-
+
foreign import ccall "gu/string.h gu_string_buf_freeze"
gu_string_buf_freeze :: Ptr GuStringBuf -> Ptr GuPool -> IO CString
+foreign import ccall unsafe "gu/utf8.h gu_utf8_decode"
+ gu_utf8_decode :: Ptr CString -> IO Int32
+
+foreign import ccall unsafe "gu/utf8.h gu_utf8_encode"
+ gu_utf8_encode :: Int32 -> Ptr CString -> IO ()
+
withGuPool :: (Ptr GuPool -> IO a) -> IO a
withGuPool f = bracket gu_new_pool gu_pool_free f
@@ -85,6 +92,34 @@ newOut pool =
out <- gu_string_buf_out sb
return (sb,out)
+peekUtf8CString :: CString -> IO String
+peekUtf8CString ptr =
+ alloca $ \pptr ->
+ poke pptr ptr >> decode pptr
+ where
+ decode pptr = do
+ x <- gu_utf8_decode pptr
+ if x == 0
+ then return []
+ else do cs <- decode pptr
+ return (((toEnum . fromEnum) x) : cs)
+
+newUtf8CString :: String -> Ptr GuPool -> IO CString
+newUtf8CString s pool = do
+ -- An UTF8 character takes up to 6 bytes. We allocate enough
+ -- memory for the worst case. This is wasteful but those
+ -- strings are usually allocated only temporary.
+ ptr <- gu_malloc pool (fromIntegral (length s * 6+1))
+ alloca $ \pptr ->
+ poke pptr ptr >> encode s pptr
+ return ptr
+ where
+ encode [] pptr = do
+ gu_utf8_encode 0 pptr
+ encode (c:cs) pptr = do
+ gu_utf8_encode ((toEnum . fromEnum) c) pptr
+ encode cs pptr
+
------------------------------------------------------------------
-- libpgf API