summaryrefslogtreecommitdiff
path: root/src/runtime
diff options
context:
space:
mode:
authorkrasimir <krasimir@chalmers.se>2017-01-26 10:31:43 +0000
committerkrasimir <krasimir@chalmers.se>2017-01-26 10:31:43 +0000
commit17163ae88163587f2c9615898a5789aaf3bee298 (patch)
treee0924b1b6f90b3696345925b4b01dc625a4c582b /src/runtime
parent6af632dd185b176222724cce47a49428f1301752 (diff)
copy the types for BracketedString from the Haskell runtime to the Haskell bindings
Diffstat (limited to 'src/runtime')
-rw-r--r--src/runtime/haskell-bind/PGF2.hsc40
1 files changed, 40 insertions, 0 deletions
diff --git a/src/runtime/haskell-bind/PGF2.hsc b/src/runtime/haskell-bind/PGF2.hsc
index 1f8d07c12..5d0484c1e 100644
--- a/src/runtime/haskell-bind/PGF2.hsc
+++ b/src/runtime/haskell-bind/PGF2.hsc
@@ -44,6 +44,8 @@ module PGF2 (-- * PGF
ConcName,Concr,languages,
-- ** Linearization
linearize,linearizeAll,
+ FId, LIndex, BracketedString(..), showBracketedString, flattenBracketedString,
+
alignWords,
-- ** Parsing
parse, parseWithHeuristics,
@@ -65,6 +67,7 @@ import Prelude hiding (fromEnum)
import Control.Exception(Exception,throwIO)
import Control.Monad(forM_)
import System.IO.Unsafe(unsafePerformIO,unsafeInterleaveIO)
+import Text.PrettyPrint
import PGF2.Expr
import PGF2.FFI
@@ -541,6 +544,43 @@ linearizeAll lang e = unsafePerformIO $
else do gu_pool_free pl
throwIO (PGFError "The abstract tree cannot be linearized")
+type FId = Int
+type LIndex = Int
+
+-- | BracketedString represents a sentence that is linearized
+-- as usual but we also want to retain the ''brackets'' that
+-- mark the beginning and the end of each constituent.
+data BracketedString
+ = Leaf String -- ^ this is the leaf i.e. a single token
+ | Bracket CId {-# UNPACK #-} !FId {-# UNPACK #-} !LIndex CId [BracketedString]
+ -- ^ this is a bracket. The 'CId' is the category of
+ -- the phrase. The 'FId' is an unique identifier for
+ -- every phrase in the sentence. For context-free grammars
+ -- i.e. without discontinuous constituents this identifier
+ -- is also unique for every bracket. When there are discontinuous
+ -- phrases then the identifiers are unique for every phrase but
+ -- not for every bracket since the bracket represents a constituent.
+ -- The different constituents could still be distinguished by using
+ -- the constituent index i.e. 'LIndex'. If the grammar is reduplicating
+ -- then the constituent indices will be the same for all brackets
+ -- that represents the same constituent.
+ -- The second 'CId' is the name of the abstract function that generated
+ -- this phrase.
+
+-- | Renders the bracketed string as a string where
+-- the brackets are shown as @(S ...)@ where
+-- @S@ is the category.
+showBracketedString :: BracketedString -> String
+showBracketedString = render . ppBracketedString
+
+ppBracketedString (Leaf t) = text t
+ppBracketedString (Bracket cat fid index _ bss) = parens (ppCId cat <> colon <> int fid <+> hsep (map ppBracketedString bss))
+
+-- | Extracts the sequence of tokens from the bracketed string
+flattenBracketedString :: BracketedString -> [String]
+flattenBracketedString (Leaf w) = [w]
+flattenBracketedString (Bracket _ _ _ _ bss) = concatMap flattenBracketedString bss
+
alignWords :: Concr -> Expr -> [(String, [Int])]
alignWords lang e = unsafePerformIO $
withGuPool $ \pl ->