1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
|
----------------------------------------------------------------------
-- |
-- Module : VisualizeTree
-- Maintainer : KA
-- Stability : (stable)
-- Portability : (portable)
--
-- Print a graph of an abstract syntax tree in Graphviz DOT format
-- Based on BB's VisualizeGrammar
-----------------------------------------------------------------------------
module PGF.VisualizeTree
( GraphvizOptions(..)
, graphvizDefaults
, graphvizAbstractTree
, graphvizParseTree
, graphvizParseTreeDep
, graphvizDependencyTree
, Labels, getDepLabels
, CncLabels, getCncDepLabels
, graphvizBracketedString
, graphvizAlignment
, gizaAlignment
, conlls2latexDoc
) where
import Prelude hiding ((<>)) -- GHC 8.4.1 clash with Text.PrettyPrint
import PGF.CId (wildCId,showCId,ppCId,mkCId) --CId,pCId,
import PGF.Data
import PGF.Expr (Tree) -- showExpr
import PGF.Linearize
----import PGF.LatexVisualize (conll2latex) ---- should be separate module?
import PGF.Macros (lookValCat, BracketedString(..))
--lookMap, BracketedTokn(..), flattenBracketedString
import qualified Data.Map as Map
--import qualified Data.IntMap as IntMap
import Data.List (intersperse,nub,mapAccumL,find,groupBy,sortBy,partition)
import Data.Ord (comparing)
import Data.Char (isDigit)
import Data.Maybe (fromMaybe)
import Text.PrettyPrint
--import Data.Array.IArray
--import Control.Monad
--import qualified Data.Set as Set
--import qualified Text.ParserCombinators.ReadP as RP
data GraphvizOptions = GraphvizOptions {noLeaves :: Bool,
noFun :: Bool,
noCat :: Bool,
noDep :: Bool,
nodeFont :: String,
leafFont :: String,
nodeColor :: String,
leafColor :: String,
nodeEdgeStyle :: String,
leafEdgeStyle :: String
}
graphvizDefaults = GraphvizOptions False False False True "" "" "" "" "" ""
-- | Renders abstract syntax tree in Graphviz format.
-- The pair of 'Bool' @(funs,cats)@ lets you control whether function names and
-- category names are included in the rendered tree.
graphvizAbstractTree :: PGF -> (Bool,Bool) -> Tree -> String
graphvizAbstractTree pgf (funs,cats) = render . tree2graph
where
tree2graph t =
text "graph {" $$
ppGraph [] [] 0 t $$
text "}"
getAbs xs (EAbs _ x e) = getAbs (x:xs) e
getAbs xs (ETyped e _) = getAbs xs e
getAbs xs e = (xs,e)
getApp (EApp x (EImplArg y)) es = getApp x es
getApp (EApp x y) es = getApp x (y:es)
getApp (ETyped e _) es = getApp e es
getApp e es = (e,es)
getLbl scope (EFun f) = let fun = if funs then ppCId f else empty
cat = if cats then ppCId (lookValCat (abstract pgf) f) else empty
sep = if funs && cats then colon else empty
in fun <+> sep <+> cat
getLbl scope (ELit l) = text (escapeStr (render (ppLit l)))
getLbl scope (EMeta i) = ppMeta i
getLbl scope (EVar i) = ppCId (scope !! i)
getLbl scope (ETyped e _) = getLbl scope e
getLbl scope (EImplArg e) = getLbl scope e
ppGraph scope ps i e0 =
let (xs, e1) = getAbs [] e0
(e2,args) = getApp e1 []
binds = if null xs
then empty
else text "\\\\" <> hcat (punctuate comma (map ppCId xs)) <+> text "->"
(lbl,eargs) = case e2 of
EAbs _ _ _ -> (char '@', e2:args) -- eta-redexes are rendered with artificial "@" node
_ -> (getLbl scope' e2, args)
scope' = xs ++ scope
in ppNode (i:ps) <> text "[label =" <+> doubleQuotes (binds <+> lbl) <> text ", style = \"solid\", shape = \"plaintext\"] ;" $$
(if null ps
then empty
else ppNode ps <+> text "--" <+> ppNode (i:ps) <+> text "[style = \"solid\"];") $$
vcat (zipWith (ppGraph scope' (i:ps)) [0..] eargs)
ppNode ps = char 'n' <> hcat (punctuate (char '_') (map int ps))
escapeStr [] = []
escapeStr ('\\':cs) = '\\':'\\':escapeStr cs
escapeStr ('"' :cs) = '\\':'"' :escapeStr cs
escapeStr (c :cs) = c :escapeStr cs
type Labels = Map.Map CId [String]
-- | Visualize word dependency tree.
graphvizDependencyTree
:: String -- ^ Output format: @"latex"@, @"conll"@, @"malt_tab"@, @"malt_input"@ or @"dot"@
-> Bool -- ^ Include extra information (debug)
-> Maybe Labels -- ^ abstract label information obtained with 'getDepLabels'
-> Maybe CncLabels -- ^ concrete label information obtained with ' ' (was: unused (was: @Maybe String@))
-> PGF
-> CId -- ^ The language of analysis
-> Tree
-> String -- ^ Rendered output in the specified format
graphvizDependencyTree format debug mlab mclab pgf lang t =
case format of
"latex" -> render . ppLaTeX $ conll2latex' conll
"svg" -> render . ppSVG . toSVG $ conll2latex' conll
"conll" -> printCoNLL conll
"conllu" -> printCoNLL ([["# text = " ++ linearize pgf lang t], ["# tree = " ++ showExpr [] t]] ++ conll)
"malt_tab" -> render $ vcat (map (hcat . intersperse (char '\t') . (\ws -> [ws !! 0,ws !! 1,ws !! 3,ws !! 6,ws !! 7])) wnodes)
"malt_input" -> render $ vcat (map (hcat . intersperse (char '\t') . take 6) wnodes)
_ -> render $ text "digraph {" $$
space $$
nest 2 (text "rankdir=LR ;" $$
text "node [shape = plaintext] ;" $$
vcat nodes $$
vcat links) $$
text "}"
where
conll = fixCoNLL (maybe [] id mclab) conll0
conll0 = (map.map) render wnodes
nodes = map mkNode leaves
links = map mkLink [(fid, fromMaybe (dep_lbl,nil) (lookup fid deps)) | ((cat,fid,fun,_),_,w) <- tail leaves]
-- CoNLL format: ID FORM LEMMA PLEMMA POS PPOS FEAT PFEAT HEAD PHEAD DEPREL PDEPREL
-- P variants are automatically predicted rather than gold standard
wnodes = [[int i, maltws ws, ppCId fun, ppCId (posCat cat), ppCId cat, int lind, int parent, text lab, unspec, unspec] |
((cat,fid,fun,lind),i,ws) <- tail leaves,
let (lab,parent) = fromMaybe (dep_lbl,0)
(do (lbl,fid) <- lookup fid deps
(_,i,_) <- find (\((_,fid1,_,_),i,_) -> fid == fid1) leaves
return (lbl,i))
]
maltws = text . concat . intersperse "+" . words -- no spaces in column 2
nil = -1
bss = bracketedLinearize pgf lang t
root = (wildCId,nil,wildCId,0)
leaves = (root,0,root_lbl) : (groupAndIndexIt 1 . concatMap (getLeaves root)) bss
deps = let (_,(h,deps)) = getDeps 0 [] t []
in (h,(dep_lbl,nil)):deps
groupAndIndexIt id [] = []
groupAndIndexIt id ((p,w):pws) = (p,id,w) : groupAndIndexIt (id+1) pws
--- groupAndIndexIt id ((p,w):pws) = let (ws,pws1) = collect pws
--- in (p,id,unwords (w:ws)) : groupAndIndexIt (id+1) pws1
where
collect pws@((p1,w):pws1)
| p == p1 = let (ws,pws2) = collect pws1
in (w:ws,pws2)
collect pws = ([],pws)
getLeaves parent bs =
case bs of
Leaf w -> [(parent,w)]
Bracket cat fid _ lind fun _ bss -> concatMap (getLeaves (cat,fid,fun,lind)) bss
mkNode ((_,p,_,_),i,w) =
tag p <+> brackets (text "label = " <> doubleQuotes (int i <> char '.' <+> text w)) <+> semi
mkLink (x,(lbl,y)) = tag y <+> text "->" <+> tag x <+> text "[label = " <> doubleQuotes (text lbl) <> text "] ;"
labels = maybe Map.empty id mlab
clabels = maybe [] id mclab
posCat cat = case Map.lookup cat labels of
Just [p] -> mkCId p
_ -> cat
getDeps n_fid xs (EAbs _ x e) es = getDeps n_fid (x:xs) e es
getDeps n_fid xs (EApp e1 e2) es = getDeps n_fid xs e1 (e2:es)
getDeps n_fid xs (EImplArg e) es = getDeps n_fid xs e es
getDeps n_fid xs (ETyped e _) es = getDeps n_fid xs e es
getDeps n_fid xs (EFun f) es = let (n_fid_1,ds) = descend n_fid xs es
(mb_h, deps) = selectHead f ds
in case mb_h of
Just (fid,deps0) -> (n_fid_1+1,(fid,deps0++
[(n_fid_1,(dep_lbl,fid))]++
concat [(m,(lbl,fid)):ds | (lbl,(m,ds)) <- deps]))
Nothing -> (n_fid_1+1,(n_fid_1,concat [(m,(lbl,n_fid_1)):ds | (lbl,(m,ds)) <- deps]))
getDeps n_fid xs (EMeta i) es = (n_fid+2,(n_fid,[]))
getDeps n_fid xs (EVar i) _ = (n_fid+2,(n_fid,[]))
getDeps n_fid xs (ELit l) [] = (n_fid+1,(n_fid,[]))
descend n_fid xs es = mapAccumL (\n_fid e -> getDeps n_fid xs e []) n_fid es
selectHead f ds =
case Map.lookup f labels of
Just lbls -> extractHead (zip lbls ds)
Nothing -> extractLast ds
where
extractHead [] = (Nothing, [])
extractHead (ld@(l,d):lds)
| l == head_lbl = (Just d,lds)
| otherwise = let (mb_h,deps) = extractHead lds
in (mb_h,ld:deps)
extractLast [] = (Nothing, [])
extractLast (d:ds)
| null ds = (Just d,[])
| otherwise = let (mb_h,deps) = extractLast ds
in (mb_h,(dep_lbl,d):deps)
dep_lbl = "dep"
head_lbl = "head"
root_lbl = "root"
unspec = text "_"
-- auxiliaries for UD conversion PK 15/12/2018
rmcomments :: String -> String
rmcomments s = case s of
'-':'-':_ -> []
'#':'f':'u':'n':rest -> rmcomments rest -- the new gf-ud format
'#':'c':'a':'t':rest -> rmcomments rest
x:xs -> x : rmcomments xs
_ -> []
-- | Prepare lines obtained from a configuration file for labels for
-- use with 'graphvizDependencyTree'. Format per line /fun/ /label/@*@.
--- ignore other gf-ud annotatations than #fun and #cat at this point
getDepLabels :: String -> Labels
getDepLabels s = Map.fromList [(mkCId f,ls) | f:ls <- map (words . rmcomments) (lines s), not (head f == '#')]
-- the old function, without dependencies
graphvizParseTree :: PGF -> Language -> GraphvizOptions -> Tree -> String
graphvizParseTree = graphvizParseTreeDep Nothing
graphvizParseTreeDep :: Maybe Labels -> PGF -> Language -> GraphvizOptions -> Tree -> String
graphvizParseTreeDep mbl pgf lang opts tree = graphvizBracketedString opts mbl tree $ bracketedLinearize pgf lang tree
graphvizBracketedString :: GraphvizOptions -> Maybe Labels -> Tree -> [BracketedString] -> String
graphvizBracketedString opts mbl tree bss = render graphviz_code
where
graphviz_code
= text "graph {" $$
text node_style $$
vcat internal_nodes $$
(if noLeaves opts then empty
else text leaf_style $$
leaf_nodes
) $$ text "}"
leaf_style = mkOption "edge" "style" (leafEdgeStyle opts) ++
mkOption "edge" "color" (leafColor opts) ++
mkOption "node" "fontcolor" (leafColor opts) ++
mkOption "node" "fontname" (leafFont opts) ++
mkOption "node" "shape" "plaintext"
node_style = mkOption "edge" "style" (nodeEdgeStyle opts) ++
mkOption "edge" "color" (nodeColor opts) ++
mkOption "node" "fontcolor" (nodeColor opts) ++
mkOption "node" "fontname" (nodeFont opts) ++
mkOption "node" "shape" nodeshape
where nodeshape | noFun opts && noCat opts = "point"
| otherwise = "plaintext"
mkOption object optname optvalue
| null optvalue = ""
| otherwise = object ++ "[" ++ optname ++ "=\"" ++ optvalue ++ "\"]; "
mkNode fun cat
| noFun opts = showCId cat
| noCat opts = showCId fun
| otherwise = showCId fun ++ " : " ++ showCId cat
nil = -1
internal_nodes = [mkLevel internals |
internals <- getInternals (map ((,) nil) bss),
not (null internals)]
leaf_nodes = mkLevel [(parent, id, mkLeafNode cat word) |
(id, (parent, (cat,word))) <- zip [100000..] (concatMap (getLeaves (mkCId "?") nil) bss)]
getInternals [] = []
getInternals nodes
= nub [(parent, fid, mkNode fun cat) |
(parent, Bracket cat fid _ _ fun _ _) <- nodes]
: getInternals [(fid, child) |
(_, Bracket _ fid _ _ _ _ children) <- nodes,
child <- children]
getLeaves cat parent (Leaf word) = [(parent, (cat, word))] -- the lowest cat before the word
getLeaves _ parent (Bracket cat fid _ i _ _ children)
= concatMap (getLeaves cat fid) children
mkLevel nodes
= text "subgraph {rank=same;" $$
nest 2 (-- the following gives the name of the node and its label:
vcat [tag id <> text (mkOption "" "label" lbl) | (_, id, lbl) <- nodes] $$
-- the following is for fixing the order between the children:
(if length nodes > 1 then
text (mkOption "edge" "style" "invis") $$
hsep (intersperse (text " -- ") [tag id | (_, id, _) <- nodes]) <+> semi
else empty)
) $$
text "}" $$
-- the following is for the edges between parent and children:
vcat [tag pid <> text " -- " <> tag id <> text (depLabel node) | node@(pid, id, _) <- nodes, pid /= nil] $$
space
depLabel node@(parent,id,lbl)
| noDep opts = ";"
| otherwise = case getArg id of
Just (fun,arg) -> mkOption "" "label" (lookLabel fun arg)
_ -> ";"
getArg i = getArgumentPlace i (expr2numtree tree) Nothing
labels = maybe Map.empty id mbl
lookLabel fun arg = case Map.lookup fun labels of
Just xx | length xx > arg -> case xx !! arg of
"head" -> ""
l -> l
_ -> argLabel fun arg
argLabel fun arg = if arg==0 then "" else "dep#" ++ show arg --showCId fun ++ "#" ++ show arg
-- assuming the arg is head, if no configuration is given; always true for 1-arg funs
mkLeafNode cat word
| noDep opts = word --- || not (noCat opts) -- show POS only if intermediate nodes hidden
| otherwise = posCat cat ++ "\n" ++ word -- show POS in dependency tree
posCat cat = case Map.lookup cat labels of
Just [p] -> p
_ -> showCId cat
---- to restore the argument place from bracketed linearization
data NumTree = NumTree Int CId [NumTree]
getArgumentPlace :: Int -> NumTree -> Maybe (CId,Int) -> Maybe (CId,Int)
getArgumentPlace i tree@(NumTree int fun ts) mfi
| i == int = mfi
| otherwise = case [fj | (t,x) <- zip ts [0..], Just fj <- [getArgumentPlace i t (Just (fun,x))]] of
fj:_ -> Just fj
_ -> Nothing
expr2numtree :: Expr -> NumTree
expr2numtree = fst . renumber 0 . flatten where
flatten e = case e of
EApp f a -> case flatten f of
NumTree _ g ts -> NumTree 0 g (ts ++ [flatten a])
EFun f -> NumTree 0 f []
renumber i t@(NumTree _ f ts) = case renumbers i ts of
(ts',j) -> (NumTree j f ts', j+1)
renumbers i ts = case ts of
t:tt -> case renumber i t of
(t',j) -> case renumbers j tt of (tt',k) -> (t':tt',k)
_ -> ([],i)
----- end this terrible stuff AR 4/11/2015
type Rel = (Int,[Int])
-- possibly needs changes after clearing about many-to-many on this level
type IndexedSeq = (Int,[String])
type LangSeq = [IndexedSeq]
data PreAlign = PreAlign [LangSeq] [[Rel]]
deriving Show
-- alignment structure for a phrase in 2 languages, along with the
-- many-to-many relations
genPreAlignment :: PGF -> [Language] -> Expr -> PreAlign
genPreAlignment pgf langs = lin2align . linsBracketed
where
linsBracketed t = [bracketedLinearize pgf lang t | lang <- langs]
lin2align :: [[BracketedString]] -> PreAlign
lin2align bsss = PreAlign langSeqs langRels
where
(langSeqs,langRels) = mkLayers leaves
nil = -1
leaves = map (groupAndIndexIt 0 . concatMap (getLeaves nil)) bsss
groupAndIndexIt id [] = []
groupAndIndexIt id ((p,w):pws) = let (ws,pws1) = collect pws
in (p,id,unwords (w:ws)) : groupAndIndexIt (id+1) pws1
where
collect pws@((p1,w):pws1)
| p == p1 = let (ws,pws2) = collect pws1
in (w:ws,pws2)
collect pws = ([],pws)
getLeaves parent bs =
case bs of
Leaf w -> [(parent,w)]
Bracket _ fid _ _ _ _ bss -> concatMap (getLeaves fid) bss
mkLayers (cs:css:rest) = let (lrest, rrest) = mkLayers (css:rest)
in ((fields cs) : lrest, (map (mkLinks css) cs) : rrest)
mkLayers [cs] = ([fields cs], [])
mkLayers _ = ([],[])
mkLinks cs (p0,id0,_) = (id0,indices)
where
indices = [id1 | (p1,id1,_) <- cs, p1 == p0]
fields cs = [(id, [w]) | (_,id,w) <- cs]
-- we assume we have 2 languages - source and target
gizaAlignment :: PGF -> (Language,Language) -> Expr -> (String,String,String)
gizaAlignment pgf (l1,l2) e = let PreAlign [rl1,rl2] rels = genPreAlignment pgf [l1,l2] e
in
(unwords (map showIndSeq rl1), unwords (concat $ map snd rl2),
unwords $ words $ showRels rl2 (concat rels))
showIndSeq (_,l) = let ww = map words l
w_ = map (intersperse "_") ww
in
concat $ concat w_
showRels inds2 [] = []
showRels inds2 ((ind,is):rest) =
let lOffs = computeOffset inds2 0
ltemp = [(i,getOffsetIndex i lOffs) | i <- is]
lcurr = concat $ map (\(offset,ncomp) -> [show ind ++ "-" ++ show (-1 + offset + ii) ++ " "| ii <- [1..ncomp]]) (map snd ltemp)
lrest = showRels inds2 rest
in
(unwords lcurr) ++ lrest
getOffsetIndex i lst = let ll = filter (\(x,_) -> x == i) lst
in
snd $ head ll
computeOffset [] transp = []
computeOffset ((i,l):rest) transp = let nw = (length $ words $ concat l)
in (i,(transp,nw)) : (computeOffset rest (transp + nw))
-- alignment in the Graphviz format from the intermediate structure
-- same effect as the old direct function
graphvizAlignment :: PGF -> [Language] -> Expr -> String
graphvizAlignment pgf langs exp =
render (text "digraph {" $$
space $$
nest 2 (text "rankdir=LR ;" $$
text "node [shape = record] ;" $$
space $$
renderList 0 lrels rrels) $$
text "}")
where
(PreAlign lrels rrels) = genPreAlignment pgf langs exp
renderList ii (l:ls) (r:rs) = struct ii <> text "[label = \"" <> fields l <> text "\"] ;" $$
(case ls of
[] -> empty
_ -> vcat [struct ii <> colon <> tag id0
<> colon <> char 'e' <+> text "->" <+> struct (ii+1)
<> colon <> tag id1 <> colon <> char 'w' <+> semi
| (id0,ids) <- r, id1 <- ids] $$ renderList (ii + 1) ls rs)
renderList ii [] _ = empty
renderList ii [l] [] = struct ii <> text "[label = \"" <> fields l <> text "\"] ;"
fields cs = hsep (intersperse (char '|') [tbrackets (tag id) <> text (' ':w) | (id,ws) <- cs, w <- ws])
-- auxiliaries for graphviz syntax
struct l = text ("struct" ++ show l)
tbrackets d = char '<' <> d <> char '>'
tag i
| i < 0 = char 'r' <> int (negate i)
| otherwise = char 'n' <> int i
---------------------- should be a separate module?
-- visualization with latex output. AR Nov 2015
conlls2latexDoc :: [String] -> String
conlls2latexDoc =
render .
latexDoc .
vcat .
intersperse (text "" $+$ app "vspace" (text "4mm")) .
map conll2latex .
filter (not . null)
conll2latex :: String -> Doc
conll2latex = ppLaTeX . conll2latex' . parseCoNLL
conll2latex' :: CoNLL -> [LaTeX]
conll2latex' = dep2latex . conll2dep'
data Dep = Dep {
wordLength :: Int -> Double -- length of word at position int -- was: fixed width, millimetres (>= 20.0)
, tokens :: [(String,(String,String))] -- word, (pos,features) (0..)
, deps :: [((Int,Int),String)] -- from, to, label
, root :: Int -- root word position
}
-- some general measures
defaultWordLength = 20.0 -- the default fixed width word length, making word 100 units
defaultUnit = 0.2 -- unit in latex pictures, 0.2 millimetres
spaceLength = 10.0
charWidth = 1.8
wsize rwld w = 100 * rwld w + spaceLength -- word length, units
wpos rwld i = sum [wsize rwld j | j <- [0..i-1]] -- start position of the i'th word
wdist rwld x y = sum [wsize rwld i | i <- [min x y .. max x y - 1]] -- distance between words x and y
labelheight h = h + arcbase + 3 -- label just above arc; 25 would put it just below
labelstart c = c - 15.0 -- label starts 15u left of arc centre
arcbase = 30.0 -- arcs start and end 40u above the bottom
arcfactor r = r * 600 -- reduction of arc size from word distance
xyratio = 3 -- width/height ratio of arcs
putArc :: (Int -> Double) -> Int -> Int -> Int -> String -> [DrawingCommand]
putArc frwld height x y label = [oval,arrowhead,labelling] where
oval = Put (ctr,arcbase) (OvalTop (wdth,hght))
arrowhead = Put (endp,arcbase + 5) (ArrowDown 5) -- downgoing arrow 5u above the arc base
labelling = Put (labelstart ctr,labelheight (hght/2)) (TinyText label)
dxy = wdist frwld x y -- distance between words, >>= 20.0
ndxy = 100 * rwld * fromIntegral height -- distance that is indep of word length
hdxy = dxy / 2 -- half the distance
wdth = dxy - (arcfactor rwld)/dxy -- longer arcs are wider in proportion
hght = ndxy / (xyratio * rwld) -- arc height is independent of word length
begp = min x y -- begin position of oval
ctr = wpos frwld begp + hdxy + (if x < y then 20 else 10) -- LR arcs are farther right from center of oval
endp = (if x < y then (+) else (-)) ctr (wdth/2) -- the point of the arrow
rwld = 0.5 ----
dep2latex :: Dep -> [LaTeX]
dep2latex d =
[Comment (unwords (map fst (tokens d))),
Picture defaultUnit (width,height) (
[Put (wpos rwld i,0) (Text w) | (i,w) <- zip [0..] (map fst (tokens d))] -- words
++ [Put (wpos rwld i,15) (TinyText w) | (i,(w,_)) <- zip [0..] (map snd (tokens d))] -- pos tags 15u above bottom
--- ++ [Put (wpos rwld i,-15) (TinyText w) | (i,(_,w)) <- zip [0..] (map snd (tokens d))] -- features 15u below bottom -> DON'T SHOW
++ concat [putArc rwld (aheight x y) x y label | ((x,y),label) <- deps d] -- arcs and labels
++ [Put (wpos rwld (root d) + 15,height) (ArrowDown (height-arcbase))]
++ [Put (wpos rwld (root d) + 20,height - 10) (TinyText "root")]
)]
where
wld i = wordLength d i -- >= 20.0
rwld i = (wld i) / defaultWordLength -- >= 1.0
aheight x y = depth (min x y) (max x y) + 1 ---- abs (x-y)
arcs = [(min u v, max u v) | ((u,v),_) <- deps d]
depth x y = case [(u,v) | (u,v) <- arcs, (x < u && v <= y) || (x == u && v < y)] of ---- only projective arcs counted
[] -> 0
uvs -> 1 + maximum (0:[depth u v | (u,v) <- uvs])
width = {-round-} (sum [wsize rwld w | (w,_) <- zip [0..] (tokens d)]) + {-round-} spaceLength * fromIntegral ((length (tokens d)) - 1)
height = 50 + 20 * {-round-} (maximum (0:[aheight x y | ((x,y),_) <- deps d]))
type CoNLL = [[String]]
parseCoNLL :: String -> CoNLL
parseCoNLL = map words . lines
--conll2dep :: String -> Dep
--conll2dep = conll2dep' . parseCoNLL
conll2dep' :: CoNLL -> Dep
conll2dep' ls = Dep {
wordLength = wld
, tokens = toks
, deps = dps
, root = head $ [read x-1 | x:_:_:_:_:_:"0":_ <- ls] ++ [1]
}
where
wld i = maximum (0:[charWidth * fromIntegral (length w) | w <- let (tok,(pos,feat)) = toks !! i in [tok,pos {-,feat-}]]) --- feat not shown
toks = [(w,(c,m)) | _:w:_:c:_:m:_ <- ls]
dps = [((read y-1, read x-1),lab) | x:_:_:_:_:_:y:lab:_ <- ls, y /="0"]
--maxdist = maximum [abs (x-y) | ((x,y),_) <- dps]
-- * LaTeX Pictures (see https://en.wikibooks.org/wiki/LaTeX/Picture)
-- We render both LaTeX and SVG from this intermediate representation of
-- LaTeX pictures.
data LaTeX = Comment String | Picture UnitLengthMM Size [DrawingCommand]
data DrawingCommand = Put Position Object
data Object = Text String | TinyText String | OvalTop Size | ArrowDown Length
type UnitLengthMM = Double
type Size = (Double,Double)
type Position = (Double,Double)
type Length = Double
-- * latex formatting
ppLaTeX = vcat . map ppLaTeX1
where
ppLaTeX1 el =
case el of
Comment s -> comment s
Picture unit size cmds ->
app "setlength{\\unitlength}" (text (show unit ++ "mm"))
$$ hang (app "begin" (text "picture")<>text (show size)) 2
(vcat (map ppDrawingCommand cmds))
$$ app "end" (text "picture")
$$ text ""
ppDrawingCommand (Put pos obj) = put pos (ppObject obj)
ppObject obj =
case obj of
Text s -> text s
TinyText s -> small (text s)
OvalTop size -> text "\\oval" <> text (show size) <> text "[t]"
ArrowDown len -> app "vector(0,-1)" (text (show len))
put p@(_,_) = app ("put" ++ show p)
small w = text "{\\tiny" <+> w <> text "}"
comment s = text "%%" <+> text s -- line break show follow
app macro arg = text "\\" <> text macro <> text "{" <> arg <> text "}"
latexDoc :: Doc -> Doc
latexDoc body =
vcat [text "\\documentclass{article}",
text "\\usepackage[utf8]{inputenc}",
text "\\begin{document}",
body,
text "\\end{document}"]
-- * SVG (see https://www.w3.org/Graphics/SVG/IG/resources/svgprimer.html)
-- | Render LaTeX pictures as SVG
toSVG = concatMap toSVG1
where
toSVG1 el =
case el of
Comment s -> []
Picture unit size@(w,h) cmds ->
[Elem "svg" ["width".=x1,"height".=y0+5,
("viewBox",unwords (map show [0,0,x1,y0+5])),
("version","1.1"),
("xmlns","http://www.w3.org/2000/svg")]
(white_bg:concatMap draw cmds)]
where
white_bg =
Elem "rect" ["x".=0,"y".=0,"width".=x1,"height".=y0+5,
("fill","white")] []
draw (Put pos obj) = objectSVG pos obj
objectSVG pos obj =
case obj of
Text s -> [text 16 pos s]
TinyText s -> [text 10 pos s]
OvalTop size -> [ovalTop pos size]
ArrowDown len -> arrowDown pos len
text h (x,y) s =
Elem "text" ["x".=xc x,"y".=yc y-2,"font-size".=h]
[CharData s]
ovalTop (x,y) (w,h) =
Elem "path" [("d",path),("stroke","black"),("fill","none")] []
where
x1 = x-w/2
x2 = min x (x1+r)
x3 = max x (x4-r)
x4 = x+w/2
y1 = y
y2 = y+r
r = h/2
sx = show . xc
sy = show . yc
path = unwords (["M",sx x1,sy y1,"Q",sx x1,sy y2,sx x2,sy y2,
"L",sx x3,sy y2,"Q",sx x4,sy y2,sx x4,sy y1])
arrowDown (x,y) len =
[Elem "line" ["x1".=xc x,"y1".=yc y,"x2".=xc x,"y2".=y2,
("stroke","black")] [],
Elem "path" [("d",unwords arrowhead)] []]
where
x2 = xc x
y2 = yc (y-len)
arrowhead = "M":map show [x2,y2,x2-3,y2-6,x2+3,y2-6]
xc x = num x+5
yc y = y0-num y
x1 = num w+10
y0 = num h+20
num x = round (scale*x)
scale = unit*5
infix 0 .=
n.=v = (n,show v)
-- * SVG is XML
data SVG = CharData String | Elem TagName Attrs [SVG]
type TagName = String
type Attrs = [(String,String)]
ppSVG svg =
vcat [text "<?xml version=\"1.0\" standalone=\"no\"?>",
text "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"",
text "\"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">",
text "",
vcat (map ppSVG1 svg)] -- It should be a single <svg> element...
where
ppSVG1 svg1 =
case svg1 of
CharData s -> text (encode s)
Elem tag attrs [] ->
text "<"<>text tag<>cat (map attr attrs) <> text "/>"
Elem tag attrs svg ->
cat [text "<"<>text tag<>cat (map attr attrs) <> text ">",
nest 2 (cat (map ppSVG1 svg)),
text "</"<>text tag<>text ">"]
attr (n,v) = text " "<>text n<>text "=\""<>text (encode v)<>text "\""
encode s = foldr encodeEntity "" s
encodeEntity = encodeEntity' (const False)
encodeEntity' esc c r =
case c of
'&' -> "&"++r
'<' -> "<"++r
'>' -> ">"++r
_ -> c:r
----------------------------------
-- concrete syntax annotations (local) on top of conll
-- examples of annotations:
-- UseComp {"not"} PART neg head
-- UseComp {*} AUX cop head
type CncLabels = [CncLabel]
data CncLabel =
CncSyncat (String, String -> Maybe (String -> String,String,String))
-- (fun, word/lemma -> (pos,label,target))
-- the pos can remain unchanged, as in the current notation in the article
| CncMorpho (String,[String])
-- (category, features in ascending order)
| CncForm (String,(String,String))
-- (wordform, (lemma,features))
fixCoNLL :: CncLabels -> CoNLL -> CoNLL
fixCoNLL cncLabels conll = map (fixMorpho . fixDep) (markRoot conll) where
labels = [l | CncSyncat l <- cncLabels]
flabels = [r | CncMorpho r <- cncLabels]
-- change the root label from dep to root
--- doing this for the leftmost word of the root node
markRoot rows = case rows of
(i:word:fun:pos:cat:x_:"0":lab_:xs):rs -> (i:word:fun:pos:cat:x_:"0":"root":xs) : map (markNoRoot i) rs
r:rs -> r : markRoot rs
_ -> rows --- what about if there is no root?
markNoRoot r row@(i:word:fun:pos:cat:x_:j:label:xs) = case j of
"0" -> (i:word:fun:pos:cat:x_: r :label:xs)
_ -> row
fixDep row = case row of
(i:word:fun:pos:cat:x_:j:label:xs) | label /= "root" -> case look (fun,word) of
Just (pos',label',"head") -> (i:word:fun:pos' pos:cat: x_: j :label':xs)
Just (pos',label',target) -> (i:word:fun:pos' pos:cat: x_: getDep j target:label':xs)
_ -> row
_ -> row
fixMorpho (i:word:fun:pos:cat: mo :j:label:xs) = (i:word:fun:pos:cat: (feat cat word mo) :j:label:xs)
look (fun,word) = case lookup fun labels of
Just relabel -> case relabel word of
Just row -> Just row
_ -> case lookup "*" labels of
Just starlabel -> starlabel word
_ -> Nothing
_ -> case lookup "*" labels of
Just starlabel -> starlabel word
_ -> Nothing
getDep j label = maybe j id $ lookup (label,j) [((label,j),i) | i:word:fun:pos:cat:x_:j:label:xs <- conll]
feat cat word x = case lookup cat flabels of
Just tags | all isDigit x && length tags > read x -> tags !! read x
_ -> case lookup (show word) flabels of
Just (t:_) -> t
_ -> cat ++ "-" ++ x
getCncDepLabels :: String -> CncLabels
getCncDepLabels s = wlabels ws ++ flabels fs
where
wlabels =
map CncSyncat .
map merge .
groupBy (\ (x,_) (a,_) -> x == a) .
sortBy (comparing fst) .
concatMap analyse .
filter chooseW
flabels =
map CncMorpho .
map collectTags .
map words
(fs,ws) = partition chooseF $ map uncomment $ lines s
--- choose is for compatibility with the general notation
chooseW line = notElem '(' line &&
elem '{' line
--- ignoring non-local (with "(") and abstract (without "{") rules
---- TODO: this means that "(" cannot be a token
chooseF line = take 1 line == "@" --- feature assignments have the form e.g. @N SgNom SgGen ; no spaces inside tags
uncomment line = case line of
'-':'-':_ -> ""
c:cs -> c : uncomment cs
_ -> line
analyse line = case break (=='{') line of
(beg,_:ws) -> case break (=='}') ws of
(toks,_:target) -> case (getToks beg, words target) of
(funs,[ label,j]) -> [(fun, (tok, (id, label,j))) | fun <- funs, tok <- getToks toks]
(funs,[pos,label,j]) -> [(fun, (tok, (const pos,label,j))) | fun <- funs, tok <- getToks toks]
_ -> []
_ -> []
_ -> []
merge rules@((fun,_):_) = (fun, \tok ->
case lookup tok (map snd rules) of
Just new -> return new
_ -> lookup "*" (map snd rules)
)
getToks = map unquote . filter (/=",") . toks
toks s = case lex s of [(t,"")] -> [t] ; [(t,cc)] -> t:toks cc ; _ -> []
unquote s = case s of '"':cc@(_:_) | last cc == '"' -> init cc ; _ -> s
collectTags (w:ws) = (tail w,ws)
-- added init to remove the last \n. otherwise, two empty lines are in between each sentence PK 17/12/2018
printCoNLL :: CoNLL -> String
printCoNLL = init . unlines . map (concat . intersperse "\t")
|