@@ -146,6 +146,54 @@ def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
146
146
return hash .hexdigest ()
147
147
148
148
149
class NamedRefLink:
    """A link to a named destination, held until it can be resolved."""

    def __init__(self, ref: TextStringObject, source_pdf: Union[PdfReader, None]):
        """
        ref: TextStringObject with the named reference
        source_pdf: document the link was read from, or None
        """
        self._source_pdf = source_pdf
        self._ref = ref

    def find_referenced_page(self) -> Union[IndirectObject, None]:
        """
        Look the name up in the source document's named destinations.

        Returns the ``page`` attribute of the matching destination, or None
        when there is no source document or no (truthy) destination stored
        under that name.
        """
        source = self._source_pdf
        if source:
            destination = source.named_destinations.get(str(self._ref))
            if destination:
                return destination.page
        return None

    def patch_reference(self, target_pdf, new_page: IndirectObject) -> None:
        """
        Register this link's name in target_pdf so that it points at new_page.

        target_pdf: PdfWriter which the new link went into
        new_page: reference of the page the name should lead to

        Nothing happens when the name is already among the target's named
        destinations, or when new_page is not one of the target's pages.
        """
        name = str(self._ref)
        if name in target_pdf.named_destinations:
            return

        # add_named_destination is given a page index, so translate the
        # reference into its position among the target's flattened pages.
        try:
            page_refs = [p.indirect_reference for p in target_pdf.flattened_pages]
            position = page_refs.index(new_page)
        except ValueError:
            return

        target_pdf.add_named_destination(name, position)
180
+
181
+
182
class DirectRefLink:
    """Direct reference link being preserved until we can resolve it correctly."""

    def __init__(self, ref: ArrayObject, source_pdf: Union[PdfReader, None]) -> None:
        """
        ref: an ArrayObject whose first element is the Page indirect object
        source_pdf: accepted so the constructor has the same signature as
            NamedRefLink; it is not stored or used here
        """
        self._ref = ref

    def find_referenced_page(self) -> Union[IndirectObject, None]:
        """
        Return the first element of the destination array.

        Returns None for an empty array instead of raising IndexError, so a
        destination without a target is treated as "nothing to resolve".
        """
        if not self._ref:
            return None
        return self._ref[0]

    def patch_reference(self, target_pdf, new_page: IndirectObject) -> None:
        """
        Overwrite the first element of the destination array with new_page.

        target_pdf: PdfWriter which the new link went into (unused here)
        new_page: reference of the page the link should lead to

        An empty array is left untouched, since it has no slot to overwrite.
        """
        if not self._ref:
            return
        self._ref[0] = new_page
195
+
196
+
149
197
class PdfWriter (PdfDocCommon ):
150
198
"""
151
199
Write a PDF file out, given pages produced by another class or through
@@ -209,6 +257,11 @@ def __init__(
209
257
"""The PDF file identifier,
210
258
defined by the ID in the PDF file's trailer dictionary."""
211
259
260
+ self ._unresolved_links : list [Union [NamedRefLink ,DirectRefLink ]] = []
261
+ "Tracks links in pages added to the writer for resolving later."
262
+ self ._merged_in_pages : List [Tuple [IndirectObject ,IndirectObject ]] = []
263
+ "Tracks pages added to the writer and what page they turned into."
264
+
212
265
if self .incremental :
213
266
if isinstance (fileobj , (str , Path )):
214
267
with open (fileobj , "rb" ) as f :
@@ -482,12 +535,50 @@ def _add_page(
482
535
]
483
536
except Exception :
484
537
pass
538
+
539
+ def _extract_links (new_page : PageObject , old_page : PageObject ) -> List [Union [NamedRefLink ,DirectRefLink ]]:
540
+ new_links = [_build_link (link , new_page ) for link in new_page .get ("/Annots" , [])]
541
+ old_links = [_build_link (link , old_page ) for link in old_page .get ("/Annots" , [])]
542
+
543
+ return [(new_link , old_link ) for (new_link , old_link )
544
+ in zip (new_links , old_links )
545
+ if new_link ]
546
+
547
+ def _build_link (indir_obj : IndirectObject , page : PageObject ) -> Union [NamedRefLink ,DirectRefLink ]:
548
+ if page .pdf :
549
+ src = page .pdf
550
+ else :
551
+ src = None
552
+
553
+ link = indir_obj .get_object ()
554
+ if link ["/Subtype" ] != "/Link" :
555
+ return None
556
+
557
+ if "/A" in link :
558
+ action = link ["/A" ]
559
+ if action .get ("/S" ) != "/GoTo" :
560
+ return None
561
+
562
+ return _create_link (action ["/D" ], src )
563
+
564
+ elif "/Dest" in link :
565
+ return _create_link (link ["/Dest" ], src )
566
+
567
+ return None # nothing we need to do
568
+
569
def _create_link(
    ref: Union[TextStringObject, ArrayObject], src: Union[PdfReader, None]
) -> Union[NamedRefLink, DirectRefLink, None]:
    """
    Wrap a link destination in the matching link class.

    ref: the destination value taken from the annotation or its action
    src: the document the destination was read from, or None

    Returns a NamedRefLink for a TextStringObject and a DirectRefLink for an
    ArrayObject. Any other kind of value yields None, because DirectRefLink
    indexes its reference and would fail on something that is not an array.
    """
    if isinstance(ref, TextStringObject):
        return NamedRefLink(ref, src)
    if isinstance(ref, ArrayObject):
        return DirectRefLink(ref, src)
    return None
574
+
485
575
page = cast (
486
576
"PageObject" , page_org .clone (self , False , excluded_keys ).get_object ()
487
577
)
488
578
if page_org .pdf is not None :
489
579
other = page_org .pdf .pdf_header
490
580
self .pdf_header = _get_max_pdf_version_header (self .pdf_header , other )
581
+
491
582
node , idx = self ._get_page_in_node (index )
492
583
page [NameObject (PA .PARENT )] = node .indirect_reference
493
584
@@ -505,6 +596,16 @@ def _add_page(
505
596
recurse += 1
506
597
if recurse > 1000 :
507
598
raise PyPdfError ("Too many recursive calls!" )
599
+
600
+ if page_org .pdf is not None :
601
+ # the page may contain links to other pages, and those other
602
+ # pages may or may not already be added. we store the
603
+ # information we need, so that we can resolve the references
604
+ # later.
605
+ self ._unresolved_links .extend (_extract_links (page , page_org ))
606
+ self ._merged_in_pages .append ( (page_org .indirect_reference ,
607
+ page .indirect_reference ) )
608
+
508
609
return page
509
610
510
611
def set_need_appearances_writer (self , state : bool = True ) -> None :
@@ -1349,6 +1450,30 @@ def encrypt(
1349
1450
self ._add_object (entry )
1350
1451
self ._encrypt_entry = entry
1351
1452
1453
+ def _resolve_links (self ) -> None :
1454
+ """Patch up links that were added to the document earlier, to
1455
+ make sure they still point to the same pages."""
1456
+
1457
+ def _get_referenced_page_ix (pdf : PdfWriter , ref : IndirectObject ) -> Union [int ,None ]:
1458
+ try :
1459
+ return [
1460
+ p .indirect_reference for p in pdf .flattened_pages
1461
+ ].index (ref )
1462
+ except ValueError :
1463
+ return None
1464
+
1465
+ for (new_link , old_link ) in self ._unresolved_links :
1466
+ old_page = old_link .find_referenced_page ()
1467
+ if not old_page :
1468
+ continue
1469
+ new_page = None
1470
+ for (page_org , page_created ) in self ._merged_in_pages :
1471
+ if page_org == old_page :
1472
+ new_page = page_created
1473
+ if new_page is None :
1474
+ continue
1475
+ new_link .patch_reference (self , new_page )
1476
+
1352
1477
def write_stream (self , stream : StreamType ) -> None :
1353
1478
if hasattr (stream , "mode" ) and "b" not in stream .mode :
1354
1479
logger_warning (
@@ -1360,6 +1485,7 @@ def write_stream(self, stream: StreamType) -> None:
1360
1485
# if not self._root:
1361
1486
# self._root = self._add_object(self._root_object)
1362
1487
# self._sweep_indirect_references(self._root)
1488
+ self ._resolve_links ()
1363
1489
1364
1490
if self .incremental :
1365
1491
self ._reader .stream .seek (0 )
0 commit comments