Skip to content

Commit d0b2c8a

Browse files
committed
ENH: Automatically preserve links in added pages
1 parent ae7a064 commit d0b2c8a

File tree

3 files changed

+219
-0
lines changed

3 files changed

+219
-0
lines changed

pypdf/_writer.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,54 @@ def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
146146
return hash.hexdigest()
147147

148148

149+
class NamedRefLink:
    """A link to a named destination, kept until it can be resolved."""

    def __init__(self, ref: TextStringObject, source_pdf: Union[PdfReader, None]):
        """
        Remember the destination name and the document it was read from.

        Args:
            ref: TextStringObject holding the name of the destination.
            source_pdf: Document whose ``named_destinations`` are consulted
                when looking up the name, or ``None`` if there is none.
        """
        self._source_pdf = source_pdf
        self._ref = ref

    def find_referenced_page(self) -> Union[IndirectObject, None]:
        """
        Look up the page this name points to in the source document.

        Returns:
            The ``page`` of the matching named destination, or ``None`` when
            there is no source document or no usable destination entry.
        """
        if not self._source_pdf:
            return None
        found = self._source_pdf.named_destinations.get(str(self._ref))
        return found.page if found else None

    def patch_reference(self, target_pdf, new_page: IndirectObject) -> None:
        """
        Make the name resolvable in the document the link was copied into.

        Nothing happens when ``target_pdf`` already has a named destination
        with this name, or when ``new_page`` is not one of its pages.

        Args:
            target_pdf: PdfWriter which the new link went into.
            new_page: Reference to the page the name should point to.
        """
        name = str(self._ref)
        if name in target_pdf.named_destinations:
            return

        # add_named_destination() is given a page index, so translate the
        # page reference into its position among the writer's pages.
        try:
            page_refs = [p.indirect_reference for p in target_pdf.flattened_pages]
            page_number = page_refs.index(new_page)
        except ValueError:
            return  # the page is not part of the target document

        target_pdf.add_named_destination(name, page_number)
180+
181+
182+
class DirectRefLink:
    """A link that references its page directly, kept until it can be resolved."""

    def __init__(self, ref: ArrayObject, source_pdf: Union[PdfReader, None] = None):
        """
        Remember the destination array of the link.

        Args:
            ref: An ArrayObject whose first element is the page indirect
                object.
            source_pdf: Unused; accepted so that both link classes can be
                constructed with the same arguments.
        """
        self._ref = ref

    def find_referenced_page(self) -> Union[IndirectObject, None]:
        """
        Return the page the destination array points to.

        Returns:
            The first element of the array, or ``None`` when the array is
            empty (a malformed destination) so callers can skip the link
            instead of failing with an IndexError.
        """
        if not self._ref:
            return None
        return self._ref[0]

    def patch_reference(self, target_pdf, new_page: IndirectObject) -> None:
        """
        Point the destination array at ``new_page``.

        Args:
            target_pdf: PdfWriter which the new link went into (not used).
            new_page: Reference to the page the link should open.
        """
        if not self._ref:
            return  # nothing to overwrite in an empty destination array
        self._ref[0] = new_page
195+
196+
149197
class PdfWriter(PdfDocCommon):
150198
"""
151199
Write a PDF file out, given pages produced by another class or through
@@ -209,6 +257,11 @@ def __init__(
209257
"""The PDF file identifier,
210258
defined by the ID in the PDF file's trailer dictionary."""
211259

260+
self._unresolved_links: list[Union[NamedRefLink,DirectRefLink]] = []
261+
"Tracks links in pages added to the writer for resolving later."
262+
self._merged_in_pages: List[Tuple[IndirectObject,IndirectObject]] = []
263+
"Tracks pages added to the writer and what page they turned into."
264+
212265
if self.incremental:
213266
if isinstance(fileobj, (str, Path)):
214267
with open(fileobj, "rb") as f:
@@ -482,12 +535,50 @@ def _add_page(
482535
]
483536
except Exception:
484537
pass
538+
539+
def _extract_links(
    new_page: PageObject, old_page: PageObject
) -> List[
    Tuple[
        Union[NamedRefLink, DirectRefLink],
        Union[NamedRefLink, DirectRefLink],
    ]
]:
    """
    Pair up the internal links of a cloned page with those of its original.

    The annotations of both pages are matched by position, so this assumes
    that ``new_page`` is a clone of ``old_page`` carrying the same
    annotations in the same order.

    Args:
        new_page: The page as it exists in the writer.
        old_page: The page it was cloned from.

    Returns:
        A list of ``(new_link, old_link)`` tuples, one for every annotation
        position where both pages yielded a link.
    """
    new_links = [_build_link(annot, new_page) for annot in new_page.get("/Annots", [])]
    old_links = [_build_link(annot, old_page) for annot in old_page.get("/Annots", [])]

    # Both halves are dereferenced when the links are resolved, so drop any
    # pair in which either side is not a link we handle.
    return [
        (new_link, old_link)
        for new_link, old_link in zip(new_links, old_links)
        if new_link and old_link
    ]
546+
547+
def _build_link(
    indir_obj: IndirectObject, page: PageObject
) -> Union[NamedRefLink, DirectRefLink, None]:
    """
    Wrap a page annotation in a link object if it is an internal link.

    Only ``/Link`` annotations are considered, and of those only the ones
    that carry a ``/GoTo`` action with a ``/D`` entry or a ``/Dest`` entry.

    Args:
        indir_obj: Entry of the page's ``/Annots`` array.
        page: The page owning the annotation; its ``pdf`` attribute is
            passed on as the source document of the link.

    Returns:
        The link object produced by ``_create_link``, or ``None`` when the
        annotation is nothing we need to handle.
    """
    link = indir_obj.get_object()
    # Annotations can be malformed: tolerate non-dictionary entries and a
    # missing /Subtype instead of raising.
    if not isinstance(link, dict) or link.get("/Subtype") != "/Link":
        return None

    src = page.pdf or None

    if "/A" in link:
        action = link["/A"]
        if not isinstance(action, dict) or action.get("/S") != "/GoTo":
            return None
        if "/D" not in action:
            return None  # a GoTo action without a destination
        return _create_link(action["/D"], src)

    if "/Dest" in link:
        return _create_link(link["/Dest"], src)

    return None  # nothing we need to do
568+
569+
def _create_link(
    ref: Union[TextStringObject, ArrayObject], src: Union[PdfReader, None]
) -> Union[NamedRefLink, DirectRefLink, None]:
    """
    Build the link object matching the kind of destination given.

    Args:
        ref: The destination of a link: a text string naming a destination,
            or an array whose first element is the target page.
        src: The document the link was read from, if any.

    Returns:
        A ``NamedRefLink`` for a text string, a ``DirectRefLink`` for an
        array, and ``None`` for any other kind of object.
    """
    if isinstance(ref, TextStringObject):
        return NamedRefLink(ref, src)
    if isinstance(ref, ArrayObject):
        return DirectRefLink(ref, src)
    # Any other object cannot be indexed like a destination array, so it is
    # left untouched rather than wrapped as a direct link.
    return None
574+
485575
page = cast(
486576
"PageObject", page_org.clone(self, False, excluded_keys).get_object()
487577
)
488578
if page_org.pdf is not None:
489579
other = page_org.pdf.pdf_header
490580
self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)
581+
491582
node, idx = self._get_page_in_node(index)
492583
page[NameObject(PA.PARENT)] = node.indirect_reference
493584

@@ -505,6 +596,16 @@ def _add_page(
505596
recurse += 1
506597
if recurse > 1000:
507598
raise PyPdfError("Too many recursive calls!")
599+
600+
if page_org.pdf is not None:
601+
# the page may contain links to other pages, and those other
602+
# pages may or may not already be added. we store the
603+
# information we need, so that we can resolve the references
604+
# later.
605+
self._unresolved_links.extend(_extract_links(page, page_org))
606+
self._merged_in_pages.append( (page_org.indirect_reference,
607+
page.indirect_reference) )
608+
508609
return page
509610

510611
def set_need_appearances_writer(self, state: bool = True) -> None:
@@ -1349,6 +1450,30 @@ def encrypt(
13491450
self._add_object(entry)
13501451
self._encrypt_entry = entry
13511452

1453+
def _resolve_links(self) -> None:
    """
    Patch up links that were added to the document earlier, to make sure
    they still point to the same pages.

    Every entry of ``self._unresolved_links`` is a ``(new_link, old_link)``
    pair. The old link tells which source page was referenced; that page is
    looked up in ``self._merged_in_pages`` (pairs of source page and the
    page created from it) and the new link is patched to point at the
    created page. Links whose target page cannot be determined, or was
    never added to this writer, are left untouched.
    """
    for new_link, old_link in self._unresolved_links:
        old_page = old_link.find_referenced_page()
        if not old_page:
            continue

        # Search newest-first: when a source page was added more than once,
        # the most recently created copy is the one that is linked.
        new_page = next(
            (
                created
                for source, created in reversed(self._merged_in_pages)
                if source == old_page
            ),
            None,
        )
        if new_page is None:
            continue

        new_link.patch_reference(self, new_page)
1476+
13521477
def write_stream(self, stream: StreamType) -> None:
13531478
if hasattr(stream, "mode") and "b" not in stream.mode:
13541479
logger_warning(
@@ -1360,6 +1485,7 @@ def write_stream(self, stream: StreamType) -> None:
13601485
# if not self._root:
13611486
# self._root = self._add_object(self._root_object)
13621487
# self._sweep_indirect_references(self._root)
1488+
self._resolve_links()
13631489

13641490
if self.incremental:
13651491
self._reader.stream.seek(0)

tests/example_files.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,7 @@
110110
url: https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf
111111
- local_filename: iss3268.pdf
112112
url: https://github.com/user-attachments/files/20060394/broken.pdf
113+
- local_filename: direct-link.pdf
114+
url: https://github.com/user-attachments/files/20348304/tst.pdf
115+
- local_filename: named-reference.pdf
116+
url: https://github.com/user-attachments/files/20455804/MinimalJob.pdf

tests/test_merger.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,3 +409,92 @@ def test_deprecate_pdfmerger():
409409
def test_get_reference():
    """get_reference() of a page yields that page's indirect reference."""
    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
    first_page = writer.pages[0]
    assert writer.get_reference(first_page) == first_page.indirect_reference
412+
413+
414+
@pytest.mark.enable_socket
def test_direct_link_preserved(pdf_file_path):
    """A direct link between two appended pages survives writing."""
    # Any PDF would do as the document the pages are appended to.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # direct-link.pdf has a direct link from its page 1 to its page 2.
    linked = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    for page in linked.pages:
        writer.add_page(page)

    writer.write(pdf_file_path)

    result = PdfReader(pdf_file_path)
    annotation = result.pages[2]["/Annots"][0].get_object()
    assert annotation["/Subtype"] == "/Link"
    target = annotation["/Dest"][0]  # indirect reference of the linked page

    expected = result.flattened_pages[3].indirect_reference
    assert target == expected, "Link from page 3 to page 4 is broken"
435+
436+
437+
@pytest.mark.enable_socket
def test_direct_link_preserved_reordering(pdf_file_path):
    """A direct link still reaches its page after another page is inserted."""
    # Any PDF would do as the document the pages are appended to.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # direct-link.pdf has a direct link from its page 1 to its page 2.
    linked = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    for page in linked.pages:
        writer.add_page(page)

    # Insert a page between the two linked pages to disturb the page order.
    writer.insert_page(base.pages[0], 3)

    writer.write(pdf_file_path)

    result = PdfReader(pdf_file_path)
    annotation = result.pages[2]["/Annots"][0].get_object()
    assert annotation["/Subtype"] == "/Link"
    target = annotation["/Dest"][0]  # indirect reference of the linked page

    # The linked page moved one position further back.
    expected = result.flattened_pages[4].indirect_reference
    assert target == expected, "Link from page 3 to page 5 is broken"
461+
462+
463+
@pytest.mark.enable_socket
def test_direct_link_page_missing(pdf_file_path):
    """Writing works when the page a direct link points to was not added."""
    # Any PDF would do as the document the page is appended to.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # direct-link.pdf has a direct link from its page 1 to its page 2;
    # only page 1 is added, so the link target is absent from the writer.
    linked = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    writer.add_page(linked.pages[0])

    # Verify that nothing crashes.
    writer.write(pdf_file_path)
475+
476+
477+
@pytest.mark.enable_socket
def test_named_reference_preserved(pdf_file_path):
    """Links through named destinations reach the right page after merging."""
    # this could be any PDF -- we don't care which
    reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=reader)

    # this PDF has a named reference from p3 to p5
    merger = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf")))
    for p in merger.pages:
        writer.add_page(p)

    writer.write(pdf_file_path)

    check = PdfReader(pdf_file_path)
    page5 = check.pages[4]
    page7 = check.flattened_pages[6]

    links = page5["/Annots"]
    # Without this, the per-link checks below would pass on an empty list.
    assert len(links) > 0, "Page 5 has no link annotations to check"

    for link in links:
        action = link["/A"]
        assert action.get("/S") == "/GoTo"
        dest = str(action["/D"])
        assert dest in check.named_destinations
        pref = check.named_destinations[dest].page

        assert pref == page7.indirect_reference, "Link from page 5 to page 7 is broken"

0 commit comments

Comments
 (0)