Skip to content

Commit

Permalink
ENH: Tolerate PDF with invalid xref pointed objects (#2335)
Browse files Browse the repository at this point in the history
Closes #2326
  • Loading branch information
pubpub-zz authored Mar 30, 2024
1 parent 7883580 commit 3a6e4d0
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 12 deletions.
19 changes: 19 additions & 0 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,25 @@ def read(self, stream: StreamType) -> None:
# non-zero-index is actually correct
stream.seek(loc, 0) # return to where it was

# remove xref entries whose offset does not point to a readable object header - cf #2326
if not self.strict:
loc = stream.tell()
for gen, xref_entry in self.xref.items():
if gen == 65535:
continue
ids = list(xref_entry.keys())
for id in ids:
stream.seek(xref_entry[id], 0)
try:
self.read_object_header(stream)
except ValueError:
logger_warning(
f"Ignoring wrong pointing object {id} {gen} (offset {xref_entry[id]})",
__name__,
)
del xref_entry[id]  # safe to delete here: the loop iterates over `ids`, a list copy of the keys
stream.seek(loc, 0) # return to where it was

def _basic_validation(self, stream: StreamType) -> None:
"""Ensure file is not empty. Read at most 5 bytes."""
stream.seek(0, os.SEEK_SET)
Expand Down
25 changes: 14 additions & 11 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,9 @@ def test_get_images(src, expected_images):
False,
-1,
False,
["startxref on same line as offset"],
[
"startxref on same line as offset",
],
),
(
False,
Expand Down Expand Up @@ -322,11 +324,12 @@ def test_get_images_raw(
b"%%%%EOF"
)
pdf_data = pdf_data % (
pdf_data.find(b"1 0 obj"),
pdf_data.find(b"2 0 obj"),
pdf_data.find(b"3 0 obj"),
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
# subtract 1 from each find() result below: the double % in the template becomes a single % once formatted
pdf_data.find(b"1 0 obj") - 1,
pdf_data.find(b"2 0 obj") - 1,
pdf_data.find(b"3 0 obj") - 1,
pdf_data.find(b"4 0 obj") - 1,
pdf_data.find(b"5 0 obj") - 1,
b"/Prev 0 " if with_prev_0 else b"",
# startx_correction should be -1 due to double % at the beginning
# inducing an error on startxref computation
Expand Down Expand Up @@ -593,11 +596,11 @@ def test_read_unknown_zero_pages(caplog):
b"%%%%EOF"
)
pdf_data = pdf_data % (
pdf_data.find(b"1 0 obj"),
pdf_data.find(b"2 0 obj"),
pdf_data.find(b"3 0 obj"),
pdf_data.find(b"4 0 obj"),
pdf_data.find(b"5 0 obj"),
pdf_data.find(b"1 0 obj") - 1,
pdf_data.find(b"2 0 obj") - 1,
pdf_data.find(b"3 0 obj") - 1,
pdf_data.find(b"4 0 obj") - 1,
pdf_data.find(b"5 0 obj") - 1,
pdf_data.find(b"xref") - 1,
)
pdf_stream = io.BytesIO(pdf_data)
Expand Down
6 changes: 5 additions & 1 deletion tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,11 @@ def test_get_fields_warns(tmp_path, caplog, url, name):
retrieved_fields = reader.get_fields(fileobj=fp)

assert retrieved_fields == {}
assert normalize_warnings(caplog.text) == ["Object 2 0 not defined."]
assert normalize_warnings(caplog.text) == [
"Ignoring wrong pointing object 1 65536 (offset 0)",
"Ignoring wrong pointing object 2 65536 (offset 0)",
"Object 2 0 not defined.",
]


@pytest.mark.enable_socket()
Expand Down

0 comments on commit 3a6e4d0

Please sign in to comment.