Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add capability to remove /Info from pypdf #2820

Merged
merged 50 commits into from
Sep 14, 2024
Merged
Show file tree
Hide file tree
Changes from 49 commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
fba73a4
ENH: add incremental capability to PdfWriter
pubpub-zz Aug 23, 2024
0543709
fix test
pubpub-zz Aug 24, 2024
29030d4
fixes + first test
pubpub-zz Aug 25, 2024
1067b74
coverage
pubpub-zz Aug 25, 2024
f1d3fbe
coverage
pubpub-zz Aug 25, 2024
ae97bc7
cope with multiple level pages
pubpub-zz Aug 26, 2024
d9a99d9
test + doc
pubpub-zz Aug 26, 2024
3c4cfdc
coverage
pubpub-zz Aug 26, 2024
38d4b35
coverage
pubpub-zz Aug 26, 2024
79eca73
coverage
pubpub-zz Aug 26, 2024
290c5a6
coverage
pubpub-zz Aug 26, 2024
173578d
coverage
pubpub-zz Aug 26, 2024
b2b0c9e
Merge branch 'main' into incremental
pubpub-zz Aug 26, 2024
1a6eda5
simplification
pubpub-zz Aug 26, 2024
d43d25b
coverage
pubpub-zz Aug 27, 2024
7e2e74d
Merge branch 'main' into incremental
pubpub-zz Aug 27, 2024
708e449
Merge branch 'main' into incremental
pubpub-zz Aug 28, 2024
c9a6c95
ENH: add capability to remove /Info from pypdf
pubpub-zz Aug 28, 2024
5147266
coverage
pubpub-zz Aug 28, 2024
ec9aafe
oups
pubpub-zz Aug 28, 2024
ff76e02
Merge remote-tracking branch 'py-pdf/main' into incremental
pubpub-zz Sep 1, 2024
14a93f1
move to X-reference stream for increment
pubpub-zz Sep 1, 2024
53e141f
coverage
pubpub-zz Sep 1, 2024
b4b7c1b
coverage
pubpub-zz Sep 1, 2024
7bc3abd
coverage
pubpub-zz Sep 1, 2024
ffa2f0c
fix
pubpub-zz Sep 1, 2024
b072952
mypy
pubpub-zz Sep 1, 2024
ca724f9
Merge branch 'incremental' into info_clean
pubpub-zz Sep 1, 2024
454c4fe
coverage
pubpub-zz Sep 1, 2024
a3b6246
Merge branch 'main' into incremental
pubpub-zz Sep 5, 2024
08fd21a
Merge branch 'main' into info_clean
pubpub-zz Sep 5, 2024
494e00a
Update pypdf/_doc_common.py
pubpub-zz Sep 8, 2024
eba1c9f
Update pypdf/_doc_common.py
pubpub-zz Sep 8, 2024
d68db51
Update pypdf/_doc_common.py
pubpub-zz Sep 8, 2024
8b3182d
Update pypdf/_doc_common.py
pubpub-zz Sep 8, 2024
fe6aac7
Update pypdf/_writer.py
pubpub-zz Sep 8, 2024
0be4bb4
Update pypdf/_writer.py
pubpub-zz Sep 8, 2024
4c585c0
Update pypdf/_writer.py
pubpub-zz Sep 8, 2024
fbe54d0
Update pypdf/_writer.py
pubpub-zz Sep 8, 2024
e3c1e2c
Update pypdf/_writer.py
pubpub-zz Sep 8, 2024
6e65943
clarify assert mypy
pubpub-zz Sep 8, 2024
4121672
doc hash_bin
pubpub-zz Sep 8, 2024
bcc5c1d
doc hash_bin
pubpub-zz Sep 8, 2024
02ac507
Merge branch 'main' into incremental
pubpub-zz Sep 8, 2024
bc6caba
Update pypdf/_page.py
stefan6419846 Sep 8, 2024
8659de2
Update pypdf/_writer.py
stefan6419846 Sep 8, 2024
99e6dfc
Apply suggestions from code review
stefan6419846 Sep 8, 2024
9234a03
Merge remote-tracking branch 'origin/incremental' into info_clean
pubpub-zz Sep 9, 2024
8f121b2
Merge remote-tracking branch 'py-pdf/main' into info_clean
pubpub-zz Sep 13, 2024
be48872
improve docs
stefan6419846 Sep 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions docs/user/metadata.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,30 @@ writer.add_metadata(
}
)

# Clear all entries but keep the (now empty) /Info dictionary in the PDF
writer.metadata = {}

# Replace all entries with a new set of entries
writer.metadata = {
"/Author": "Martin",
"/Producer": "Libre Writer",
}

# Save the new PDF to a file
with open("meta-pdf.pdf", "wb") as f:
writer.write(f)
```

## Removing the metadata entry

```python
from pypdf import PdfWriter

writer = PdfWriter("example.pdf")

# Remove the metadata (/Info entry)
writer.metadata = None

# Save the new PDF to a file
with open("meta-pdf.pdf", "wb") as f:
writer.write(f)
Expand Down
85 changes: 63 additions & 22 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
)

from ._cmap import _default_fonts_space_width, build_char_map_from_dict
from ._doc_common import PdfDocCommon
from ._doc_common import DocumentInformation, PdfDocCommon
from ._encryption import EncryptAlgorithm, Encryption
from ._page import PageObject
from ._page_labels import nums_clear_range, nums_insert, nums_next
Expand Down Expand Up @@ -194,7 +194,7 @@ def __init__(
"""

self._ID: Union[ArrayObject, None] = None
self._info_obj: PdfObject
self._info_obj: Optional[PdfObject]

if self.incremental:
if isinstance(fileobj, (str, Path)):
Expand Down Expand Up @@ -309,13 +309,26 @@ def _info(self) -> Optional[DictionaryObject]:
Returns:
/Info Dictionary; None if the entry does not exist
"""
return cast(DictionaryObject, self._info_obj.get_object())
return (
None
if self._info_obj is None
else cast(DictionaryObject, self._info_obj.get_object())
)

@_info.setter
def _info(self, value: Union[IndirectObject, DictionaryObject]) -> None:
obj = cast(DictionaryObject, self._info_obj.get_object())
obj.clear()
obj.update(cast(DictionaryObject, value.get_object()))
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
if value is None:
try:
self._objects[self._info_obj.indirect_reference.idnum - 1] = None # type: ignore
except (KeyError, AttributeError):
pass
self._info_obj = None
else:
if self._info_obj is None:
self._info_obj = self._add_object(DictionaryObject())
obj = cast(DictionaryObject, self._info_obj.get_object())
obj.clear()
obj.update(cast(DictionaryObject, value.get_object()))

@property
def xmp_metadata(self) -> Optional[XmpInformation]:
Expand Down Expand Up @@ -1186,6 +1199,7 @@ def clone_reader_document_root(self, reader: PdfReader) -> None:
self._objects = [None] * cast(int, reader.trailer["/Size"])
else:
self._objects.clear()
self._info_obj = None
self._root_object = reader.root_object.clone(self)
self._pages = self._root_object.raw_get("/Pages")

Expand Down Expand Up @@ -1226,22 +1240,21 @@ def clone_document_from_reader(
document.
"""
self.clone_reader_document_root(reader)
if TK.INFO in reader.trailer:
inf = reader._info
if self.incremental:
if inf is not None:
self._info_obj = cast(
IndirectObject, inf.clone(self).indirect_reference
)
self._original_hash[
cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1
] = cast(DictionaryObject, self._info_obj.get_object()).hash_bin()
elif inf is not None:
self._info_obj = self._add_object(
DictionaryObject(cast(DictionaryObject, inf.get_object()))
inf = reader._info
if self.incremental:
if inf is not None:
self._info_obj = cast(
IndirectObject, inf.clone(self).indirect_reference
)
else:
self._info_obj = self._add_object(DictionaryObject())
assert isinstance(self._info, DictionaryObject), "for mypy"
self._original_hash[
self._info_obj.indirect_reference.idnum - 1
] = self._info.hash_bin()
elif inf is not None:
self._info_obj = self._add_object(
DictionaryObject(cast(DictionaryObject, inf.get_object()))
)
# else: nothing to do, _info_obj was already reset to None in clone_reader_document_root()

try:
self._ID = cast(ArrayObject, reader._ID).clone(self)
Expand Down Expand Up @@ -1547,6 +1560,34 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
trailer.write_to_stream(stream)
stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode()) # eof

@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Args:
        value: Dictionary with the entries to set. If None, the /Info entry
            is removed from the PDF.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function.
    """
    return super().metadata

@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]],
) -> None:
    # None means "drop the /Info entry altogether".
    if value is None:
        self._info = None
        return

    # Start from an empty /Info dictionary: create one when the entry is
    # missing, otherwise empty the existing one so that stale keys do not
    # survive the replacement.
    if self._info is None:
        self._info = DictionaryObject()
    else:
        self._info.clear()
    self.add_metadata(value)

def add_metadata(self, infos: Dict[str, Any]) -> None:
"""
Add custom metadata to the output.
Expand Down
36 changes: 34 additions & 2 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1795,9 +1795,33 @@ def test_missing_info():

writer = PdfWriter(clone_from=reader)
assert len(writer.pages) == len(reader.pages)
assert writer.metadata is None
b = BytesIO()
writer.write(b)
assert b"/Info" not in b.getvalue()

reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
writer._info = reader._info
writer.metadata = reader.metadata
assert dict(writer._info) == dict(reader._info)
assert writer.metadata == reader.metadata
b = BytesIO()
writer.write(b)
assert b"/Info" in b.getvalue()

writer.metadata = {}
writer._info = DictionaryObject() # for code coverage
b = BytesIO()
writer.write(b)
assert b"/Info" in b.getvalue()
assert writer.metadata == {}

writer.metadata = None
writer.metadata = None # for code coverage
assert writer.metadata is None
assert PdfWriter().metadata == {"/Producer": "pypdf"}
b = BytesIO()
writer.write(b)
assert b"/Info" not in b.getvalue()


@pytest.mark.enable_socket()
Expand Down Expand Up @@ -2417,6 +2441,8 @@ def test_increment_writer(caplog):
writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True)
# 1 object is modified: page 0 inherits MediaBox so is changed
assert len(writer.list_objects_in_increment()) == 1
b = BytesIO()
writer.write(b)

writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=False)
# 1 object is modified: page 0 inherits MediaBox so is changed
Expand All @@ -2438,7 +2464,13 @@ def test_increment_writer(caplog):

# clone without info
writer = PdfWriter(RESOURCE_ROOT / "missing_info.pdf", incremental=True)
assert len(writer.list_objects_in_increment()) == 0
assert writer.metadata is None
writer.metadata = {}
assert writer.metadata == {}
assert len(writer.list_objects_in_increment()) == 1
assert writer._info == {}
writer.metadata = None
assert len(writer.list_objects_in_increment()) == 0
assert writer.metadata is None
b = BytesIO()
writer.write(b)
Loading