@_info.setter
def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
    """
    Replace the content of the /Info dictionary, or remove it.

    Args:
        value: Dictionary (or indirect reference to one) whose entries
            replace the current /Info entries. If None, the /Info object
            is dropped from the writer.
    """
    if value is None:
        try:
            # Blank the slot that held the /Info object. ``_objects`` is a
            # list, so a stale index raises IndexError; KeyError is kept for
            # backward compatibility. AttributeError covers ``_info_obj``
            # (or its indirect reference) already being None.
            self._objects[self._info_obj.indirect_reference.idnum - 1] = None  # type: ignore
        except (IndexError, KeyError, AttributeError):
            pass
        self._info_obj = None
        return

    if self._info_obj is None:
        # No /Info yet: register an empty dictionary to receive the entries.
        self._info_obj = self._add_object(DictionaryObject())
    target = cast(DictionaryObject, self._info_obj.get_object())
    source = cast(DictionaryObject, value.get_object())
    if source is target:
        # Self-assignment: clearing the target would also empty the source,
        # so there is nothing to copy and nothing to do.
        return
    target.clear()
    target.update(source)
@property
def metadata(self) -> Optional[DocumentInformation]:
    """
    Retrieve/set the PDF file's document information dictionary, if it exists.

    Assigning a dictionary replaces all existing entries with the given ones
    (an empty dictionary clears the data but keeps the /Info entry);
    assigning None removes the /Info entry from the PDF.

    Args:
        value: Dictionary with the entries to set. If None, remove the /Info entry from the PDF.

    Returns:
        The document information, or None if there is no /Info entry.

    Note that some PDF files use (XMP) metadata streams instead of document
    information dictionaries, and these metadata streams will not be
    accessed by this function.
    """
    return super().metadata


@metadata.setter
def metadata(
    self,
    value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]],
) -> None:
    if value is None:
        # Drop the /Info object itself, not just its entries.
        self._info = None
        return

    current = self._info
    if current is None:
        # No /Info yet: create an empty one for add_metadata() to fill.
        self._info = DictionaryObject()
    elif value is not current:
        # Replace semantics: discard the old entries first. Skipped when the
        # caller passes the live /Info dictionary itself, because clearing
        # it would also empty ``value`` before it is read.
        current.clear()
    self.add_metadata(value)
diff --git a/tests/test_writer.py b/tests/test_writer.py index 7b9cbf003..e06db389b 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1795,9 +1795,33 @@ def test_missing_info(): writer = PdfWriter(clone_from=reader) assert len(writer.pages) == len(reader.pages) + assert writer.metadata is None + b = BytesIO() + writer.write(b) + assert b"/Info" not in b.getvalue() + reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") - writer._info = reader._info + writer.metadata = reader.metadata assert dict(writer._info) == dict(reader._info) + assert writer.metadata == reader.metadata + b = BytesIO() + writer.write(b) + assert b"/Info" in b.getvalue() + + writer.metadata = {} + writer._info = DictionaryObject() # for code coverage + b = BytesIO() + writer.write(b) + assert b"/Info" in b.getvalue() + assert writer.metadata == {} + + writer.metadata = None + writer.metadata = None # for code coverage + assert writer.metadata is None + assert PdfWriter().metadata == {"/Producer": "pypdf"} + b = BytesIO() + writer.write(b) + assert b"/Info" not in b.getvalue() @pytest.mark.enable_socket() @@ -2417,6 +2441,8 @@ def test_increment_writer(caplog): writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True) # 1 object is modified: page 0 inherits MediaBox so is changed assert len(writer.list_objects_in_increment()) == 1 + b = BytesIO() + writer.write(b) writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=False) # 1 object is modified: page 0 inherits MediaBox so is changed @@ -2438,7 +2464,13 @@ def test_increment_writer(caplog): # clone without info writer = PdfWriter(RESOURCE_ROOT / "missing_info.pdf", incremental=True) + assert len(writer.list_objects_in_increment()) == 0 + assert writer.metadata is None + writer.metadata = {} + assert writer.metadata == {} assert len(writer.list_objects_in_increment()) == 1 - assert writer._info == {} + writer.metadata = None + assert len(writer.list_objects_in_increment()) == 0 + assert writer.metadata is None 
b = BytesIO() writer.write(b)