From fba73a47fc08a28b6b7d013104e2d322039e9cae Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Fri, 23 Aug 2024 23:05:37 +0200
Subject: [PATCH 01/40] ENH:  add incremental capability to PdfWriter

closes #2780
---
 pypdf/_doc_common.py              |  22 ++-
 pypdf/_page.py                    |  12 ++
 pypdf/_protocols.py               |   3 +
 pypdf/_reader.py                  |   2 +
 pypdf/_writer.py                  | 230 +++++++++++++++++++++++-------
 pypdf/constants.py                |   7 +-
 pypdf/generic/_base.py            |  90 +++++++++++-
 pypdf/generic/_data_structures.py |  26 ++++
 tests/test_reader.py              |   2 +-
 9 files changed, 336 insertions(+), 58 deletions(-)

diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
index 4f607340d..12848fb8e 100644
--- a/pypdf/_doc_common.py
+++ b/pypdf/_doc_common.py
@@ -254,6 +254,8 @@ class PdfDocCommon:
 
     _encryption: Optional[Encryption] = None
 
+    _readonly: bool = False
+
     @property
     @abstractmethod
     def root_object(self) -> DictionaryObject:
@@ -349,7 +351,7 @@ def get_num_pages(self) -> int:
             return self.root_object["/Pages"]["/Count"]  # type: ignore
         else:
             if self.flattened_pages is None:
-                self._flatten()
+                self._flatten(self._readonly)
             assert self.flattened_pages is not None
             return len(self.flattened_pages)
 
@@ -366,7 +368,7 @@ def get_page(self, page_number: int) -> PageObject:
             A :class:`PageObject<pypdf._page.PageObject>` instance.
         """
         if self.flattened_pages is None:
-            self._flatten()
+            self._flatten(self._readonly)
         assert self.flattened_pages is not None, "hint for mypy"
         return self.flattened_pages[page_number]
 
@@ -1082,10 +1084,19 @@ def page_mode(self) -> Optional[PagemodeType]:
 
     def _flatten(
         self,
+        list_only: bool = False,
         pages: Union[None, DictionaryObject, PageObject] = None,
         inherit: Optional[Dict[str, Any]] = None,
         indirect_reference: Optional[IndirectObject] = None,
     ) -> None:
+        """
+        prepare the document pages to ease searching
+        args:
+            list_only: will only list the pages within flattened_pages
+            pages,
+            inherit,
+            indirect_reference: used recursively to flatten the /Pages object
+        """
         inheritable_page_attributes = (
             NameObject(PG.RESOURCES),
             NameObject(PG.MEDIABOX),
@@ -1122,7 +1133,7 @@ def _flatten(
                 if obj:
                     # damaged file may have invalid child in /Pages
                     try:
-                        self._flatten(obj, inherit, **addt)
+                        self._flatten(list_only, obj, inherit, **addt)
                     except RecursionError:
                         raise PdfReadError(
                             "Maximum recursion depth reached during page flattening."
@@ -1134,7 +1145,8 @@ def _flatten(
                 if attr_in not in pages:
                     pages[attr_in] = value
             page_obj = PageObject(self, indirect_reference)
-            page_obj.update(pages)
+            if not list_only:
+                page_obj.update(pages)
 
             # TODO: Could flattened_pages be None at this point?
             self.flattened_pages.append(page_obj)  # type: ignore
@@ -1158,7 +1170,7 @@ def remove_page(
                 or destinations to reference a detached page.
         """
         if self.flattened_pages is None:
-            self._flatten()
+            self._flatten(self._readonly)
         assert self.flattened_pages is not None
         if isinstance(page, IndirectObject):
             p = page.get_object()
diff --git a/pypdf/_page.py b/pypdf/_page.py
index c51aee1ab..8a8c47eec 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -493,6 +493,18 @@ def __init__(
         # below Union for mypy but actually Optional[List[str]]
         self.indirect_reference = indirect_reference
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        Note: this function is overridden to return the same results
+            as a DictionaryObject
+        """
+        return hash(
+            (DictionaryObject, tuple(((k, v.hash_bin()) for k, v in self.items())))
+        )
+
     def hash_value_data(self) -> bytes:
         data = super().hash_value_data()
         data += b"%d" % id(self)
diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py
index b5fa14879..431db1a11 100644
--- a/pypdf/_protocols.py
+++ b/pypdf/_protocols.py
@@ -74,6 +74,9 @@ class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
     _objects: List[Any]
     _id_translated: Dict[int, Dict[int, int]]
 
+    incremental: bool
+    _reader: Any  # PdfReader
+
     @abstractmethod
     def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
         ...  # pragma: no cover
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
index 1ffcd436d..cd6be5083 100644
--- a/pypdf/_reader.py
+++ b/pypdf/_reader.py
@@ -136,6 +136,7 @@ def __init__(
             with open(stream, "rb") as fh:
                 stream = BytesIO(fh.read())
             self._stream_opened = True
+        self._startxref: int = 0
         self.read(stream)
         self.stream = stream
 
@@ -560,6 +561,7 @@ def read(self, stream: StreamType) -> None:
         self._basic_validation(stream)
         self._find_eof_marker(stream)
         startxref = self._find_startxref_pos(stream)
+        self._startxref = startxref
 
         # check and eventually correct the startxref only in not strict
         xref_issue_nr = self._get_xref_issues(stream, startxref)
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index a72e2a23d..e47679d45 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1,3 +1,6 @@
+# TODO: think about pages to have a global solution without rework;
+# consider the question of inheritance of properties
+
 # Copyright (c) 2006, Mathieu Fenniak
 # Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
 #
@@ -154,10 +157,35 @@ def __init__(
         self,
         fileobj: Union[None, PdfReader, StrByteType, Path] = "",
         clone_from: Union[None, PdfReader, StrByteType, Path] = None,
+        incremental: bool = False,
     ) -> None:
-        self._header = b"%PDF-1.3"
+        self.incremental = incremental
+        if self.incremental:
+            if isinstance(fileobj, (str, Path)):
+                with open(fileobj, "rb") as f:
+                    fileobj = BytesIO(f.read(-1))
+            if isinstance(fileobj, IO):
+                fileobj = BytesIO(fileobj.read(-1))
+            if isinstance(fileobj, BytesIO):
+                fileobj = PdfReader(fileobj)
+            else:
+                raise PyPdfError("Invalid type for incremental mode")
+            self._reader = fileobj  # prev content is in _reader.stream
+            self._header = fileobj.pdf_header.encode()
+            self._readonly = True  # !!!TODO: to be analysed
+        else:
+            self._header = b"%PDF-1.3"
+        """
+        The indirect objects in the PDF.
+        for the incremental it will be filled with None
+        in clone_reader_document_root
+        """
         self._objects: List[Optional[PdfObject]] = []
-        """The indirect objects in the PDF."""
+
+        """
+        list of hashes after import; used to identify changes
+        """
+        self._original_hash: List[int] = []
 
         """Maps hash values of indirect objects to the list of IndirectObjects.
            This is used for compression.
@@ -168,33 +196,7 @@ def __init__(
            dict[id(pdf)][(idnum, generation)]
         """
         self._id_translated: Dict[int, Dict[int, int]] = {}
-
-        # The root of our page tree node.
-        pages = DictionaryObject()
-        pages.update(
-            {
-                NameObject(PA.TYPE): NameObject("/Pages"),
-                NameObject(PA.COUNT): NumberObject(0),
-                NameObject(PA.KIDS): ArrayObject(),
-            }
-        )
-        self._pages = self._add_object(pages)
-        self.flattened_pages = []
-
-        # info object
-        info = DictionaryObject()
-        info.update({NameObject("/Producer"): create_string_object("pypdf")})
-        self._info_obj: PdfObject = self._add_object(info)
-
-        # root object
-        self._root_object = DictionaryObject()
-        self._root_object.update(
-            {
-                NameObject(PA.TYPE): NameObject(CO.CATALOG),
-                NameObject(CO.PAGES): self._pages,
-            }
-        )
-        self._root = self._add_object(self._root_object)
+        self._ID: Union[ArrayObject, None] = None
 
         def _get_clone_from(
             fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
@@ -227,14 +229,44 @@ def _get_clone_from(
         self.temp_fileobj = fileobj
         self.fileobj = ""
         self.with_as_usage = False
+        # The root of our page tree node.
+        pages = DictionaryObject()
+        pages.update(
+            {
+                NameObject(PA.TYPE): NameObject("/Pages"),
+                NameObject(PA.COUNT): NumberObject(0),
+                NameObject(PA.KIDS): ArrayObject(),
+            }
+        )
+        self.flattened_pages = []
+        self._encryption: Optional[Encryption] = None
+        self._encrypt_entry: Optional[DictionaryObject] = None
+        self._info_obj: PdfObject
+
         if clone_from is not None:
             if not isinstance(clone_from, PdfReader):
                 clone_from = PdfReader(clone_from)
             self.clone_document_from_reader(clone_from)
-
-        self._encryption: Optional[Encryption] = None
-        self._encrypt_entry: Optional[DictionaryObject] = None
-        self._ID: Union[ArrayObject, None] = None
+        else:
+            self._pages = self._add_object(pages)
+            # root object
+            self._root_object = DictionaryObject()
+            self._root_object.update(
+                {
+                    NameObject(PA.TYPE): NameObject(CO.CATALOG),
+                    NameObject(CO.PAGES): self._pages,
+                }
+            )
+            self._add_object(self._root_object)
+            # info object
+            info = DictionaryObject()
+            info.update({NameObject("/Producer"): create_string_object("pypdf")})
+            self._info_obj = self._add_object(info)
+        if isinstance(self._ID, list):
+            if isinstance(self._ID[0], TextStringObject):
+                self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
+            if isinstance(self._ID[1], TextStringObject):
+                self._ID[1] = ByteStringObject(self._ID[1].get_original_bytes())
 
     # for commonality
     @property
@@ -1115,18 +1147,29 @@ def clone_reader_document_root(self, reader: PdfReader) -> None:
         Args:
             reader: PdfReader from which the document root should be copied.
         """
-        self._objects.clear()
+        if self.incremental:
+            self._objects = [None] * cast(int, reader.trailer["/Size"])
+        else:
+            self._objects.clear()
         self._root_object = reader.root_object.clone(self)
-        self._root = self._root_object.indirect_reference  # type: ignore[assignment]
         self._pages = self._root_object.raw_get("/Pages")
+
+        assert len(self._objects) <= cast(int, reader.trailer["/Size"])  # for pytest
+        # must be done here before rewriting
+        if self.incremental:
+            self._original_hash = [
+                (obj.hash_bin() if obj is not None else 0) for obj in self._objects
+            ]
         self._flatten()
         assert self.flattened_pages is not None
         for p in self.flattened_pages:
-            p[NameObject("/Parent")] = self._pages
-            self._objects[cast(IndirectObject, p.indirect_reference).idnum - 1] = p
-        cast(DictionaryObject, self._pages.get_object())[
-            NameObject("/Kids")
-        ] = ArrayObject([p.indirect_reference for p in self.flattened_pages])
+            self._replace_object(cast(IndirectObject, p.indirect_reference).idnum, p)
+            if not self.incremental:
+                p[NameObject("/Parent")] = self._pages
+        if not self.incremental:
+            cast(DictionaryObject, self._pages.get_object())[
+                NameObject("/Kids")
+            ] = ArrayObject([p.indirect_reference for p in self.flattened_pages])
 
     def clone_document_from_reader(
         self,
@@ -1148,13 +1191,26 @@ def clone_document_from_reader(
                 document.
         """
         self.clone_reader_document_root(reader)
-        self._info_obj = self._add_object(DictionaryObject())
         if TK.INFO in reader.trailer:
-            self._info = reader._info  # actually copy fields
+            if self.incremental:
+                inf = reader._info
+                if inf is not None:
+                    self._info_obj = cast(
+                        IndirectObject, inf.clone(self).indirect_reference
+                    )
+                self._original_hash[
+                    cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1
+                ] = self._info_obj.hash_bin()
+            else:
+                self._info = reader._info  # actually copy fields
+
+        else:
+            self._info_obj = self._add_object(DictionaryObject())
         try:
             self._ID = cast(ArrayObject, reader._ID).clone(self)
         except AttributeError:
             pass
+
         if callable(after_page_append):
             for page in cast(
                 ArrayObject, cast(DictionaryObject, self._pages.get_object())["/Kids"]
@@ -1257,9 +1313,17 @@ def write_stream(self, stream: StreamType) -> None:
         #   self._root = self._add_object(self._root_object)
         # self._sweep_indirect_references(self._root)
 
-        object_positions, free_objects = self._write_pdf_structure(stream)
-        xref_location = self._write_xref_table(stream, object_positions, free_objects)
-        self._write_trailer(stream, xref_location)
+        if self.incremental:
+            self._reader.stream.seek(0)
+            stream.write(self._reader.stream.read(-1))
+            xref_location = self._write_increment(stream)
+            self._write_trailer(stream, xref_location)
+        else:
+            object_positions, free_objects = self._write_pdf_structure(stream)
+            xref_location = self._write_xref_table(
+                stream, object_positions, free_objects
+            )
+            self._write_trailer(stream, xref_location)
 
     def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
         """
@@ -1291,6 +1355,75 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
 
         return my_file, stream
 
+    def _list_objects_in_increment(self) -> List[IndirectObject]:
+        """
+        For debug / analysis
+        Provides the list of new/modified objects that are to be written
+        """
+        ##        lst = []
+        ##        for i in range(len(self._objects)):
+        ##            if (self._objects[i] is not None and
+        ##                (i >= len(self._original_hash)
+        ##                    or cast(PdfObject,self._objects[i]).hash_bin() != self._original_hash[i]
+        ##                )):
+        ##                    lst.append(self._objects[i].indirect_reference)
+        return [
+            cast(IndirectObject, self._objects[i]).indirect_reference
+            for i in range(len(self._objects))
+            if (
+                self._objects[i] is not None
+                and (
+                    i >= len(self._original_hash)
+                    or cast(PdfObject, self._objects[i]).hash_bin()
+                    != self._original_hash[i]
+                )
+            )
+        ]
+
+    def _write_increment(self, stream: StreamType) -> int:
+        object_positions = {}
+        object_blocks = []
+        current_start = -1
+        current_stop = -2
+        for i, obj in enumerate(self._objects):
+            if self._objects[i] is not None and (
+                i >= len(self._original_hash)
+                or cast(PdfObject, self._objects[i]).hash_bin()
+                != self._original_hash[i]
+            ):
+                idnum = i + 1
+                assert isinstance(obj, PdfObject)  # mypy
+                # first write new/modified object
+                object_positions[idnum] = stream.tell()
+                stream.write(f"{idnum} 0 obj\n".encode())
+                if self._encryption and obj != self._encrypt_entry:
+                    obj = self._encryption.encrypt_object(obj, idnum, 0)
+                obj.write_to_stream(stream)
+                stream.write(b"\nendobj\n")
+
+                # prepare xref
+                if idnum != current_stop:
+                    if current_start > 0:
+                        object_blocks.append(
+                            [current_start, current_stop - current_start]
+                        )
+                    current_start = idnum
+                    current_stop = idnum + 1
+                else:
+                    current_stop = idnum + 1
+        if current_start > 0:
+            object_blocks.append([current_start, current_stop - current_start])
+        # write incremented xref
+        xref_location = stream.tell()
+        stream.write(b"xref\n")
+        stream.write(b"0 1\n")
+        stream.write(b"0000000000 65535 f \n")
+        for block in object_blocks:
+            stream.write(f"{block[0]} {block[1]}\n".encode())
+            for i in range(block[0], block[0] + block[1]):
+                stream.write(f"{object_positions[i]:0>10} {0:0>5} n \n".encode())
+        return xref_location
+
     def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
         object_positions = []
         free_objects = []  # will contain list of all free entries
@@ -1337,14 +1470,15 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
             of certain special objects within the body of the file.
         """
         stream.write(b"trailer\n")
-        trailer = DictionaryObject()
-        trailer.update(
+        trailer = DictionaryObject(
             {
                 NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
-                NameObject(TK.ROOT): self._root,
+                NameObject(TK.ROOT): self.root_object.indirect_reference,
                 NameObject(TK.INFO): self._info_obj,
             }
         )
+        if self.incremental:
+            trailer[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
         if self._ID:
             trailer[NameObject(TK.ID)] = self._ID
         if self._encrypt_entry:
diff --git a/pypdf/constants.py b/pypdf/constants.py
index 745774e2a..a7e67aacc 100644
--- a/pypdf/constants.py
+++ b/pypdf/constants.py
@@ -33,6 +33,7 @@ class TrailerKeys:
     ID = "/ID"
     INFO = "/Info"
     SIZE = "/Size"
+    PREV = "/Prev"
 
 
 class CatalogAttributes:
@@ -209,7 +210,7 @@ class PagesAttributes:
     PARENT = "/Parent"  # dictionary, required; indirect reference to pages object
     KIDS = "/Kids"  # array, required; List of indirect references
     COUNT = "/Count"  # integer, required; the number of leaf nodes (page objects)
-                      # that are descendants of this node within the page tree
+    # that are descendants of this node within the page tree
 
 
 class PageAttributes:
@@ -217,7 +218,9 @@ class PageAttributes:
 
     TYPE = "/Type"  # name, required; must be /Page
     PARENT = "/Parent"  # dictionary, required; a pages object
-    LAST_MODIFIED = "/LastModified"  # date, optional; date and time of last modification
+    LAST_MODIFIED = (
+        "/LastModified"  # date, optional; date and time of last modification
+    )
     RESOURCES = "/Resources"  # dictionary, required if there are any
     MEDIABOX = "/MediaBox"  # rectangle, required; rectangle specifying page size
     CROPBOX = "/CropBox"  # rectangle, optional
diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
index f48dc66c3..9dfb25a29 100644
--- a/pypdf/generic/_base.py
+++ b/pypdf/generic/_base.py
@@ -53,6 +53,16 @@ class PdfObject(PdfObjectProtocol):
     hash_func: Callable[..., "hashlib._Hash"] = hashlib.sha1
     indirect_reference: Optional["IndirectObject"]
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not implement .hash_bin() so far"
+        )
+
     def hash_value_data(self) -> bytes:
         return ("%s" % self).encode()
 
@@ -121,7 +131,15 @@ def _reference_clone(
             ind = self.indirect_reference
         except AttributeError:
             return clone
-        i = len(pdf_dest._objects) + 1
+        if (
+            pdf_dest.incremental
+            and ind is not None
+            and ind.pdf == pdf_dest._reader
+            and ind.idnum <= len(pdf_dest._objects)
+        ):
+            i = ind.idnum
+        else:
+            i = len(pdf_dest._objects) + 1
         if ind is not None:
             if id(ind.pdf) not in pdf_dest._id_translated:
                 pdf_dest._id_translated[id(ind.pdf)] = {}
@@ -136,7 +154,11 @@ def _reference_clone(
                 assert obj is not None
                 return obj
             pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i
-        pdf_dest._objects.append(clone)
+        try:
+            pdf_dest._objects[i - 1] = clone
+        except IndexError:
+            pdf_dest._objects.append(clone)
+            i = len(pdf_dest._objects)
         clone.indirect_reference = IndirectObject(i, 0, pdf_dest)
         return clone
 
@@ -162,6 +184,14 @@ def clone(
             "NullObject", self._reference_clone(NullObject(), pdf_dest, force_duplicate)
         )
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        return hash((self.__class__,))
+
     def write_to_stream(
         self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
     ) -> None:
@@ -198,6 +228,14 @@ def clone(
             self._reference_clone(BooleanObject(self.value), pdf_dest, force_duplicate),
         )
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        return hash((self.__class__, self.value))
+
     def __eq__(self, __o: object) -> bool:
         if isinstance(__o, BooleanObject):
             return self.value == __o.value
@@ -242,6 +280,14 @@ def __init__(self, idnum: int, generation: int, pdf: Any) -> None:  # PdfReader
     def __hash__(self) -> int:
         return hash((self.idnum, self.generation, id(self.pdf)))
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))
+
     def clone(
         self,
         pdf_dest: PdfWriterProtocol,
@@ -400,6 +446,14 @@ def clone(
             self._reference_clone(FloatObject(self), pdf_dest, force_duplicate),
         )
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        return hash((self.__class__, self.as_numeric))
+
     def myrepr(self) -> str:
         if self == 0:
             return "0.0"
@@ -445,6 +499,14 @@ def clone(
             self._reference_clone(NumberObject(self), pdf_dest, force_duplicate),
         )
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        return hash((self.__class__, self.as_numeric()))
+
     def as_numeric(self) -> int:
         return int(repr(self).encode("utf8"))
 
@@ -488,6 +550,14 @@ def clone(
             ),
         )
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        return hash((self.__class__, bytes(self)))
+
     @property
     def original_bytes(self) -> bytes:
         """For compatibility with TextStringObject.original_bytes."""
@@ -567,6 +637,14 @@ def clone(
             "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
         )
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        return hash((self.__class__, self.original_bytes))
+
     @property
     def original_bytes(self) -> bytes:
         """
@@ -663,6 +741,14 @@ def clone(
             self._reference_clone(NameObject(self), pdf_dest, force_duplicate),
         )
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        return hash((self.__class__, self))
+
     def write_to_stream(
         self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
     ) -> None:
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index 399836be5..e53129a48 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -131,6 +131,14 @@ def clone(
                 arr.append(data)
         return arr
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        return hash((self.__class__, tuple(x.hash_bin() for x in self)))
+
     def items(self) -> Iterable[Any]:
         """Emulate DictionaryObject.items for a list (index, object)."""
         return enumerate(self)
@@ -371,6 +379,16 @@ def _clone(
                         else v
                     )
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        return hash(
+            (self.__class__, tuple(((k, v.hash_bin()) for k, v in self.items())))
+        )
+
     def raw_get(self, key: Any) -> Any:
         return dict.__getitem__(self, key)
 
@@ -876,6 +894,14 @@ def _clone(
             pass
         super()._clone(src, pdf_dest, force_duplicate, ignore_fields, visited)
 
+    def hash_bin(self) -> int:
+        """
+        Returns:
+            hash considering type and value
+        used to detect modified object
+        """
+        return hash((super().hash_bin(), self.get_data()))
+
     def get_data(self) -> bytes:
         return self._data
 
diff --git a/tests/test_reader.py b/tests/test_reader.py
index 0413a9135..c1bdff944 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -212,7 +212,7 @@ def test_get_outline(src, outline_elements):
         pytest.param(
             "imagemagick-ASCII85Decode.pdf",
             ["Im0.png"],
-            marks=pytest.mark.xfail(reason="broken image extraction"),
+            # marks=pytest.mark.xfail(reason="broken image extraction"),
         ),
         ("imagemagick-CCITTFaxDecode.pdf", ["Im0.tiff"]),
         (SAMPLE_ROOT / "019-grayscale-image/grayscale-image.pdf", ["X0.png"]),

From 0543709a702921f767ec04aaa9ea40db1b7272bc Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sat, 24 Aug 2024 11:21:22 +0200
Subject: [PATCH 02/40] fix test

---
 pypdf/_writer.py                              |  52 ++++++++++--------
 ..._Vicksburg_Sample_OCR-crazyones-merged.pdf | Bin 217093 -> 217093 bytes
 2 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index e47679d45..dd96251de 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -160,21 +160,6 @@ def __init__(
         incremental: bool = False,
     ) -> None:
         self.incremental = incremental
-        if self.incremental:
-            if isinstance(fileobj, (str, Path)):
-                with open(fileobj, "rb") as f:
-                    fileobj = BytesIO(f.read(-1))
-            if isinstance(fileobj, IO):
-                fileobj = BytesIO(fileobj.read(-1))
-            if isinstance(fileobj, BytesIO):
-                fileobj = PdfReader(fileobj)
-            else:
-                raise PyPdfError("Invalid type for incremental mode")
-            self._reader = fileobj  # prev content is in _reader.stream
-            self._header = fileobj.pdf_header.encode()
-            self._readonly = True  # !!!TODO: to be analysed
-        else:
-            self._header = b"%PDF-1.3"
         """
         The indirect objects in the PDF.
         for the incremental it will be filled with None
@@ -197,6 +182,28 @@ def __init__(
         """
         self._id_translated: Dict[int, Dict[int, int]] = {}
         self._ID: Union[ArrayObject, None] = None
+        self._info_obj: PdfObject
+
+        if self.incremental:
+            if isinstance(fileobj, (str, Path)):
+                with open(fileobj, "rb") as f:
+                    fileobj = BytesIO(f.read(-1))
+            if isinstance(fileobj, IO):
+                fileobj = BytesIO(fileobj.read(-1))
+            if isinstance(fileobj, BytesIO):
+                fileobj = PdfReader(fileobj)
+            else:
+                raise PyPdfError("Invalid type for incremental mode")
+            self._reader = fileobj  # prev content is in _reader.stream
+            self._header = fileobj.pdf_header.encode()
+            self._readonly = True  # !!!TODO: to be analysed
+        else:
+            self._header = b"%PDF-1.3"
+            self._info_obj = self._add_object(
+                DictionaryObject(
+                    {NameObject("/Producer"): create_string_object("pypdf")}
+                )
+            )
 
         def _get_clone_from(
             fileobj: Union[None, PdfReader, str, Path, IO[Any], BytesIO],
@@ -241,7 +248,6 @@ def _get_clone_from(
         self.flattened_pages = []
         self._encryption: Optional[Encryption] = None
         self._encrypt_entry: Optional[DictionaryObject] = None
-        self._info_obj: PdfObject
 
         if clone_from is not None:
             if not isinstance(clone_from, PdfReader):
@@ -258,10 +264,6 @@ def _get_clone_from(
                 }
             )
             self._add_object(self._root_object)
-            # info object
-            info = DictionaryObject()
-            info.update({NameObject("/Producer"): create_string_object("pypdf")})
-            self._info_obj = self._add_object(info)
         if isinstance(self._ID, list):
             if isinstance(self._ID[0], TextStringObject):
                 self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
@@ -1192,8 +1194,8 @@ def clone_document_from_reader(
         """
         self.clone_reader_document_root(reader)
         if TK.INFO in reader.trailer:
+            inf = reader._info
             if self.incremental:
-                inf = reader._info
                 if inf is not None:
                     self._info_obj = cast(
                         IndirectObject, inf.clone(self).indirect_reference
@@ -1201,11 +1203,13 @@ def clone_document_from_reader(
                 self._original_hash[
                     cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1
                 ] = self._info_obj.hash_bin()
-            else:
-                self._info = reader._info  # actually copy fields
-
+            elif inf is not None:
+                self._info_obj = self._add_object(
+                    DictionaryObject(cast(DictionaryObject, inf.get_object()))
+                )
         else:
             self._info_obj = self._add_object(DictionaryObject())
+
         try:
             self._ID = cast(ArrayObject, reader._ID).clone(self)
         except AttributeError:
diff --git a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf
index a53f28f0be432c38a1fff33672a2170eeb5f553f..8a04001ddae371fa756d1dc2f607fd42965f0f8f 100644
GIT binary patch
delta 94
zcmZo&z}vcjcY^f925Cm4i2^ATXDb5fy%!jbHuo{k?gCO;J%WZn%4YBN)r>~lMKT!=
s&IQuf*qG*o1L@*8CW#mz-Ls2HXaci=smb<*Ud%46jE2*n`7_G{06VfE8vp<R

delta 94
zcmZo&z}vcjcY^dpr$|Pli2`a9XDc!qPTYHe(QtDg<LoXVrPU*72&8QGUSG{<xLqWZ
t@!(t_eT|K2PB@S*j$@LD0n$CYn1m)U8ycEzU+Bf`!pdkg{h2?rJOC!NAJqT=


From 29030d4ca121dc3d216bc21815b4f5a209342b4e Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 25 Aug 2024 17:10:22 +0200
Subject: [PATCH 03/40] fixes + first test

---
 pypdf/_page.py                    |  2 ++
 pypdf/_writer.py                  | 20 ++++++++---------
 pypdf/generic/_data_structures.py |  3 ++-
 tests/test_writer.py              | 36 +++++++++++++++++++++++++++++++
 4 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 8a8c47eec..79cdb7adf 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -492,6 +492,8 @@ def __init__(
         self.inline_images: Optional[Dict[str, ImageFile]] = None
         # below Union for mypy but actually Optional[List[str]]
         self.indirect_reference = indirect_reference
+        if indirect_reference is not None:
+            self.update(cast(DictionaryObject, indirect_reference.get_object()))
 
     def hash_bin(self) -> int:
         """
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index dd96251de..24da87337 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1202,7 +1202,7 @@ def clone_document_from_reader(
                     )
                 self._original_hash[
                     cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1
-                ] = self._info_obj.hash_bin()
+                ] = cast(DictionaryObject, self._info_obj.get_object()).hash_bin()
             elif inf is not None:
                 self._info_obj = self._add_object(
                     DictionaryObject(cast(DictionaryObject, inf.get_object()))
@@ -1359,18 +1359,16 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
 
         return my_file, stream
 
-    def _list_objects_in_increment(self) -> List[IndirectObject]:
+    def list_objects_in_increment(self) -> List[IndirectObject]:
         """
         For debug / analysis
-        Provides the list of new/modified objects that are to be written
-        """
-        ##        lst = []
-        ##        for i in range(len(self._objects)):
-        ##            if (self._objects[i] is not None and
-        ##                (i >= len(self._original_hash)
-        ##                    or cast(PdfObject,self._objects[i]).hash_bin() != self._original_hash[i]
-        ##                )):
-        ##                    lst.append(self._objects[i].indirect_reference)
+        Provides the list of new/modified objects that will be written
+        in the increment
+        Deleted Objects will not be freed but will become orphans
+
+        Returns:
+            List of (new / modified) IndirectObjects
+        """
         return [
             cast(IndirectObject, self._objects[i]).indirect_reference
             for i in range(len(self._objects))
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index e53129a48..00f4ceab8 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -900,7 +900,8 @@ def hash_bin(self) -> int:
             hash considering type and value
         used to detect modified object
         """
-        return hash((super().hash_bin(), self.get_data()))
+        # use of _data to prevent errors on non decoded stream such as JBIG2
+        return hash((super().hash_bin(), self._data))
 
     def get_data(self) -> bytes:
         return self._data
diff --git a/tests/test_writer.py b/tests/test_writer.py
index b6a47a18c..3ac1f06da 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2354,3 +2354,39 @@ def test_utf16_metadata():
         b"/Subject (\\376\\377\\000I\\000n\\000v\\000o\\000i\\000c\\000e"
         b"\\000 \\041\\026\\000A\\000I\\000\\137\\0000\\0004\\0007)"
     )
+
+
+def test_list_objects_in_increment(caplog):
+    """Tests for #2811"""
+    writer = PdfWriter(
+        RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf",
+        incremental=True,
+    )
+    # Contains JBIG2 not decoded for the moment
+    assert writer.list_objects_in_increment() == []  # no flowdown of properties
+    # modify one object
+    writer.pages[0][NameObject("/MediaBox")] = ArrayObject(
+        [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)]
+    )
+    assert writer.list_objects_in_increment() == [IndirectObject(4, 0, writer)]
+    b = BytesIO()
+    writer.write(b)
+    assert b.getvalue().startswith(writer._reader.stream.getvalue())
+    b.seek(0)
+    reader = PdfReader(b)
+    assert reader.pages[0]["/MediaBox"] == ArrayObject(
+        [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)]
+    )
+    with pytest.raises(PyPdfError):
+        writer = PdfWriter(reader, incremental=True)
+    b.seek(0)
+    writer = PdfWriter(b, incremental=True)
+    assert writer.list_objects_in_increment() == []  # no flowdown of properties
+
+    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True)
+    # 1 object is modified: page 0  inherits MediaBox so is changed
+    assert len(writer.list_objects_in_increment()) == 1
+
+    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=False)
+    # 1 object is modified: page 0  inherits MediaBox so is changed
+    assert len(writer.list_objects_in_increment()) == len(writer._objects)

From 1067b744eeac6374344a8c63ddce742d87d49d91 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 25 Aug 2024 19:06:16 +0200
Subject: [PATCH 04/40] coverage

---
 pypdf/_page.py        | 2 +-
 pypdf/_writer.py      | 2 --
 tests/test_generic.py | 6 ++++++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 79cdb7adf..c81eeb8cd 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -501,7 +501,7 @@ def hash_bin(self) -> int:
             hash considering type and value
         used to detect modified object
         Note: this function is overloaded to return the same results
-            as a DictionaryObject
+        as a DictionaryObject
         """
         return hash(
             (DictionaryObject, tuple(((k, v.hash_bin()) for k, v in self.items())))
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 24da87337..e052b94ae 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -188,8 +188,6 @@ def __init__(
             if isinstance(fileobj, (str, Path)):
                 with open(fileobj, "rb") as f:
                     fileobj = BytesIO(f.read(-1))
-            if isinstance(fileobj, IO):
-                fileobj = BytesIO(fileobj.read(-1))
             if isinstance(fileobj, BytesIO):
                 fileobj = PdfReader(fileobj)
             else:
diff --git a/tests/test_generic.py b/tests/test_generic.py
index 6b8ae0151..bc83ea4fe 100644
--- a/tests/test_generic.py
+++ b/tests/test_generic.py
@@ -1472,3 +1472,9 @@ def test_unitary_extract_inline():
     ec.set_data(b)
     co = ContentStream(ec, None)
     assert co.operations[7][0]["data"] == b"abcdefghijklmnop"
+
+
+def test_missing_hashbin():
+    assert NullObject().hash_bin() == hash((NullObject,))
+    t = ByteStringObject(b"123")
+    assert t.hash_bin() == hash((ByteStringObject, b"123"))

From f1d3fbe6367e0fcc1e2efc79c1932643851dd455 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 25 Aug 2024 19:39:37 +0200
Subject: [PATCH 05/40] coverage

---
 pypdf/_page.py       | 1 +
 pypdf/_writer.py     | 2 ++
 tests/test_writer.py | 4 ++++
 3 files changed, 7 insertions(+)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index c81eeb8cd..aebe9ebbd 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -500,6 +500,7 @@ def hash_bin(self) -> int:
         Returns:
             hash considering type and value
         used to detect modified object
+
         Note: this function is overloaded to return the same results
         as a DictionaryObject
         """
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index e052b94ae..a0d55e3c5 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1396,8 +1396,10 @@ def _write_increment(self, stream: StreamType) -> int:
                 # first write new/modified object
                 object_positions[idnum] = stream.tell()
                 stream.write(f"{idnum} 0 obj\n".encode())
+                """ encryption is not operational
                 if self._encryption and obj != self._encrypt_entry:
                     obj = self._encryption.encrypt_object(obj, idnum, 0)
+                """
                 obj.write_to_stream(stream)
                 stream.write(b"\nendobj\n")
 
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 3ac1f06da..1a172e8c3 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2369,6 +2369,10 @@ def test_list_objects_in_increment(caplog):
         [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)]
     )
     assert writer.list_objects_in_increment() == [IndirectObject(4, 0, writer)]
+    writer.pages[5][NameObject("/MediaBox")] = ArrayObject(
+        [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)]
+    )
+    assert len(writer.list_objects_in_increment()) == 2
     b = BytesIO()
     writer.write(b)
     assert b.getvalue().startswith(writer._reader.stream.getvalue())

From ae97bc73b4f6b0b2653009b47b5b6ead47e13424 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 26 Aug 2024 13:06:03 +0200
Subject: [PATCH 06/40] cope with multiple level pages

---
 pypdf/_doc_common.py | 41 ++++++++++++++++++++++++++++++++++++++---
 pypdf/_page.py       | 24 +++++++++++++++---------
 pypdf/_writer.py     | 43 ++++++++++++++++++++++++++++++++-----------
 tests/test_page.py   |  4 +++-
 4 files changed, 88 insertions(+), 24 deletions(-)

diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
index 12848fb8e..ea3c93aab 100644
--- a/pypdf/_doc_common.py
+++ b/pypdf/_doc_common.py
@@ -65,9 +65,7 @@
 from .constants import FieldDictionaryAttributes as FA
 from .constants import PageAttributes as PG
 from .constants import PagesAttributes as PA
-from .errors import (
-    PdfReadError,
-)
+from .errors import PdfReadError, PyPdfError
 from .generic import (
     ArrayObject,
     BooleanObject,
@@ -372,6 +370,43 @@ def get_page(self, page_number: int) -> PageObject:
         assert self.flattened_pages is not None, "hint for mypy"
         return self.flattened_pages[page_number]
 
+    def _get_page_in_node(
+        self,
+        page_number: int,
+    ) -> Tuple[DictionaryObject, int]:
+        """
+        Retrieve the node and position within the /Kids containing the page.
+        If page_number is greater than the number of pages, it returns top node, -1.
+        """
+        top = cast(DictionaryObject, self.root_object["/Pages"])
+
+        def recurs(node: DictionaryObject, mi: int) -> Tuple[Optional[PdfObject], int]:
+            ma = cast(int, node.get("/Count", 1))  # default 1 for /Page types
+            if node["/Type"] == "/Page":
+                if page_number == mi:
+                    return node, -1
+                # else:
+                return None, mi + 1
+            if (page_number - mi) >= ma:  # not in nodes below
+                if node == top:
+                    return top, -1
+                # else
+                return None, mi + ma
+            for idx, kid in enumerate(cast(ArrayObject, node["/Kids"])):
+                kid = cast(DictionaryObject, kid.get_object())
+                n, i = recurs(kid, mi)
+                if n is not None:  # page has just been found ...
+                    if i < 0:  # ... just below!
+                        return node, idx
+                    # else:  # ... at lower levels
+                    return n, i
+                mi = i
+            raise PyPdfError("abnormal, can not find the node")
+
+        node, idx = recurs(top, 0)
+        assert isinstance(node, DictionaryObject)
+        return node, idx
+
     @property
     def named_destinations(self) -> Dict[str, Any]:
         """
diff --git a/pypdf/_page.py b/pypdf/_page.py
index aebe9ebbd..b9f6e012b 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -2414,27 +2414,33 @@ def __delitem__(self, index: Union[int, slice]) -> None:
             raise IndexError("index out of range")
         ind = self[index].indirect_reference
         assert ind is not None
-        parent = cast(DictionaryObject, ind.get_object()).get("/Parent", None)
+        parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
+            "/Parent", None
+        )
+        first = True
         while parent is not None:
             parent = cast(DictionaryObject, parent.get_object())
             try:
-                i = parent["/Kids"].index(ind)
-                del parent["/Kids"][i]
+                i = cast(ArrayObject, parent["/Kids"]).index(ind)
+                del cast(ArrayObject, parent["/Kids"])[i]
+                first = False
                 try:
                     assert ind is not None
                     del ind.pdf.flattened_pages[index]  # case of page in a Reader
                 except Exception:  # pragma: no cover
                     pass
                 if "/Count" in parent:
-                    parent[NameObject("/Count")] = NumberObject(parent["/Count"] - 1)
-                if len(parent["/Kids"]) == 0:
+                    parent[NameObject("/Count")] = NumberObject(
+                        cast(int, parent["/Count"]) - 1
+                    )
+                if len(cast(ArrayObject, parent["/Kids"])) == 0:
                     # No more objects in this part of this sub tree
                     ind = parent.indirect_reference
-                    parent = cast(DictionaryObject, parent.get("/Parent", None))
-                else:
-                    parent = None
+                parent = parent.get("/Parent", None)
             except ValueError:  # from index
-                raise PdfReadError(f"Page Not Found in Page Tree {ind}")
+                if first:
+                    raise PdfReadError(f"Page Not Found in Page Tree {ind}")
+                break
 
     def __iter__(self) -> Iterator[PageObject]:
         for i in range(len(self)):
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index a0d55e3c5..e2747c153 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -439,10 +439,12 @@ def _replace_object(
     def _add_page(
         self,
         page: PageObject,
-        action: Callable[[Any, Union[PageObject, IndirectObject]], None],
+        index: int,
         excluded_keys: Iterable[str] = (),
     ) -> PageObject:
-        assert cast(str, page[PA.TYPE]) == CO.PAGE
+        if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE:
+            raise ValueError("Invalid page Object")
+        assert self.flattened_pages is not None, "for mypy"
         page_org = page
         excluded_keys = list(excluded_keys)
         excluded_keys += [PA.PARENT, "/StructParents"]
@@ -460,13 +462,23 @@ def _add_page(
         if page_org.pdf is not None:
             other = page_org.pdf.pdf_header
             self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)
-        page[NameObject(PA.PARENT)] = self._pages
-        pages = cast(DictionaryObject, self.get_object(self._pages))
-        assert page.indirect_reference is not None
-        action(pages[PA.KIDS], page.indirect_reference)
-        action(self.flattened_pages, page)
-        page_count = cast(int, pages[PA.COUNT])
-        pages[NameObject(PA.COUNT)] = NumberObject(page_count + 1)
+        node, idx = self._get_page_in_node(index)
+        page[NameObject(PA.PARENT)] = node.indirect_reference
+        if idx >= 0:  # to be a
+            cast(ArrayObject, node[PA.KIDS]).insert(idx, page.indirect_reference)
+            if self.flattened_pages != node[PA.KIDS]:
+                self.flattened_pages.insert(index, page)
+        else:
+            cast(ArrayObject, node[PA.KIDS]).append(page.indirect_reference)
+            if self.flattened_pages != node[PA.KIDS]:
+                self.flattened_pages.append(page)
+        cpt = 1000
+        while node is not None:
+            node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1)
+            node = node.get(PA.PARENT, None)
+            cpt -= 1
+            if cpt < 0:
+                raise PyPdfError("Recursive Error detected")
         return page
 
     def set_need_appearances_writer(self, state: bool = True) -> None:
@@ -529,7 +541,8 @@ def add_page(
         Returns:
             The added PageObject.
         """
-        return self._add_page(page, list.append, excluded_keys)
+        assert self.flattened_pages is not None
+        return self._add_page(page, len(self.flattened_pages), excluded_keys)
 
     def insert_page(
         self,
@@ -549,7 +562,15 @@ def insert_page(
         Returns:
             The added PageObject.
         """
-        return self._add_page(page, lambda kids, p: kids.insert(index, p))
+        assert self.flattened_pages is not None
+        if index < 0:
+            index = len(self.flattened_pages) + index
+        if index < 0:
+            raise ValueError("invalid index value")
+        if index >= len(self.flattened_pages):
+            return self.add_page(page, excluded_keys)
+        else:
+            return self._add_page(page, index, excluded_keys)
 
     def _get_page_number_by_indirect(
         self, indirect_reference: Union[None, int, NullObject, IndirectObject]
diff --git a/tests/test_page.py b/tests/test_page.py
index 72df648e4..8bde3e82e 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -1251,7 +1251,9 @@ def test_del_pages():
     del pp["/Parent"].get_object()["/Kids"][i]
     with pytest.raises(PdfReadError):
         del reader.pages[2]
-    # reader is corrupted we have to reload it
+
+    url = "https://github.com/py-pdf/pypdf/files/13679585/test2_P038-038.pdf"
+    name = "iss2343.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     del reader.pages[:]
     assert len(reader.pages) == 0

From d9a99d9e4415a188b45dbf37e79925e9cac9193a Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 26 Aug 2024 13:24:52 +0200
Subject: [PATCH 07/40] test + doc

---
 pypdf/_writer.py   | 27 +++++++++++++++++++++------
 tests/test_page.py |  5 +++--
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index e2747c153..59d6b3822 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -151,6 +151,15 @@ class PdfWriter(PdfDocCommon):
     cloning a PDF file during initialization.
 
     Typically data is added from a :class:`PdfReader<pypdf.PdfReader>`.
+
+    clone_from: identical to fileobj (for compatibility)
+
+    incremental: `bool`
+        If true, loads the document and set the PdfWriter in incremental mode
+
+        When writing the original document is written first and new/modified
+        are appened. to be used for signed document/forms to keep signature
+        valid.
     """
 
     def __init__(
@@ -161,26 +170,32 @@ def __init__(
     ) -> None:
         self.incremental = incremental
         """
+        Returns if the PdfWriter object has been started in incremental mode
+        """
+
+        self._objects: List[Optional[PdfObject]] = []
+        """
         The indirect objects in the PDF.
         for the incremental it will be filled with None
         in clone_reader_document_root
         """
-        self._objects: List[Optional[PdfObject]] = []
 
+        self._original_hash: List[int] = []
         """
         list of hashes after import; used to identify changes
         """
-        self._original_hash: List[int] = []
 
-        """Maps hash values of indirect objects to the list of IndirectObjects.
-           This is used for compression.
-        """
         self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {}
+        """
+        Maps hash values of indirect objects to the list of IndirectObjects.
+        This is used for compression.
+        """
 
+        self._id_translated: Dict[int, Dict[int, int]] = {}
         """List of already translated IDs.
            dict[id(pdf)][(idnum, generation)]
         """
-        self._id_translated: Dict[int, Dict[int, int]] = {}
+
         self._ID: Union[ArrayObject, None] = None
         self._info_obj: PdfObject
 
diff --git a/tests/test_page.py b/tests/test_page.py
index 8bde3e82e..ac9d241a7 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -1252,9 +1252,10 @@ def test_del_pages():
     with pytest.raises(PdfReadError):
         del reader.pages[2]
 
-    url = "https://github.com/py-pdf/pypdf/files/13679585/test2_P038-038.pdf"
-    name = "iss2343.pdf"
+    url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf"
+    name = "iss2343b.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    del reader.pages[4]  # to propagate among /Pages
     del reader.pages[:]
     assert len(reader.pages) == 0
     assert len(reader.trailer["/Root"]["/Pages"]["/Kids"]) == 0

From 3c4cfdc2510587c8a75cbe6d6760362db44a2fa1 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 26 Aug 2024 13:30:20 +0200
Subject: [PATCH 08/40] coverage

---
 tests/test_page.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_page.py b/tests/test_page.py
index ac9d241a7..d9efd4992 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -1254,11 +1254,11 @@ def test_del_pages():
 
     url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf"
     name = "iss2343b.pdf"
-    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    reader = PdfWriter(BytesIO(get_data_from_url(url, name=name)), incremental=True)
     del reader.pages[4]  # to propagate among /Pages
     del reader.pages[:]
     assert len(reader.pages) == 0
-    assert len(reader.trailer["/Root"]["/Pages"]["/Kids"]) == 0
+    assert len(reader.root_object["/Pages"]["/Kids"]) == 0
     assert len(reader.flattened_pages) == 0
 
 
From 38d4b351d81719ed774476f0cf7ee7187ff55a9e Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 26 Aug 2024 13:57:20 +0200
Subject: [PATCH 09/40] coverage

---
 pypdf/_writer.py   | 11 +++--------
 tests/test_page.py | 23 ++++++++++++++++-------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 59d6b3822..b981cb0d5 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1,6 +1,3 @@
-# TODO : thing about pages to have a global soluce without rework;
-# consider question about heritage of properties
-
 # Copyright (c) 2006, Mathieu Fenniak
 # Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
 #
@@ -154,12 +151,10 @@ class PdfWriter(PdfDocCommon):
 
     clone_from: identical to fileobj (for compatibility)
 
-    incremental: `bool`
-        If true, loads the document and set the PdfWriter in incremental mode
+    incremental: If true, loads the document and set the PdfWriter in incremental mode
 
-        When writing the original document is written first and new/modified
-        are appened. to be used for signed document/forms to keep signature
-        valid.
+    When writing in incremental the original document is written first and new/modified
+    are appended. To be used for signed document/forms to keep signature valid.
     """
 
     def __init__(
diff --git a/tests/test_page.py b/tests/test_page.py
index d9efd4992..dc3ec9c55 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -12,7 +12,7 @@
 from pypdf import PdfReader, PdfWriter, Transformation
 from pypdf._page import PageObject
 from pypdf.constants import PageAttributes as PG
-from pypdf.errors import PdfReadError, PdfReadWarning
+from pypdf.errors import PdfReadError, PdfReadWarning, PyPdfError
 from pypdf.generic import (
     ArrayObject,
     ContentStream,
@@ -887,6 +887,8 @@ def test_annotation_setter(pdf_file_path):
     page = reader.pages[0]
     writer = PdfWriter()
     writer.add_page(page)
+    with pytest.raises(ValueError):
+        writer.add_page(DictionaryObject())
 
     # Act
     page_number = 0
@@ -1254,12 +1256,19 @@ def test_del_pages():
 
     url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf"
     name = "iss2343b.pdf"
-    reader = PdfWriter(BytesIO(get_data_from_url(url, name=name)), incremental=True)
-    del reader.pages[4]  # to propagate among /Pages
-    del reader.pages[:]
-    assert len(reader.pages) == 0
-    assert len(reader.root_object["/Pages"]["/Kids"]) == 0
-    assert len(reader.flattened_pages) == 0
+    writer = PdfWriter(BytesIO(get_data_from_url(url, name=name)), incremental=True)
+    node, idx = writer._get_page_in_node(53)
+    assert (node.indirect_reference.idnum, idx) == (11776, 1)
+    node, idx = writer._get_page_in_node(10000)
+    assert (node.indirect_reference.idnum, idx) == (11769, -1)
+    with pytest.raises(PyPdfError):
+        writer._get_page_in_node(-1)
+
+    del writer.pages[4]  # to propagate among /Pages
+    del writer.pages[:]
+    assert len(writer.pages) == 0
+    assert len(writer.root_object["/Pages"]["/Kids"]) == 0
+    assert len(writer.flattened_pages) == 0
 
 
 def test_pdf_pages_missing_type():

From 79eca73b7774dadedac01c188681b4559e6cfcaf Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 26 Aug 2024 15:38:02 +0200
Subject: [PATCH 10/40] coverage

---
 pypdf/_writer.py     |  5 ++---
 tests/test_page.py   | 13 +++++++++++++
 tests/test_writer.py |  4 +++-
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index b981cb0d5..b532b6446 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -484,6 +484,7 @@ def _add_page(
                 self.flattened_pages.append(page)
         cpt = 1000
         while node is not None:
+            node = cast(DictionaryObject, node.get_object())
             node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1)
             node = node.get(PA.PARENT, None)
             cpt -= 1
@@ -1441,9 +1442,7 @@ def _write_increment(self, stream: StreamType) -> int:
                             [current_start, current_stop - current_start]
                         )
                     current_start = idnum
-                    current_stop = idnum + 1
-                else:
-                    current_stop = idnum + 1
+                current_stop = idnum + 1
         if current_start > 0:
             object_blocks.append([current_start, current_stop - current_start])
         # write incremented xref
diff --git a/tests/test_page.py b/tests/test_page.py
index dc3ec9c55..39b1f4ec5 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -1459,3 +1459,16 @@ def test_get_contents_as_bytes():
     assert writer.pages[0]._get_contents_as_bytes() == expected
     writer.pages[0][NameObject("/Contents")] = writer.pages[0]["/Contents"][0]
     assert writer.pages[0]._get_contents_as_bytes() == expected
+
+
+def test_recursive_get_page_from_node():
+    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True)
+    writer.root_object["/Pages"].get_object()[
+        NameObject("/Parent")
+    ] = writer.root_object["/Pages"].indirect_reference
+    with pytest.raises(PyPdfError):
+        writer.add_page(writer.pages[0])
+    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True)
+    writer.insert_page(writer.pages[0], -1)
+    with pytest.raises(ValueError):
+        writer.insert_page(writer.pages[0], -10)
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 1a172e8c3..160ef4023 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2356,7 +2356,7 @@ def test_utf16_metadata():
     )
 
 
-def test_list_objects_in_increment(caplog):
+def test_increment_writer(caplog):
     """Tests for #2811"""
     writer = PdfWriter(
         RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf",
@@ -2369,6 +2369,8 @@ def test_list_objects_in_increment(caplog):
         [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)]
     )
     assert writer.list_objects_in_increment() == [IndirectObject(4, 0, writer)]
+    b = BytesIO()
+    writer.write(b)
     writer.pages[5][NameObject("/MediaBox")] = ArrayObject(
         [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)]
     )

From 290c5a6f423ab1af59431bdc76243c0b3a4a63c1 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 26 Aug 2024 15:52:07 +0200
Subject: [PATCH 11/40] coverage

---
 tests/test_writer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_writer.py b/tests/test_writer.py
index 160ef4023..64d06d9b6 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -1795,6 +1795,9 @@ def test_missing_info():
 
     writer = PdfWriter(clone_from=reader)
     assert len(writer.pages) == len(reader.pages)
+    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
+    writer._info = reader._info
+    assert dict(writer._info) == dict(reader._info)
 
 
 @pytest.mark.enable_socket()

From 173578d43011132197a44d5e16d225b5e7a9a3df Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 26 Aug 2024 16:24:39 +0200
Subject: [PATCH 12/40] coverage

---
 pypdf/_writer.py     |  4 +++-
 tests/test_writer.py | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index b532b6446..4850f251b 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -468,7 +468,9 @@ def _add_page(
             ]
         except Exception:
             pass
-        page = cast("PageObject", page_org.clone(self, False, excluded_keys))
+        page = cast(
+            "PageObject", page_org.clone(self, False, excluded_keys).get_object()
+        )
         if page_org.pdf is not None:
             other = page_org.pdf.pdf_header
             self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 64d06d9b6..25fb30623 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2399,3 +2399,22 @@ def test_increment_writer(caplog):
     writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=False)
     # 1 object is modified: page 0  inherits MediaBox so is changed
     assert len(writer.list_objects_in_increment()) == len(writer._objects)
+
+    # insert pages in a tree
+    url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf"
+    name = "iss2343b.pdf"
+    writer = PdfWriter(BytesIO(get_data_from_url(url, name=name)), incremental=True)
+    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
+    pg = writer.insert_page(reader.pages[0], 4)
+    assert (
+        pg.raw_get("/Parent")
+        == writer.root_object["/Pages"]["/Kids"][0].get_object()["/Kids"][0]
+    )
+    assert pg["/Parent"]["/Count"] == 8
+    assert writer.root_object["/Pages"]["/Count"] == 285
+    assert len(writer.flattened_pages) == 285
+
+    # clone without info
+    writer = PdfWriter(RESOURCE_ROOT / "missing_info.pdf", incremental=True)
+    assert len(writer.list_objects_in_increment()) == 1
+    assert writer._info == {}

From 1a6eda51cb215eefd18619d988facf8a84c5f2ae Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 26 Aug 2024 21:16:49 +0200
Subject: [PATCH 13/40] simplification

---
 pypdf/_writer.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 4850f251b..d400cf5f0 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -476,14 +476,13 @@ def _add_page(
             self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)
         node, idx = self._get_page_in_node(index)
         page[NameObject(PA.PARENT)] = node.indirect_reference
+
         if idx >= 0:  # to be a
             cast(ArrayObject, node[PA.KIDS]).insert(idx, page.indirect_reference)
-            if self.flattened_pages != node[PA.KIDS]:
-                self.flattened_pages.insert(index, page)
+            self.flattened_pages.insert(index, page)
         else:
             cast(ArrayObject, node[PA.KIDS]).append(page.indirect_reference)
-            if self.flattened_pages != node[PA.KIDS]:
-                self.flattened_pages.append(page)
+            self.flattened_pages.append(page)
         cpt = 1000
         while node is not None:
             node = cast(DictionaryObject, node.get_object())

From d43d25b6f6c4fdd09424ccb369e14177175921c8 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Tue, 27 Aug 2024 09:39:21 +0200
Subject: [PATCH 14/40] coverage

---
 tests/test_writer.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/test_writer.py b/tests/test_writer.py
index 25fb30623..794dd0469 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2367,6 +2367,16 @@ def test_increment_writer(caplog):
     )
     # Contains JBIG2 not decoded for the moment
     assert writer.list_objects_in_increment() == []  # no flowdown of properties
+
+    # test writing with empty increment
+    b = BytesIO()
+    writer.write(b)
+    b.seek(0)
+    writer2 = PdfWriter(b, incremental=True)
+    assert len([x for x in writer2._objects if x is not None]) == len(
+        [x for x in writer._objects if x is not None]
+    )
+
     # modify one object
     writer.pages[0][NameObject("/MediaBox")] = ArrayObject(
         [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)]
@@ -2378,6 +2388,9 @@ def test_increment_writer(caplog):
         [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)]
     )
     assert len(writer.list_objects_in_increment()) == 2
+    # modify object IndirectObject(5,0) : for coverage
+    writer.get_object(5)[NameObject("/ForTestOnly")] = NameObject("/ForTestOnly")
+
     b = BytesIO()
     writer.write(b)
     assert b.getvalue().startswith(writer._reader.stream.getvalue())
@@ -2386,6 +2399,7 @@ def test_increment_writer(caplog):
     assert reader.pages[0]["/MediaBox"] == ArrayObject(
         [NumberObject(0), NumberObject(0), NumberObject(864), NumberObject(648)]
     )
+    assert "/ForTestOnly" in reader.get_object(5)
     with pytest.raises(PyPdfError):
         writer = PdfWriter(reader, incremental=True)
     b.seek(0)

From c9a6c95e06fcd090155f98cba4f90164fb30da9c Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Wed, 28 Aug 2024 11:59:32 +0200
Subject: [PATCH 15/40] ENH: add capability to remove /Info from pypdf

to be merged after #2811
---
 docs/user/metadata.md | 24 ++++++++++++
 pypdf/_writer.py      | 88 ++++++++++++++++++++++++++++++++-----------
 tests/test_writer.py  | 33 +++++++++++++++-
 3 files changed, 120 insertions(+), 25 deletions(-)

diff --git a/docs/user/metadata.md b/docs/user/metadata.md
index 7f0a57694..a2bbdf9f0 100644
--- a/docs/user/metadata.md
+++ b/docs/user/metadata.md
@@ -76,6 +76,30 @@ writer.add_metadata(
     }
 )
 
+# Clear all data but keep the entry in PDF
+writer.metadata = {}
+
+# Replace all entries with new set of entries
+writer.metadata = {
+    "/Author": "Martin",
+    "/Producer": "Libre Writer",
+}
+
+# Save the new PDF to a file
+with open("meta-pdf.pdf", "wb") as f:
+    writer.write(f)
+```
+
+## Removing metadata entry
+
+```python
+from pypdf import PdfWriter
+
+writer = PdfWriter("example.pdf")
+
+# Remove Metadata (/Info entry)
+writer.metadata = None
+
 # Save the new PDF to a file
 with open("meta-pdf.pdf", "wb") as f:
     writer.write(f)
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index d400cf5f0..ca3ab9030 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -52,7 +52,7 @@
 )
 
 from ._cmap import _default_fonts_space_width, build_char_map_from_dict
-from ._doc_common import PdfDocCommon
+from ._doc_common import DocumentInformation, PdfDocCommon
 from ._encryption import EncryptAlgorithm, Encryption
 from ._page import PageObject
 from ._page_labels import nums_clear_range, nums_insert, nums_next
@@ -192,7 +192,7 @@ def __init__(
         """
 
         self._ID: Union[ArrayObject, None] = None
-        self._info_obj: PdfObject
+        self._info_obj: Optional[PdfObject]
 
         if self.incremental:
             if isinstance(fileobj, (str, Path)):
@@ -307,13 +307,26 @@ def _info(self) -> Optional[DictionaryObject]:
         Returns:
             /Info Dictionary; None if the entry does not exist
         """
-        return cast(DictionaryObject, self._info_obj.get_object())
+        return (
+            None
+            if self._info_obj is None
+            else cast(DictionaryObject, self._info_obj.get_object())
+        )
 
     @_info.setter
-    def _info(self, value: Union[IndirectObject, DictionaryObject]) -> None:
-        obj = cast(DictionaryObject, self._info_obj.get_object())
-        obj.clear()
-        obj.update(cast(DictionaryObject, value.get_object()))
+    def _info(self, value: Optional[Union[IndirectObject, DictionaryObject]]) -> None:
+        if value is None:
+            try:
+                self._objects[self._info_obj.indirect_reference.idnum - 1] = None  # type: ignore
+            except (KeyError, AttributeError):
+                pass
+            self._info_obj = None
+        else:
+            if self._info_obj is None:
+                self._info_obj = self._add_object(DictionaryObject())
+            obj = cast(DictionaryObject, self._info_obj.get_object())
+            obj.clear()
+            obj.update(cast(DictionaryObject, value.get_object()))
 
     @property
     def xmp_metadata(self) -> Optional[XmpInformation]:
@@ -1184,6 +1197,7 @@ def clone_reader_document_root(self, reader: PdfReader) -> None:
             self._objects = [None] * cast(int, reader.trailer["/Size"])
         else:
             self._objects.clear()
+        self._info_obj = None
         self._root_object = reader.root_object.clone(self)
         self._pages = self._root_object.raw_get("/Pages")
 
@@ -1224,22 +1238,21 @@ def clone_document_from_reader(
                 document.
         """
         self.clone_reader_document_root(reader)
-        if TK.INFO in reader.trailer:
-            inf = reader._info
-            if self.incremental:
-                if inf is not None:
-                    self._info_obj = cast(
-                        IndirectObject, inf.clone(self).indirect_reference
-                    )
-                self._original_hash[
-                    cast(IndirectObject, self._info_obj.indirect_reference).idnum - 1
-                ] = cast(DictionaryObject, self._info_obj.get_object()).hash_bin()
-            elif inf is not None:
-                self._info_obj = self._add_object(
-                    DictionaryObject(cast(DictionaryObject, inf.get_object()))
+        inf = reader._info
+        if self.incremental:
+            if inf is not None:
+                self._info_obj = cast(
+                    IndirectObject, inf.clone(self).indirect_reference
                 )
-        else:
-            self._info_obj = self._add_object(DictionaryObject())
+                assert isinstance(self._info, DictionaryObject), "for mypy"
+                self._original_hash[
+                    self._info_obj.indirect_reference.idnum - 1
+                ] = self._info.hash_bin()
+        elif inf is not None:
+            self._info_obj = self._add_object(
+                DictionaryObject(cast(DictionaryObject, inf.get_object()))
+            )
+        # else: _info_obj = None done in clone_reader_document_root()
 
         try:
             self._ID = cast(ArrayObject, reader._ID).clone(self)
@@ -1507,9 +1520,10 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
             {
                 NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
                 NameObject(TK.ROOT): self.root_object.indirect_reference,
-                NameObject(TK.INFO): self._info_obj,
             }
         )
+        if self._info is not None:
+            trailer[NameObject(TK.INFO)] = self._info.indirect_reference
         if self.incremental:
             trailer[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
         if self._ID:
@@ -1519,6 +1533,34 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
         trailer.write_to_stream(stream)
         stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
 
+    @property
+    def metadata(self) -> Optional[DocumentInformation]:
+        """
+        Retrieve/set the PDF file's document information dictionary, if it exists.
+
+        Args:
+            value: dict with the entries to be set. if None : remove the /Info entry from the pdf.
+
+        Note that some PDF files use (xmp)metadata streams instead of document
+        information dictionaries, and these metadata streams will not be
+        accessed by this function.
+        """
+        return super().metadata
+
+    @metadata.setter
+    def metadata(
+        self,
+        value: Optional[Union[DocumentInformation, DictionaryObject, Dict[Any, Any]]],
+    ) -> None:
+        if value is None:
+            self._info = None
+        else:
+            if self._info is not None:
+                self._info.clear()
+            else:
+                self._info = DictionaryObject()
+            self.add_metadata(value)
+
     def add_metadata(self, infos: Dict[str, Any]) -> None:
         """
         Add custom metadata to the output.
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 794dd0469..21e1e5538 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -1795,9 +1795,32 @@ def test_missing_info():
 
     writer = PdfWriter(clone_from=reader)
     assert len(writer.pages) == len(reader.pages)
+    assert writer.metadata is None
+    b = BytesIO()
+    writer.write(b)
+    assert b"/Info" not in b.getvalue()
+
     reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
-    writer._info = reader._info
+    writer.metadata = reader.metadata
     assert dict(writer._info) == dict(reader._info)
+    assert writer.metadata == reader.metadata
+    b = BytesIO()
+    writer.write(b)
+    assert b"/Info" in b.getvalue()
+
+    writer.metadata = {}
+    b = BytesIO()
+    writer.write(b)
+    assert b"/Info" in b.getvalue()
+    assert writer.metadata == {}
+
+    writer.metadata = None
+    writer.metadata = None  # for code checking
+    assert writer.metadata is None
+    assert PdfWriter().metadata == {"/Producer": "pypdf"}
+    b = BytesIO()
+    writer.write(b)
+    assert b"/Info" not in b.getvalue()
 
 
 @pytest.mark.enable_socket()
@@ -2430,5 +2453,11 @@ def test_increment_writer(caplog):
 
     # clone without info
     writer = PdfWriter(RESOURCE_ROOT / "missing_info.pdf", incremental=True)
+    assert len(writer.list_objects_in_increment()) == 0
+    assert writer.metadata is None
+    writer.metadata = {}
+    assert writer.metadata == {}
     assert len(writer.list_objects_in_increment()) == 1
-    assert writer._info == {}
+    writer.metadata = None
+    assert len(writer.list_objects_in_increment()) == 0
+    assert writer.metadata is None

From 5147266a2948b9b91decee9f4924be48bd102d32 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Wed, 28 Aug 2024 12:19:16 +0200
Subject: [PATCH 16/40] coverage

---
 tests/test_writer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_writer.py b/tests/test_writer.py
index 21e1e5538..28782fbcd 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -1809,13 +1809,14 @@ def test_missing_info():
     assert b"/Info" in b.getvalue()
 
     writer.metadata = {}
+    writer._info = {}  # for code coverage
     b = BytesIO()
     writer.write(b)
     assert b"/Info" in b.getvalue()
     assert writer.metadata == {}
 
     writer.metadata = None
-    writer.metadata = None  # for code checking
+    writer.metadata = None  # for code coverage
     assert writer.metadata is None
     assert PdfWriter().metadata == {"/Producer": "pypdf"}
     b = BytesIO()

From ec9aafe247b2d383d978882716f79fbf868883c3 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Wed, 28 Aug 2024 12:43:41 +0200
Subject: [PATCH 17/40] oups

---
 tests/test_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_writer.py b/tests/test_writer.py
index 28782fbcd..b5dcd3357 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -1809,7 +1809,7 @@ def test_missing_info():
     assert b"/Info" in b.getvalue()
 
     writer.metadata = {}
-    writer._info = {}  # for code coverage
+    writer._info = DictionaryObject()  # for code coverage
     b = BytesIO()
     writer.write(b)
     assert b"/Info" in b.getvalue()

From 14a93f1718b40beafea976e77ca9f2e71f2a1c4b Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 1 Sep 2024 15:28:41 +0200
Subject: [PATCH 18/40] move to cross-reference stream for increment

this prevents Acrobat from "repairing" the file
---
 pypdf/_writer.py                  | 57 +++++++++++++++++++++++--------
 pypdf/generic/_data_structures.py |  3 +-
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index d400cf5f0..86aa120c0 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -31,6 +31,7 @@
 import enum
 import hashlib
 import re
+import struct
 import uuid
 from io import BytesIO, FileIO, IOBase
 from itertools import compress
@@ -1351,8 +1352,8 @@ def write_stream(self, stream: StreamType) -> None:
         if self.incremental:
             self._reader.stream.seek(0)
             stream.write(self._reader.stream.read(-1))
-            xref_location = self._write_increment(stream)
-            self._write_trailer(stream, xref_location)
+            if len(self.list_objects_in_increment()) > 0:
+                self._write_increment(stream)  # writes objs, Xref stream and startx
         else:
             object_positions, free_objects = self._write_pdf_structure(stream)
             xref_location = self._write_xref_table(
@@ -1413,7 +1414,7 @@ def list_objects_in_increment(self) -> List[IndirectObject]:
             )
         ]
 
-    def _write_increment(self, stream: StreamType) -> int:
+    def _write_increment(self, stream: StreamType) -> None:
         object_positions = {}
         object_blocks = []
         current_start = -1
@@ -1448,14 +1449,41 @@ def _write_increment(self, stream: StreamType) -> int:
             object_blocks.append([current_start, current_stop - current_start])
         # write incremented xref
         xref_location = stream.tell()
-        stream.write(b"xref\n")
-        stream.write(b"0 1\n")
-        stream.write(b"0000000000 65535 f \n")
-        for block in object_blocks:
-            stream.write(f"{block[0]} {block[1]}\n".encode())
-            for i in range(block[0], block[0] + block[1]):
-                stream.write(f"{object_positions[i]:0>10} {0:0>5} n \n".encode())
-        return xref_location
+        xr_id = len(self._objects) + 1
+        stream.write(f"{xr_id} 0 obj".encode())
+        init_data = {
+            NameObject("/Type"): NameObject("/XRef"),
+            NameObject("/Size"): NumberObject(xr_id + 1),
+            NameObject("/Root"): self.root_object.indirect_reference,
+            NameObject("/Filter"): NameObject("/FlateDecode"),
+            NameObject("/Index"): ArrayObject(
+                [NumberObject(_it) for _su in object_blocks for _it in _su]
+            ),
+            NameObject("/W"): ArrayObject(
+                [NumberObject(1), NumberObject(4), NumberObject(1)]
+            ),
+            "__streamdata__": b"",
+        }
+        if self._info is not None and (
+            not self.incremental
+            or self._info.hash_bin()  # kept for future
+            != self._original_hash[
+                cast(IndirectObject, self._info.indirect_reference).idnum - 1
+            ]
+        ):
+            init_data[NameObject(TK.INFO)] = self._info.indirect_reference
+        if self.incremental:  # kept for future
+            init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
+        elif self._ID:
+            init_data[NameObject(TK.ID)] = self._ID
+        xr = StreamObject.initialize_from_dictionary(init_data)
+        xr.set_data(
+            b"".join(
+                [struct.pack(b">BIB", 1, _pos, 0) for _pos in object_positions.values()]
+            )
+        )
+        xr.write_to_stream(stream)
+        stream.write(f"\nstartxref\n{xref_location}\n%%EOF\n".encode())  # eof
 
     def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
         object_positions = []
@@ -1507,12 +1535,11 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
             {
                 NameObject(TK.SIZE): NumberObject(len(self._objects) + 1),
                 NameObject(TK.ROOT): self.root_object.indirect_reference,
-                NameObject(TK.INFO): self._info_obj,
             }
         )
-        if self.incremental:
-            trailer[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
-        if self._ID:
+        if self._info is not None:
+            trailer[NameObject(TK.INFO)] = self._info.indirect_reference
+        if self._ID is not None:
             trailer[NameObject(TK.ID)] = self._ID
         if self._encrypt_entry:
             trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index d048da8cb..fc71bf5bf 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -948,7 +948,8 @@ def initialize_from_dictionary(
             retval = DecodedStreamObject()
         retval._data = data["__streamdata__"]
         del data["__streamdata__"]
-        del data[SA.LENGTH]
+        if SA.LENGTH in data:
+            del data[SA.LENGTH]
         retval.update(data)
         return retval
 

From 53e141fe12f05b633f0289bbb5d3ad35d51a3e13 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 1 Sep 2024 15:54:09 +0200
Subject: [PATCH 19/40] coverage

---
 pypdf/_writer.py     | 11 +++++------
 tests/test_writer.py |  8 ++++++++
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 86aa120c0..ad48882dc 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1464,17 +1464,16 @@ def _write_increment(self, stream: StreamType) -> None:
             ),
             "__streamdata__": b"",
         }
-        if self._info is not None and (
-            not self.incremental
-            or self._info.hash_bin()  # kept for future
+        if (
+            self._info is not None
+            and self._info.hash_bin()  # kept for future
             != self._original_hash[
                 cast(IndirectObject, self._info.indirect_reference).idnum - 1
             ]
         ):
             init_data[NameObject(TK.INFO)] = self._info.indirect_reference
-        if self.incremental:  # kept for future
-            init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
-        elif self._ID:
+        init_data[NameObject(TK.PREV)] = NumberObject(self._reader._startxref)
+        if self._ID:
             init_data[NameObject(TK.ID)] = self._ID
         xr = StreamObject.initialize_from_dictionary(init_data)
         xr.set_data(
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 794dd0469..6cedc9443 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2371,11 +2371,19 @@ def test_increment_writer(caplog):
     # test writing with empty increment
     b = BytesIO()
     writer.write(b)
+    with open(
+        RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf", "rb"
+    ) as f:
+        assert b.getvalue() == f.read(-1)
     b.seek(0)
     writer2 = PdfWriter(b, incremental=True)
     assert len([x for x in writer2._objects if x is not None]) == len(
         [x for x in writer._objects if x is not None]
     )
+    writer2.add_metadata({"/Author": "test"})
+    assert len(writer2.list_objects_in_increment()) == 1
+    b = BytesIO()
+    writer2.write(b)
 
     # modify one object
     writer.pages[0][NameObject("/MediaBox")] = ArrayObject(

From b4b7c1bf96cd468fdb7687f391c0238cfc38ad57 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 1 Sep 2024 16:09:25 +0200
Subject: [PATCH 20/40] coverage

---
 tests/test_writer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_writer.py b/tests/test_writer.py
index 6cedc9443..7b9cbf003 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2440,3 +2440,5 @@ def test_increment_writer(caplog):
     writer = PdfWriter(RESOURCE_ROOT / "missing_info.pdf", incremental=True)
     assert len(writer.list_objects_in_increment()) == 1
     assert writer._info == {}
+    b = BytesIO()
+    writer.write(b)

From 7bc3abddae4fa04f4e8d416bb4280c1d0444bc38 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 1 Sep 2024 16:12:13 +0200
Subject: [PATCH 21/40] coverage

---
 pypdf/_writer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index ad48882dc..74c066e50 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1445,8 +1445,8 @@ def _write_increment(self, stream: StreamType) -> None:
                         )
                     current_start = idnum
                 current_stop = idnum + 1
-        if current_start > 0:
-            object_blocks.append([current_start, current_stop - current_start])
+        assert current_start > 0, "for pytest only"
+        object_blocks.append([current_start, current_stop - current_start])
         # write incremented xref
         xref_location = stream.tell()
         xr_id = len(self._objects) + 1

From ffa2f0c5506a0aeae6139f606006df85cf05c421 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 1 Sep 2024 16:55:09 +0200
Subject: [PATCH 22/40] fix

---
 pypdf/_writer.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 74c066e50..409244727 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1464,11 +1464,16 @@ def _write_increment(self, stream: StreamType) -> None:
             ),
             "__streamdata__": b"",
         }
-        if (
-            self._info is not None
-            and self._info.hash_bin()  # kept for future
+        # below just to trick mypy for code simplification : will be reworked in next PR
+        assert isinstance(
+            cast(IndirectObject, self._info).indirect_reference, IndirectObject
+        ), "for mypy"
+        if self._info is not None and (
+            cast(IndirectObject, self._info).indirect_reference.idnum - 1
+            >= len(self._original_hash)
+            or cast(IndirectObject, self._info).hash_bin()  # kept for future
             != self._original_hash[
-                cast(IndirectObject, self._info.indirect_reference).idnum - 1
+                cast(IndirectObject, self._info).indirect_reference.idnum - 1
             ]
         ):
             init_data[NameObject(TK.INFO)] = self._info.indirect_reference

From b072952b9c101a3530d07b1d4c1c975f1153352f Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 1 Sep 2024 16:57:48 +0200
Subject: [PATCH 23/40] mypy

---
 pypdf/_writer.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 409244727..886fcbca7 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1464,16 +1464,12 @@ def _write_increment(self, stream: StreamType) -> None:
             ),
             "__streamdata__": b"",
         }
-        # below just to trick mypy for code simplification : will be reworked in next PR
-        assert isinstance(
-            cast(IndirectObject, self._info).indirect_reference, IndirectObject
-        ), "for mypy"
         if self._info is not None and (
-            cast(IndirectObject, self._info).indirect_reference.idnum - 1
+            self._info.indirect_reference.idnum - 1  # type: ignore
             >= len(self._original_hash)
             or cast(IndirectObject, self._info).hash_bin()  # kept for future
             != self._original_hash[
-                cast(IndirectObject, self._info).indirect_reference.idnum - 1
+                self._info.indirect_reference.idnum - 1  # type: ignore
             ]
         ):
             init_data[NameObject(TK.INFO)] = self._info.indirect_reference

From 454c4fe026d2f076adb985bb46307cd2661b082e Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 1 Sep 2024 17:30:44 +0200
Subject: [PATCH 24/40] coverage

---
 tests/test_writer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_writer.py b/tests/test_writer.py
index 0d8e61554..e06db389b 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -2441,6 +2441,8 @@ def test_increment_writer(caplog):
     writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=True)
     # 1 object is modified: page 0  inherits MediaBox so is changed
     assert len(writer.list_objects_in_increment()) == 1
+    b = BytesIO()
+    writer.write(b)
 
     writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf", incremental=False)
     # 1 object is modified: page 0  inherits MediaBox so is changed

From 494e00ae27f6b71ee503517b5bb48809866a57e2 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 09:18:08 +0200
Subject: [PATCH 25/40] Update pypdf/_doc_common.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_doc_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
index ea3c93aab..fcbc9904f 100644
--- a/pypdf/_doc_common.py
+++ b/pypdf/_doc_common.py
@@ -375,7 +375,7 @@ def _get_page_in_node(
         page_number: int,
     ) -> Tuple[DictionaryObject, int]:
         """
-        Retrieve the node and position within the /Kids containing the page
+        Retrieve the node and position within the /Kids containing the page.
         if page_number is greater than the number of page, it returns top node, -1
         """
         top = cast(DictionaryObject, self.root_object["/Pages"])

From eba1c9f4639b694e8e20d08d0478b53520e57d05 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 09:18:24 +0200
Subject: [PATCH 26/40] Update pypdf/_doc_common.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_doc_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
index fcbc9904f..aecef700f 100644
--- a/pypdf/_doc_common.py
+++ b/pypdf/_doc_common.py
@@ -385,7 +385,7 @@ def recurs(node: DictionaryObject, mi: int) -> Tuple[Optional[PdfObject], int]:
             if node["/Type"] == "/Page":
                 if page_number == mi:
                     return node, -1
-                # else:
+                # else
                 return None, mi + 1
             if (page_number - mi) >= ma:  # not in nodes below
                 if node == top:

From d68db51fe5342b7564e8df95a429c9c2927ea522 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 09:18:39 +0200
Subject: [PATCH 27/40] Update pypdf/_doc_common.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_doc_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
index aecef700f..edaae356c 100644
--- a/pypdf/_doc_common.py
+++ b/pypdf/_doc_common.py
@@ -376,7 +376,7 @@ def _get_page_in_node(
     ) -> Tuple[DictionaryObject, int]:
         """
         Retrieve the node and position within the /Kids containing the page.
-        if page_number is greater than the number of page, it returns top node, -1
+        If page_number is greater than the number of pages, it returns the top node, -1
         """
         top = cast(DictionaryObject, self.root_object["/Pages"])
 

From 8b3182dcb7e04fca88a244f6e021b39579cfa150 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 09:24:41 +0200
Subject: [PATCH 28/40] Update pypdf/_doc_common.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_doc_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
index edaae356c..2abcc52fe 100644
--- a/pypdf/_doc_common.py
+++ b/pypdf/_doc_common.py
@@ -401,7 +401,7 @@ def recurs(node: DictionaryObject, mi: int) -> Tuple[Optional[PdfObject], int]:
                     # else:  # ... at lower levels
                     return n, i
                 mi = i
-            raise PyPdfError("abnormal, can not find the node")
+            raise PyPdfError("Unexpectedly cannot find the node.")
 
         node, idx = recurs(top, 0)
         assert isinstance(node, DictionaryObject)

From fe6aac7201c2b08f2c9c6bdc413efd427cfab49c Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 09:24:57 +0200
Subject: [PATCH 29/40] Update pypdf/_writer.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 886fcbca7..f214bac75 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1393,7 +1393,7 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
 
     def list_objects_in_increment(self) -> List[IndirectObject]:
         """
-        For debug / analysis
+        For debugging/analysis.
         Provides the list of new/modified objects that will be written
         in the increment
         Deleted Objects will not be freeed but will become orphans

From 0be4bb4469db39f34d9ac0dd6a6eaa3ff9762338 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 09:25:09 +0200
Subject: [PATCH 30/40] Update pypdf/_writer.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index f214bac75..faa56aa69 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1395,7 +1395,7 @@ def list_objects_in_increment(self) -> List[IndirectObject]:
         """
         For debugging/analysis.
         Provides the list of new/modified objects that will be written
-        in the increment
+        in the increment.
         Deleted Objects will not be freeed but will become orphans
 
         Returns:

From 4c585c0244a8ae8afbdfed807cc4a8ca12e04e14 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 09:28:27 +0200
Subject: [PATCH 31/40] Update pypdf/_writer.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index faa56aa69..19e643503 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -579,7 +579,7 @@ def insert_page(
         if index < 0:
             index = len(self.flattened_pages) + index
         if index < 0:
-            raise ValueError("invalid index value")
+            raise ValueError("Invalid index value")
         if index >= len(self.flattened_pages):
             return self.add_page(page, excluded_keys)
         else:

From fbe54d0f0c4316911fbd60450c49ca405bb84243 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 09:37:06 +0200
Subject: [PATCH 32/40] Update pypdf/_writer.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 19e643503..e8af7fcc7 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -154,7 +154,7 @@ class PdfWriter(PdfDocCommon):
 
     incremental: If true, loads the document and set the PdfWriter in incremental mode
 
-    When writing in incremental the original document is written first and new/modified
+    When writing incrementally, the original document is written first and new/modified
     are appened. to be used for signed document/forms to keep signature valid.
     """
 

From e3c1e2c670b0d4f9e9ecead768164d69ae2b4630 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 09:37:31 +0200
Subject: [PATCH 33/40] Update pypdf/_writer.py

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
---
 pypdf/_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index e8af7fcc7..9d828baad 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -491,7 +491,7 @@ def _add_page(
             node = node.get(PA.PARENT, None)
             cpt -= 1
             if cpt < 0:
-                raise PyPdfError("Recursive Error detected")
+                raise PyPdfError("Too many recursive calls!")
         return page
 
     def set_need_appearances_writer(self, state: bool = True) -> None:

From 6e659431392c1ecf2d3b563f23f67294c0f46ada Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 09:43:19 +0200
Subject: [PATCH 34/40] clarify assert mypy

---
 pypdf/_writer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 9d828baad..ef4bbfbfd 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -554,7 +554,7 @@ def add_page(
         Returns:
             The added PageObject.
         """
-        assert self.flattened_pages is not None
+        assert self.flattened_pages is not None, "mypy"
         return self._add_page(page, len(self.flattened_pages), excluded_keys)
 
     def insert_page(
@@ -575,7 +575,7 @@ def insert_page(
         Returns:
             The added PageObject.
         """
-        assert self.flattened_pages is not None
+        assert self.flattened_pages is not None, "mypy"
         if index < 0:
             index = len(self.flattened_pages) + index
         if index < 0:

From 412167298ebe5e8c55996f0b3ee24bcc6e1b8838 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 10:24:26 +0200
Subject: [PATCH 35/40] doc hash_bin

---
 pypdf/generic/_base.py | 45 +++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
index 9dfb25a29..d02a79810 100644
--- a/pypdf/generic/_base.py
+++ b/pypdf/generic/_base.py
@@ -55,9 +55,10 @@ class PdfObject(PdfObjectProtocol):
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         raise NotImplementedError(
             f"{self.__class__.__name__} does not implement .hash_bin() so far"
@@ -186,9 +187,10 @@ def clone(
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         return hash((self.__class__,))
 
@@ -230,9 +232,10 @@ def clone(
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         return hash((self.__class__, self.value))
 
@@ -282,9 +285,10 @@ def __hash__(self) -> int:
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))
 
@@ -448,9 +452,10 @@ def clone(
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         return hash((self.__class__, self.as_numeric))
 
@@ -501,9 +506,10 @@ def clone(
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         return hash((self.__class__, self.as_numeric()))
 
@@ -552,9 +558,10 @@ def clone(
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         return hash((self.__class__, bytes(self)))
 
@@ -639,9 +646,10 @@ def clone(
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         return hash((self.__class__, self.original_bytes))
 
@@ -743,9 +751,10 @@ def clone(
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         return hash((self.__class__, self))
 

From bcc5c1da7ecede2c9b7d07de545827bf57823107 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Sep 2024 10:25:57 +0200
Subject: [PATCH 36/40] doc hash_bin

---
 pypdf/generic/_data_structures.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index fc71bf5bf..08bc2806d 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -133,9 +133,10 @@ def clone(
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         return hash((self.__class__, tuple(x.hash_bin() for x in self)))
 
@@ -381,9 +382,10 @@ def _clone(
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         return hash(
             (self.__class__, tuple(((k, v.hash_bin()) for k, v in self.items())))
@@ -896,9 +898,10 @@ def _clone(
 
     def hash_bin(self) -> int:
         """
+        Used to detect modified objects.
+
         Returns:
-            hash considering type and value
-        used to detect modified object
+            Hash considering type and value.
         """
         # use of _data to prevent errors on non decoded stream such as JBIG2
         return hash((super().hash_bin(), self._data))

From bc6cabab1045b462a735a7124b504452ca737fd5 Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Sun, 8 Sep 2024 16:39:14 +0200
Subject: [PATCH 37/40] Update pypdf/_page.py

---
 pypdf/_page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 3888dcf6c..88943c3de 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -2439,7 +2439,7 @@ def __delitem__(self, index: Union[int, slice]) -> None:
                 parent = parent.get("/Parent", None)
             except ValueError:  # from index
                 if first:
-                    raise PdfReadError(f"Page Not Found in Page Tree {ind}")
+                    raise PdfReadError(f"Page not found in page tree: {ind}")
                 break
 
     def __iter__(self) -> Iterator[PageObject]:

From 8659de278b987fad07c3358c96a2a881c4d4949d Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Sun, 8 Sep 2024 16:39:54 +0200
Subject: [PATCH 38/40] Update pypdf/_writer.py

---
 pypdf/_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index ef4bbfbfd..e0a680972 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -155,7 +155,7 @@ class PdfWriter(PdfDocCommon):
     incremental: If true, loads the document and set the PdfWriter in incremental mode
 
     When writing incrementally, the original document is written first and new/modified
-    are appened. to be used for signed document/forms to keep signature valid.
+    content is appended. To be used for signed documents/forms to keep the signature valid.
     """
 
     def __init__(

From 99e6dfc93abdf931fc89485ffa5ddf7a49a7010d Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Sun, 8 Sep 2024 16:42:51 +0200
Subject: [PATCH 39/40] Apply suggestions from code review

---
 pypdf/_writer.py   | 10 +++++-----
 pypdf/constants.py |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index e0a680972..8d6d9f390 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -172,13 +172,13 @@ def __init__(
         self._objects: List[Optional[PdfObject]] = []
         """
         The indirect objects in the PDF.
-        for the incremental it will be filled with None
-        in clone_reader_document_root
+        For the incremental case, it will be filled with None
+        in clone_reader_document_root.
         """
 
         self._original_hash: List[int] = []
         """
-        list of hashes after import; used to identify changes
+        List of hashes after import; used to identify changes.
         """
 
         self._idnum_hash: Dict[bytes, Tuple[IndirectObject, List[IndirectObject]]] = {}
@@ -454,7 +454,7 @@ def _add_page(
         excluded_keys: Iterable[str] = (),
     ) -> PageObject:
         if not isinstance(page, PageObject) or page.get(PA.TYPE, None) != CO.PAGE:
-            raise ValueError("Invalid page Object")
+            raise ValueError("Invalid page object")
         assert self.flattened_pages is not None, "for mypy"
         page_org = page
         excluded_keys = list(excluded_keys)
@@ -1396,7 +1396,7 @@ def list_objects_in_increment(self) -> List[IndirectObject]:
         For debugging/analysis.
         Provides the list of new/modified objects that will be written
         in the increment.
-        Deleted Objects will not be freeed but will become orphans
+        Deleted objects will not be freed but will become orphans.
 
         Returns:
             List of (new / modified) IndirectObjects
diff --git a/pypdf/constants.py b/pypdf/constants.py
index a7e67aacc..d7a8e310f 100644
--- a/pypdf/constants.py
+++ b/pypdf/constants.py
@@ -210,7 +210,7 @@ class PagesAttributes:
     PARENT = "/Parent"  # dictionary, required; indirect reference to pages object
     KIDS = "/Kids"  # array, required; List of indirect references
     COUNT = "/Count"  # integer, required; the number of leaf nodes (page objects)
-    # that are descendants of this node within the page tree
+                      # that are descendants of this node within the page tree
 
 
 class PageAttributes:

From be488722c96e4da09a254401686fdfbe54bd33e6 Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Sat, 14 Sep 2024 13:17:21 +0200
Subject: [PATCH 40/40] improve docs

---
 pypdf/_writer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 73fcbee15..edcd391e4 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1566,9 +1566,9 @@ def metadata(self) -> Optional[DocumentInformation]:
         Retrieve/set the PDF file's document information dictionary, if it exists.
 
         Args:
-            value: dict with the entries to be set. if None : remove the /Info entry from the pdf.
+            value: Dictionary with the entries to set. If None, remove the /Info entry from the PDF.
 
-        Note that some PDF files use (xmp)metadata streams instead of document
+        Note that some PDF files use (XMP) metadata streams instead of document
         information dictionaries, and these metadata streams will not be
         accessed by this function.
         """