BUG: Process CMYK images without a filter correctly (#2557)

Closes #2522
py-pdf · Mar 30, 2024 · 7883580
1 parent 42f970e
commit 7883580
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 2 deletions.
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -817,9 +817,16 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
             ".tiff",
             False,
         )
+    elif mode == "CMYK":
+        img, image_format, extension, invert_color = (
+            Image.frombytes(mode, size, data),
+            "TIFF",
+            ".tif",
+            False,
+        )
+    elif mode == "":
+        raise PdfReadError(f"ColorSpace field not found in {x_object_obj}")
     else:
-        if mode == "":
-            raise PdfReadError(f"ColorSpace field not found in {x_object_obj}")
         img, image_format, extension, invert_color = (
             Image.frombytes(mode, size, data),
             "PNG",

diff --git a/tests/test_images.py b/tests/test_images.py
@@ -246,3 +246,12 @@ def test_bi_in_text():
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     assert reader.pages[0].images.keys() == ["~0~"]
     assert reader.pages[0].images[0].name == "~0~.png"
+
+
+@pytest.mark.enable_socket()
+def test_cmyk_no_filter():
+    """Cf #2522"""
+    url = "https://github.com/py-pdf/pypdf/files/14614887/out3.pdf"
+    name = "iss2522.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    reader.pages[0].images[0].image