Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix images issue 4 bits encoding and LUT starting with UTF16_BOM #2675

Merged
merged 9 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import codecs
import collections
import decimal
import enum
Expand Down Expand Up @@ -180,13 +179,7 @@ def __init__(

# info object
info = DictionaryObject()
info.update(
{
NameObject("/Producer"): create_string_object(
codecs.BOM_UTF16_BE + "pypdf".encode("utf-16be")
)
}
)
info.update({NameObject("/Producer"): create_string_object("pypdf")})
stefan6419846 marked this conversation as resolved.
Show resolved Hide resolved
self._info_obj: PdfObject = self._add_object(info)

# root object
Expand Down
44 changes: 38 additions & 6 deletions pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,28 @@ class TextStringObject(str, PdfObject): # noqa: SLOT000
to occur.
"""

autodetect_pdfdocencoding: bool
autodetect_utf16: bool
utf16_bom: bytes

def __new__(cls, value: Any) -> "TextStringObject":
    """
    Build the string and record how it should be encoded when written back.

    Args:
        value: The text. ``bytes`` input is decoded with the ``charmap``
            codec first; any other value is passed to ``str`` as is.

    Returns:
        The new object with ``autodetect_utf16``,
        ``autodetect_pdfdocencoding`` and ``utf16_bom`` initialised:

        * text starting with ``\\xfe\\xff`` or ``\\xff\\xfe`` is flagged as
          UTF-16 and those two characters are stored as ``utf16_bom``;
        * otherwise, text accepted by ``encode_pdfdocencoding`` is flagged
          as PDFDocEncoding;
        * anything else is flagged as UTF-16 with an empty ``utf16_bom``.
    """
    if isinstance(value, bytes):
        value = value.decode("charmap")
    o = str.__new__(cls, value)
    o.autodetect_utf16 = False
    o.autodetect_pdfdocencoding = False
    o.utf16_bom = b""
    # Sniff the BOM on the constructed string, not on ``value``: ``value`` is
    # typed ``Any`` and may be an object that ``str`` accepts but that has no
    # ``startswith`` or slicing of its own.
    if o.startswith(("\xfe\xff", "\xff\xfe")):
        o.autodetect_utf16 = True
        o.utf16_bom = o[:2].encode("charmap")
    else:
        try:
            # Only the success or failure of the encoding matters here.
            encode_pdfdocencoding(o)
            o.autodetect_pdfdocencoding = True
        except UnicodeEncodeError:
            o.autodetect_utf16 = True
    return o

def clone(
self,
pdf_dest: Any,
Expand All @@ -518,13 +540,11 @@ def clone(
obj = TextStringObject(self)
obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
obj.autodetect_utf16 = self.autodetect_utf16
obj.utf16_bom = self.utf16_bom
return cast(
"TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate)
)

autodetect_pdfdocencoding = False
autodetect_utf16 = False

@property
def original_bytes(self) -> bytes:
"""
Expand All @@ -542,20 +562,32 @@ def get_original_bytes(self) -> bytes:
# would have been used to create this object, based upon the autodetect
# method.
if self.autodetect_utf16:
return codecs.BOM_UTF16_BE + self.encode("utf-16be")
if self.utf16_bom == codecs.BOM_UTF16_LE:
return codecs.BOM_UTF16_LE + self.encode("utf-16le")
elif self.utf16_bom == codecs.BOM_UTF16_BE:
return codecs.BOM_UTF16_BE + self.encode("utf-16be")
else:
return self.encode("utf-16be")
elif self.autodetect_pdfdocencoding:
return encode_pdfdocencoding(self)
else:
raise Exception("no information about original bytes")
raise Exception("no information about original bytes") # pragma: no cover

def get_encoded_bytes(self) -> bytes:
    """
    Return the bytes used to write this text string.

    PDFDocEncoding is tried first because it is nicer to look at in the PDF
    file; the attempt costs some performance. UTF-16 is used when
    ``autodetect_utf16`` is set or when the text cannot be encoded with
    PDFDocEncoding.

    Returns:
        The PDFDocEncoding bytes, or the UTF-16 bytes. The UTF-16 output is
        little-endian with its BOM when ``utf16_bom`` is the little-endian
        BOM, big-endian with its BOM when ``utf16_bom`` is the big-endian
        BOM, and big-endian without any BOM otherwise.
    """
    if not self.autodetect_utf16:
        try:
            return encode_pdfdocencoding(self)
        except UnicodeEncodeError:
            # Not representable in PDFDocEncoding: fall through to UTF-16.
            pass
    if self.utf16_bom == codecs.BOM_UTF16_LE:
        return codecs.BOM_UTF16_LE + self.encode("utf-16le")
    if self.utf16_bom == codecs.BOM_UTF16_BE:
        return codecs.BOM_UTF16_BE + self.encode("utf-16be")
    # No BOM recorded: emit big-endian data without a prefix.
    return self.encode("utf-16be")

def write_to_stream(
Expand Down
1 change: 1 addition & 0 deletions pypdf/generic/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ def create_string_object(
if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
retval = TextStringObject(string.decode("utf-16"))
retval.autodetect_utf16 = True
retval.utf16_bom = string[:2]
return retval
else:
# This is probably a big performance hit here, but we need
Expand Down
29 changes: 23 additions & 6 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Test the pypdf.generic module."""

import codecs
from base64 import a85encode
from copy import deepcopy
from io import BytesIO
Expand Down Expand Up @@ -485,14 +486,13 @@ def test_rectangleobject():

def test_textstringobject_exc():
    """Check that a freshly built plain text string exposes its original bytes."""
    tso = TextStringObject("foo")
    # The constructor always selects an encoding strategy, so the
    # "no information about original bytes" error is not expected here;
    # the call must simply return the encoded text.
    assert tso.get_original_bytes() == b"foo"


def test_textstringobject_autodetect_utf16():
    """With UTF-16 forced and a big-endian BOM set, the original bytes carry that BOM."""
    text_obj = TextStringObject("foo")
    text_obj.autodetect_utf16 = True
    text_obj.utf16_bom = codecs.BOM_UTF16_BE

    expected = b"\xfe\xff\x00f\x00o\x00o"
    actual = text_obj.get_original_bytes()
    assert actual == expected


Expand Down Expand Up @@ -1107,20 +1107,37 @@ def test_indirect_object_invalid_read():
assert exc.value.args[0] == "Error reading indirect object reference at byte 0x5"


def test_create_string_object_utf16be_bom():
def test_create_string_object_utf16_bom():
# utf16-be
result = create_string_object(
b"\xfe\xff\x00P\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00"
)
assert result == "PaperPort 14\x00"
assert result.autodetect_utf16 is True
assert result.utf16_bom == b"\xfe\xff"
assert (
result.get_encoded_bytes()
== b"\xfe\xff\x00P\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00"
)


def test_create_string_object_utf16le_bom():
# utf16-le
result = create_string_object(
b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
)
assert result == "PaperPort 14\x00"
assert result.autodetect_utf16 is True
assert result.utf16_bom == b"\xff\xfe"
assert (
result.get_encoded_bytes()
== b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
)

# utf16-be without bom
result = TextStringObject("ÿ")
result.autodetect_utf16 = True
result.utf16_bom = b""
assert result.get_encoded_bytes() == b"\x00\xFF"
assert result.original_bytes == b"\x00\xFF"


def test_create_string_object_force():
Expand Down
16 changes: 16 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,22 @@ def test_large_compressed_image():
list(reader.pages[0].images)


@pytest.mark.enable_socket()
def test_ff_fe_starting_lut():
    """
    Regression test for issue #2660.

    Loads the sample PDF into a writer, writes it out and reads it back, then
    checks that the first image of the second page matches the reference PNG
    both in the writer and in the re-read document.
    """
    pdf_url = "https://github.com/py-pdf/pypdf/files/15385628/original_before_merge.pdf"
    pdf_data = get_data_from_url(pdf_url, name="iss2660.pdf")
    writer = PdfWriter(BytesIO(pdf_data))

    output = BytesIO()
    writer.write(output)
    reader = PdfReader(output)

    png_url = "https://github.com/py-pdf/pypdf/assets/4083478/6150700d-87fd-43a2-8695-c2c05a44838c"
    png_data = get_data_from_url(png_url, name="iss2660.png")
    expected = Image.open(BytesIO(png_data))

    # Check the in-memory writer first, then the document read back from it.
    for document in (writer, reader):
        assert image_similarity(document.pages[1].images[0].image, expected) == 1.0


@pytest.mark.enable_socket()
def test_inline_image_extraction():
"""Cf #2598"""
Expand Down
Loading