Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improved ExtractText(3) #969

Merged
merged 45 commits into from
Jun 13, 2022
Merged
Show file tree
Hide file tree
Changes from 42 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
48421df
Relative import
pubpub-zz Jun 10, 2022
c7829d8
improve TextExtraction
pubpub-zz Jun 10, 2022
7a9c22c
Extend testing
pubpub-zz Jun 10, 2022
b0a7736
improve readability of BooleanObjects
pubpub-zz Jun 10, 2022
d7f84d0
Apply Black
pubpub-zz Jun 10, 2022
59504ec
fix early mypy
pubpub-zz Jun 10, 2022
58bd0e5
fix mypy2
pubpub-zz Jun 10, 2022
941461a
attempt fix iss with test_utils
pubpub-zz Jun 10, 2022
e4c37cb
Merge branch 'main' into ExtractText
pubpub-zz Jun 10, 2022
39e94f9
Minor flake8 fix
MartinThoma Jun 10, 2022
9763868
Adjust mypy types
MartinThoma Jun 10, 2022
53294f2
Merge branch 'pubpub-zz-ExtractText' into origin/ExtractText
pubpub-zz Jun 10, 2022
744464f
revert in test_utils
pubpub-zz Jun 10, 2022
0ed4d9a
paste error
pubpub-zz Jun 10, 2022
5b96216
flake 8
pubpub-zz Jun 10, 2022
b2830e9
flake8
pubpub-zz Jun 10, 2022
1223d75
Add 'test_previous_line' back
MartinThoma Jun 11, 2022
f5c6406
Merge remote-tracking branch 'py-pdf/pubpub-zz-ExtractText' into Extr…
pubpub-zz Jun 11, 2022
284519b
fix iss with test_util
pubpub-zz Jun 11, 2022
6dd00e2
Update tests/test_workflows.py
pubpub-zz Jun 11, 2022
921d396
flake8
pubpub-zz Jun 11, 2022
ee06f4f
flake8
pubpub-zz Jun 11, 2022
d90109d
flake8
pubpub-zz Jun 11, 2022
8ea99ef
ROB : extract text of empty page
pubpub-zz Jun 11, 2022
e5948fc
ROB : cope with missing cr in cmap
pubpub-zz Jun 11, 2022
0bd3f04
Fix intrepretation for /Symbol and /Zapfdingbats
pubpub-zz Jun 11, 2022
e34b4a9
default encoding for missing Fonts
pubpub-zz Jun 11, 2022
932e103
TD operator forgotten
pubpub-zz Jun 11, 2022
2b423d8
Remove Tm scaling for lf detection
pubpub-zz Jun 11, 2022
b7ebc55
mix betwen str and bytes
pubpub-zz Jun 11, 2022
249d812
default encodings fix
pubpub-zz Jun 11, 2022
7e3ce51
extend tests
pubpub-zz Jun 11, 2022
114fe68
fix Flake8 and mypy
pubpub-zz Jun 11, 2022
ee8d4b6
replace test files with links
pubpub-zz Jun 11, 2022
9768d5f
Fix xform in xfoms inducing loop (#966)
pubpub-zz Jun 11, 2022
e9fd89c
ROB : cope with \r crlf
pubpub-zz Jun 12, 2022
e60a1fa
FIX : Incorrect decoding of Zapfdingbats
pubpub-zz Jun 12, 2022
2146056
ROB: cope with both /Encoding and /ToUnicode
pubpub-zz Jun 12, 2022
cd26abc
flake8
pubpub-zz Jun 12, 2022
534a8bb
fix Encoding / ToUnicode at the same time
pubpub-zz Jun 12, 2022
d92597a
Merge branch 'main' into ExtractText
MartinThoma Jun 13, 2022
0ba91aa
Apply suggestions from code review
MartinThoma Jun 13, 2022
88f1298
typo
pubpub-zz Jun 13, 2022
de7ddc0
typoUpdate PyPDF2/_cmap.py
pubpub-zz Jun 13, 2022
2aea3e9
fix 'utf-16-be' codec can't decode bytes in position 0-1: unexpected…
pubpub-zz Jun 13, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 157 additions & 47 deletions PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,41 +10,115 @@
# code freely inspired from @twiggy ; see #711
def build_char_map(
font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Dict[int, str], Dict]:
) -> Tuple[
str, float, Union[str, Dict[int, str]], Dict
]: # font_type,space_width /2, encoding, cmap
ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore
font_type: str = cast(str, ft["/Subtype"])

space_code = 32
encoding, space_code = parse_encoding(ft, space_code)
map_dict, space_code = parse_to_unicode(ft, space_code)
map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)

# encoding can be either a string naming a codec for decode() (on 1, 2 or a variable number of bytes) or a char table (for 1 byte only, as far as I know)
# an empty string means that the /Encoding field is not present and we have to select the right encoding from the cmap input data
if encoding == "":
if -1 not in map_dict or map_dict[-1] == 1:
# I have not been able to find any rule for the case with neither /Encoding nor /ToUnicode
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
# One example shows /Symbol,bold; I consider 8-bit encoding to be the default
encoding = "charmap"
else:
encoding = "utf-16-be"
# apply the rule from PDF ref 1.7 §5.9.1, 1st bullet: if the cmap is not empty, the encoding should be discarded (here transformed into identity for those characters)
# if encoding is a str, it is expected to be an identity translation
elif isinstance(encoding, dict):
for x in int_entry:
if x <= 255:
encoding[x] = chr(x)
if font_name in _default_fonts_space_width:
# override space_width with new params
space_width = _default_fonts_space_width[font_name]
sp_width = compute_space_width(ft, space_code, space_width)

return (
font_type,
float(sp_width / 2),
dict(zip(range(256), encoding)),
encoding,
# https://github.com/python/mypy/issues/4374
"".maketrans(map_dict), # type: ignore
)
map_dict, # type: ignore
) # type: ignore


# used when data is missing, e.g. when the font definition is missing
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
# Fallback character map, shaped like the tuple returned by build_char_map:
# (font type, space width, encoding table, cmap).
# Every one of the 256 single-byte codes maps to the replacement character
# and the cmap is empty.
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict] = (
    "Unknown",
    9999,
    dict.fromkeys(range(256), "�"),
    {},
)


# Predefined CMap names that may appear as the value of a font's /Encoding,
# mapped to the name of the Python codec stored as the font's encoding.
# NOTE(review): the entries marked "TBC" presumably mean "to be confirmed";
# verify the gbk / gb2312 choices against real documents.
_predefined_cmap: Dict[str, str] = {
"/Identity-H": "utf-16-be",
"/Identity-V": "utf-16-be",
"/GB-EUC-H": "gbk", # TBC
"/GB-EUC-V": "gbk", # TBC
"/GBpc-EUC-H": "gb2312", # TBC
"/GBpc-EUC-V": "gb2312", # TBC
}


def parse_encoding(ft: DictionaryObject, space_code: int) -> Tuple[List[str], int]:
encoding: List[str] = []
# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
_default_fonts_space_width: Dict[str, int] = {
"/Courrier": 600,
"/Courier-Bold": 600,
"/Courier-BoldOblique": 600,
"/Courier-Oblique": 600,
"/Helvetica": 278,
"/Helvetica-Bold": 278,
"/Helvetica-BoldOblique": 278,
"/Helvetica-Oblique": 278,
"/Helvetica-Narrow": 228,
"/Helvetica-NarrowBold": 228,
"/Helvetica-NarrowBoldOblique": 228,
"/Helvetica-NarrowOblique": 228,
"/Times-Roman": 250,
"/Times-Bold": 250,
"/Times-BoldItalic": 250,
"/Times-Italic": 250,
"/Symbol": 250,
"/ZapfDingbats": 278,
}


def parse_encoding(
ft: DictionaryObject, space_code: int
) -> Tuple[Union[str, Dict[int, str]], int]:
encoding: Union[str, List[str], Dict[int, str]] = []
if "/Encoding" not in ft:
return encoding, space_code
try:
if "/BaseFont" in ft and ft["/BaseFont"] in charset_encoding:
encoding = dict(zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])]))
else:
encoding = "charmap"
return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])]
except Exception:
if ft["/Subtype"] == "/Type1":
return "charmap", space_code
else:
return "", space_code
enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore
if isinstance(enc, str):
try:
if enc in ("/Identity-H", "/Identity-V"):
encoding = []
else:
if enc in charset_encoding:
encoding = charset_encoding[enc].copy()
elif enc in _predefined_cmap:
encoding = _predefined_cmap[enc]
else:
raise Exception("not found")
except Exception:
warnings.warn(
f"Advanced encoding {encoding} not implemented yet",
f"Advanced encoding {enc} not implemented yet",
PdfReadWarning,
)
encoding = charset_encoding["/StandardCoding"].copy()
encoding = enc
elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
try:
encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
Expand All @@ -57,76 +131,112 @@ def parse_encoding(ft: DictionaryObject, space_code: int) -> Tuple[List[str], in
else:
encoding = charset_encoding["/StandardCoding"].copy()
if "/Differences" in enc:
x = 0
x: int = 0
o: Union[int, str]
for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
if isinstance(o, int):
x = o
else:
else: # isinstance(o,str):
try:
encoding[x] = adobe_glyphs[o]
encoding[x] = adobe_glyphs[o] # type: ignore
except Exception:
encoding[x] = o
encoding[x] = o # type: ignore
if o == " ":
space_code = x
x += 1
if isinstance(encoding, list):
encoding = dict(zip(range(256), encoding))
return encoding, space_code


def parse_to_unicode(ft: DictionaryObject, space_code: int) -> Tuple[Dict, int]:
map_dict: Dict[Any, Any] = {}
def parse_to_unicode(ft: DictionaryObject, space_code: int) -> Tuple[Dict, int, List[int]]:
map_dict: Dict[
Any, Any
] = (
{}
) # will store all translation code and map_dict[-1] we will have the number of bytes to convert
int_entry : List[int] = [] # will provide the list of cmap keys as int to correct encoding
if "/ToUnicode" not in ft:
return map_dict, space_code
return {}, space_code, []
process_rg: bool = False
process_char: bool = False
cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8")
for l in (
cm: bytes = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
# we need to preprocess cm first, because line breaks may be missing in PDFs printed to PDF from Word
cm = (
cm.strip()
.replace("<", " ")
.replace(">", "")
.replace("[", " [ ")
.replace("]", " ] ")
.split("\n")
):
if l == "":
.replace(b"beginbfchar", b"\nbeginbfchar\n")
.replace(b"endbfchar", b"\nendbfchar\n")
.replace(b"beginbfrange", b"\nbeginbfrange\n")
.replace(b"endbfrange", b"\nendbfrange\n")
.replace(b"<<", b"\n{\n") # text between << and >> not used but
.replace(b">>", b"\n}\n") # some solution to find it back
)
ll = cm.split(b"<")
for i in range(len(ll)):
j = ll[i].find(b">")
if j >= 0:
ll[i] = ll[i][:j].replace(b" ", b"") + b" " + ll[i][j + 1 :]
cm = (b" ".join(ll)).replace(b"[", b" [ ").replace(b"]", b" ]\n ").replace(b"\r", b"\n")

for l in cm.split(b"\n"):
if l in (b"", b" "):
continue
if "beginbfrange" in l:
if b"beginbfrange" in l:
process_rg = True
elif "endbfrange" in l:
elif b"endbfrange" in l:
process_rg = False
elif "beginbfchar" in l:
elif b"beginbfchar" in l:
process_char = True
elif "endbfchar" in l:
elif b"endbfchar" in l:
process_char = False
elif process_rg:
lst = [x for x in l.split(" ") if x]
lst = [x for x in l.split(b" ") if x]
a = int(lst[0], 16)
b = int(lst[1], 16)
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
if lst[2] == "[":
nbi = len(lst[0])
map_dict[-1] = nbi // 2
fmt = b"%%0%dX" % nbi
if lst[2] == b"[":
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
for sq in lst[3:]:
if "]":
if sq == b"]":
break
map_dict[a] = unhexlify(sq).decode("utf-16-be")
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be"
)
] = unhexlify(sq).decode("utf-16-be")
Copy link
Member

@MartinThoma MartinThoma Jun 12, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be"
)
] = unhexlify(sq).decode("utf-16-be")
from binascii import Error as BinasciiError
hexlified = fmt % a
try:
unhexlified = unhexlify(hexlified)
except BinasciiError:
break
try:
key = unhexlified.decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be"
)
except UnicodeDecodeError:
break
try:
map_dict[key] = unhexlify(fmt2 % c).decode("utf-16-be")
except UnicodeDecodeError:
map_dict[key] = key

First part is triggered by https://corpora.tika.apache.org/base/docs/govdocs1/985/985942.pdf

  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_page.py", line 1303, in extract_text
    return self._extract_text(self, self.pdf, space_width, PG.CONTENTS)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_page.py", line 1126, in _extract_text
    cmaps[f] = build_char_map(f, space_width, obj)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_cmap.py", line 20, in build_char_map
    map_dict, space_code = parse_to_unicode(ft, space_code)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_cmap.py", line 215, in parse_to_unicode
    key = unhexlify(hexlified).decode(
binascii.Error: Odd-length string

Second part is triggered by https://corpora.tika.apache.org/base/docs/govdocs1/999/999448.pdf

  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_page.py", line 1303, in extract_text
    return self._extract_text(self, self.pdf, space_width, PG.CONTENTS)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_page.py", line 1126, in _extract_text
    cmaps[f] = build_char_map(f, space_width, obj)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_cmap.py", line 20, in build_char_map
    map_dict, space_code = parse_to_unicode(ft, space_code)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_cmap.py", line 220, in parse_to_unicode
    key = unhexlified.decode(
  File "/home/moose/.pyenv/versions/3.10.2/lib/python3.10/encodings/utf_16_be.py", line 16, in decode
    return codecs.utf_16_be_decode(input, errors, True)
UnicodeDecodeError: 'utf-16-be' codec can't decode bytes in position 0-1: unexpected end of data

Third part is triggered by https://corpora.tika.apache.org/base/docs/govdocs1/971/971703.pdf

  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_page.py", line 1302, in extract_text
    return self._extract_text(self, self.pdf, space_width, PG.CONTENTS)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_page.py", line 1125, in _extract_text
    cmaps[f] = build_char_map(f, space_width, obj)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_cmap.py", line 20, in build_char_map
    map_dict, space_code = parse_to_unicode(ft, space_code)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_cmap.py", line 212, in parse_to_unicode
    ] = unhexlify(fmt2 % c).decode("utf-16-be")
  File "/home/moose/.pyenv/versions/3.10.2/lib/python3.10/encodings/utf_16_be.py", line 16, in decode
    return codecs.utf_16_be_decode(input, errors, True)
UnicodeDecodeError: 'utf-16-be' codec can't decode byte 0x20 in position 0: truncated data

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, I'm pretty certain there is a better way to deal with that. I just don't know how.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've retested the file and (with the latest mods) the issue is now somewhere else:

  File "C:\Python310\lib\site-packages\PyPDF2\_page.py", line 1302, in extract_text
    return self._extract_text(self, self.pdf, space_width, PG.CONTENTS)
  File "C:\Python310\lib\site-packages\PyPDF2\_page.py", line 1284, in _extract_text
    process_operation(operator, operands)
  File "C:\Python310\lib\site-packages\PyPDF2\_page.py", line 1222, in process_operation
    t = tt.decode(cmap[0])  # apply str encoding
  File "C:\Python310\lib\encodings\utf_16_be.py", line 16, in decode
    return codecs.utf_16_be_decode(input, errors, True)
UnicodeDecodeError: 'utf-16-be' codec can't decode byte 0x20 in position 0: truncated data

For this latest issue, I've checked the PDF and the font transcoding does expect the standard 2 bytes. At one place the file shows only b" " (just before it I noted a <0055>). I don't know how to cope with this situation.

int_entry.append(a)
a += 1
assert a > b
else:
c = int(lst[2], 16)
fmt = b"%%0%dX" % len(lst[2])
fmt2 = b"%%0%dX" % len(lst[2])
while a <= b:
map_dict[a] = unhexlify(fmt % c).decode("utf-16-be")
map_dict[
unhexlify(fmt % a).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be"
)
] = unhexlify(fmt2 % c).decode("utf-16-be")
int_entry.append(a)
a += 1
c += 1
elif process_char:
lst = [x for x in l.split(" ") if x]
a = int(lst[0], 16)
map_dict[a] = unhexlify("".join(lst[1:])).decode(
"utf-16-be"
) # join is here as some cases where the code was split

# get
lst = [x for x in l.split(b" ") if x]
map_dict[-1] = len(lst[0]) // 2
while len(lst) > 0:
map_dict[
unhexlify(lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be"
)
] = unhexlify(lst[1]).decode(
"utf-16-be"
) # join is here as some cases where the code was split
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
map_dict[
unhexlify(lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be"
)
] = unhexlify(lst[1]).decode(
"utf-16-be"
) # join is here as some cases where the code was split
hexlified = lst[0]
try:
unhexlified = unhexlify(hexlified)
except BinasciiError:
# Odd-length string
break
key = unhexlified.decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be"
)
# join is here as some cases where the code was split
map_dict[key] = unhexlify(lst[1]).decode("utf-16-be")

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you show me an example of the issue here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure:

  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_page.py", line 1302, in extract_text
    return self._extract_text(self, self.pdf, space_width, PG.CONTENTS)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_page.py", line 1125, in _extract_text
    cmaps[f] = build_char_map(f, space_width, obj)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_cmap.py", line 21, in build_char_map
    map_dict, space_code = parse_to_unicode(ft, space_code)
  File "/home/moose/Github/py-pdf/PyPDF2/PyPDF2/_cmap.py", line 209, in parse_to_unicode
    unhexlify(fmt % a).decode(
binascii.Error: Odd-length string

for https://corpora.tika.apache.org/base/docs/govdocs1/999/999097.pdf

int_entry.append(int(lst[0], 16))
lst = lst[2:]
for a in map_dict:
if map_dict[a] == " ":
space_code = a
return map_dict, space_code
return map_dict, space_code, int_entry


def compute_space_width(
Expand Down
Loading