Skip to content

Commit

Permalink
ROB : cope with 2 digit codes in bfchar
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz committed Sep 1, 2022
1 parent 7a95708 commit 40df2fd
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 2 deletions.
4 changes: 2 additions & 2 deletions PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,11 +289,11 @@ def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> No
map_to = ""
# placeholder (see above) means empty string
if lst[1] != b".":
map_to = unhexlify(lst[1]).decode(
map_to = unhexlify(b"0" * max(0, 4 - len(lst[1])) + lst[1]).decode(
"utf-16-be", "surrogatepass"
) # join is here as some cases where the code was split
map_dict[
unhexlify(lst[0]).decode(
unhexlify(b"0" * max(0, 4 - len(lst[0])) + lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
)
] = map_to
Expand Down
9 changes: 9 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,12 @@ def test_get_font_width_from_default(): # L40
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
for page in reader.pages:
page.extract_text()


def test_bfchar_on_2_chars():
    """Smoke test: text extraction on the issue #1293 PDF must not raise.

    The document is downloaded, wrapped in an in-memory stream, and every
    page is passed through ``extract_text()``. Nothing is asserted about
    the extracted text; the test passes as long as no exception is raised.
    """
    # iss #1293
    pdf_bytes = get_pdf_from_url(
        "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf",
        name="ASurveyofImageClassificationBasedTechniques.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))
    for current_page in document.pages:
        # Return value is intentionally discarded; only absence of errors matters.
        current_page.extract_text()

0 comments on commit 40df2fd

Please sign in to comment.