ROB: cope with 2-digit hex codes in bfchar

fixes py-pdf#1293
pubpub-zz · Sep 1, 2022 · 40df2fd · 40df2fd
1 parent 7a95708
commit 40df2fd
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 2 deletions.
diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py
@@ -289,11 +289,11 @@ def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> No
         map_to = ""
         # placeholder (see above) means empty string
         if lst[1] != b".":
-            map_to = unhexlify(lst[1]).decode(
+            map_to = unhexlify(b"0" * max(0, 4 - len(lst[1])) + lst[1]).decode(
                 "utf-16-be", "surrogatepass"
             )  # join is here as some cases where the code was split
         map_dict[
-            unhexlify(lst[0]).decode(
+            unhexlify(b"0" * max(0, 4 - len(lst[0])) + lst[0]).decode(
                 "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
             )
         ] = map_to

diff --git a/tests/test_cmap.py b/tests/test_cmap.py
@@ -46,3 +46,12 @@ def test_get_font_width_from_default():  # L40
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
     for page in reader.pages:
         page.extract_text()
+
+
+def test_bfchar_on_2_chars():
+    # iss #1293
+    url = "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf"
+    name = "ASurveyofImageClassificationBasedTechniques.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    for page in reader.pages:
+        page.extract_text()