py-pdf · Lucas-C · Aug 2, 2023 · Jun 14, 2023 · Jun 14, 2023 · Jun 16, 2023
@@ -6,13 +6,20 @@
 
 from collections import defaultdict
 from dataclasses import dataclass, replace
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union
 
 from fontTools import ttLib
 
+try:
+    import uharfbuzz as hb
+except ImportError:
+    # uharfbuzz is an optional dependency, only needed for text shaping
+    hb = None
+
 from .drawing import DeviceGray, DeviceRGB
 from .enums import FontDescriptorFlags, TextEmphasis
+from .errors import FPDFException
 from .syntax import Name, PDFObject
+from .util import escape_parens
 
 
 @dataclass
@@ -61,19 +68,28 @@ def __init__(self, fpdf, fontkey, style):
         self.fontkey = fontkey
         self.emphasis = TextEmphasis.coerce(style)
 
+    def get_text_width(self, text, font_size_pt, text_shaping=False):
+        """
+        Return a `(character count, width)` tuple for `text`.
+
+        The width is the sum of the per-character widths found in `self.cw`,
+        multiplied by `font_size_pt * 0.001`.
+        `text_shaping` is accepted only so that this method can be called
+        with the same arguments as `TTFFont.get_text_width`; it is ignored.
+        """
+        return (len(text), sum(self.cw[c] for c in text) * font_size_pt * 0.001)
+
+    def convert_pdf_text(self, text):
+        """
+        Return the PDF `Tj` (show text) instruction for `text`,
+        after passing it through `escape_parens`.
+        """
+        escaped = escape_parens(text)
+        return f"({escaped}) Tj"
+
 
 class TTFFont:
     __slots__ = (
         "i",
         "type",
         "name",
         "desc",
+        "glyph_ids",
+        "hbfont",
         "up",
         "ut",
         "cw",
         "ttffile",
         "fontkey",
         "emphasis",
+        "scale",
         "subset",
         "cmap",
     )
@@ -86,8 +102,8 @@ def __init__(self, fpdf, font_file_path, fontkey, style):
 
         font = ttLib.TTFont(self.ttffile, fontNumber=0, lazy=True)
 
-        scale = 1000 / font["head"].unitsPerEm
-        default_width = round(scale * font["hmtx"].metrics[".notdef"][0])
+        self.scale = 1000 / font["head"].unitsPerEm
+        default_width = round(self.scale * font["hmtx"].metrics[".notdef"][0])
 
         try:
             cap_height = font["OS/2"].sCapHeight
@@ -104,13 +120,13 @@ def __init__(self, fpdf, font_file_path, fontkey, style):
             flags |= FontDescriptorFlags.FORCE_BOLD
 
         self.desc = PDFFontDescriptor(
-            ascent=round(font["hhea"].ascent * scale),
-            descent=round(font["hhea"].descent * scale),
-            cap_height=round(cap_height * scale),
+            ascent=round(font["hhea"].ascent * self.scale),
+            descent=round(font["hhea"].descent * self.scale),
+            cap_height=round(cap_height * self.scale),
             flags=flags,
             font_b_box=(
-                f"[{font['head'].xMin * scale:.0f} {font['head'].yMin * scale:.0f}"
-                f" {font['head'].xMax * scale:.0f} {font['head'].yMax * scale:.0f}]"
+                f"[{font['head'].xMin * self.scale:.0f} {font['head'].yMin * self.scale:.0f}"
+                f" {font['head'].xMax * self.scale:.0f} {font['head'].yMax * self.scale:.0f}]"
             ),
             italic_angle=int(font["post"].italicAngle),
             stem_v=round(50 + int(pow((font["OS/2"].usWeightClass / 65), 2))),
@@ -119,7 +135,16 @@ def __init__(self, fpdf, font_file_path, fontkey, style):
 
         # a map unicode_char -> char_width
         self.cw = defaultdict(lambda: default_width)
+
+        # fonttools cmap maps unicode chars to glyph names;
+        # keeping only its keys gives a tuple with
+        # the unicode characters available in the font
         self.cmap = tuple(font.getBestCmap().keys())
+
+        # saving a map of char -> glyph id to allow
+        # subsetting by unicode (regular) and by glyph
+        # (shaped with harfbuzz)
+        self.glyph_ids = {}
         for char in self.cmap:
             # take glyph associated to char
             glyph = font.getBestCmap()[char]
@@ -131,20 +156,81 @@ def __init__(self, fpdf, font_file_path, fontkey, style):
             if w == 65535:
                 w = 0
 
-            self.cw[char] = round(scale * w + 0.001)  # ROUND_HALF_UP
+            self.cw[char] = round(self.scale * w + 0.001)  # ROUND_HALF_UP
+
+            self.glyph_ids[char] = font.getGlyphID(glyph)
 
         # include numbers in the subset! (if alias present)
         # ensure that alias is mapped 1-by-1 additionally (must be replaceable)
-        sbarr = "\x00 "
+        sbarr = "\x00 \n\r"
         if fpdf.str_alias_nb_pages:
             sbarr += "0123456789"
             sbarr += fpdf.str_alias_nb_pages
 
         self.name = re.sub("[ ()]", "", font["name"].getBestFullName())
-        self.up = round(font["post"].underlinePosition * scale)
-        self.ut = round(font["post"].underlineThickness * scale)
+        self.up = round(font["post"].underlinePosition * self.scale)
+        self.ut = round(font["post"].underlineThickness * self.scale)
         self.emphasis = TextEmphasis.coerce(style)
-        self.subset = SubsetMap([ord(char) for char in sbarr])
+        self.subset = SubsetMap(self, [ord(char) for char in sbarr])
+
+    def get_text_width(self, text, font_size_pt, text_shaping):
+        """
+        Return a `(count, width)` tuple for `text`.
+
+        With `text_shaping` enabled the work is delegated to
+        `shaped_text_width`. Otherwise the count is `len(text)` and the width
+        is the sum of the `self.cw` entries for each character's code point,
+        multiplied by `font_size_pt * 0.001`.
+        """
+        if not text_shaping:
+            total_cw = sum(self.cw[ord(char)] for char in text)
+            return (len(text), total_cw * font_size_pt * 0.001)
+        return self.shaped_text_width(text, font_size_pt)
+
+    def shaped_text_width(self, text, font_size_pt):
+        """
+        Shape `text` with harfbuzz and return a `(glyph count, width)` tuple.
+
+        The width is the sum of `x_advance` and `x_offset` over all the
+        glyph positions produced by the shaper, with the "kern" and "liga"
+        features enabled.
+
+        Raises FPDFException if uharfbuzz could not be imported.
+        """
+        if hb is None:
+            raise FPDFException(
+                'uharfbuzz wasn\'t imported. Use "pip install uharfbuzz"'
+            )
+        if not hasattr(self, "hbfont"):
+            # the harfbuzz font is created on first use and kept on the instance
+            self.hbfont = hb.Font(hb.Face(hb.Blob.from_file_path(self.ttffile)))
+        self.hbfont.ptem = font_size_pt
+        buf = hb.Buffer()
+        buf.add_str("".join(text))
+        buf.guess_segment_properties()
+        features = {"kern": True, "liga": True}
+        hb.shape(self.hbfont, buf, features)
+        glyph_positions = buf.glyph_positions
+        # NOTE(review): unlike the unshaped branch of get_text_width, this sum
+        # is not multiplied by font_size_pt * 0.001 nor by self.scale -
+        # confirm that callers expect this unit
+        text_width = sum(pos.x_advance + pos.x_offset for pos in glyph_positions)
+        return (len(glyph_positions), text_width)
+
+    def convert_pdf_text(self, text):
+        """
+        Return the PDF `Tj` (show text) instruction for `text`.
+
+        The characters themselves are not written to the stream: each code
+        point is replaced by the position `self.subset.pick` returns for it,
+        and the result is encoded as UTF-16-BE before escaping.
+        """
+        subset_codes = "".join(chr(self.subset.pick(ord(char))) for char in text)
+        raw = subset_codes.encode("utf-16-be").decode("latin-1")
+        return f"({escape_parens(raw)}) Tj"
+
+    def shape_text(self, text, font_size_pt):
+        """
+        Shape `text` with harfbuzz ("kern" and "liga" features enabled).
+
+        Returns a `zip` pairing, for every shaped glyph, the value returned
+        by `self.subset.pick_glyph` with the matching harfbuzz glyph position.
+
+        Raises FPDFException if uharfbuzz could not be imported.
+        """
+        if hb is None:
+            raise FPDFException(
+                'uharfbuzz wasn\'t imported. Use "pip install uharfbuzz"'
+            )
+        if not hasattr(self, "hbfont"):
+            # the harfbuzz font is created on first use and kept on the instance
+            self.hbfont = hb.Font(hb.Face(hb.Blob.from_file_path(self.ttffile)))
+        self.hbfont.ptem = font_size_pt
+        buf = hb.Buffer()
+        buf.add_str(text)
+        buf.guess_segment_properties()
+        features = {"kern": True, "liga": True}
+        hb.shape(self.hbfont, buf, features)
+        char_mapped = []
+        # TODO: find cluster gaps
+        # Ex: text = "ABCD"
+        # glyph infos have clusters 0, 2, 3 - it means A and B are together on the first glyph
+        # (ligature or substitution) - the glyph should carry both unicodes and it should be
+        # translated properly in the CID to GID mapping.
+        # For now only the first character of each cluster is recorded.
+        for info in buf.glyph_infos:
+            glyph = self.subset.get_glyph(
+                glyph=info.codepoint, unicode=[ord(text[info.cluster])]
+            )
+            char_mapped.append(self.subset.pick_glyph(glyph))
+        return zip(char_mapped, buf.glyph_positions)
 
 
 class PDFFontDescriptor(PDFObject):
@@ -172,6 +258,18 @@ def __init__(
         self.font_name = None
 
 
+@dataclass(order=True, frozen=True)
+class Glyph:
+    """
+    One glyph of a font: its glyph id plus the unicode values it stands for.
+
+    `unicode` is a tuple since a ligature or a character substitution
+    can turn a sequence of unicode characters into a single glyph.
+    Instances are frozen (hence hashable) and ordered, comparing
+    `glyph_id` first and `unicode` second.
+    """
+
+    glyph_id: int
+    unicode: Tuple
+
+
 class SubsetMap:
     """Holds a mapping of used characters and their position in the font's subset
 
@@ -183,35 +281,64 @@ class SubsetMap:
     the lowest possible representation.
     """
 
-    def __init__(self, identities: List[int]):
+    def __init__(self, font: TTFFont, identities: List[int]):
+        """
+        font: font whose `glyph_ids` and `cmap` are used to turn unicode values into glyphs
+        identities: unicode values that are mapped to their own value in the subset
+        """
         super().__init__()
         self._next = 0
+        self.font = font
 
         # sort list to ease deletion once _next
         # becomes higher than first reservation
         self._reserved = sorted(identities)
 
         # int(x) to ensure values are integers
-        self._map = {x: int(x) for x in self._reserved}
+        self._map = {}
+        for x in self._reserved:
+            glyph = self.get_glyph(unicode=x)
+            # identities for which get_glyph() returns None are left out of the map
+            if glyph:
+                self._map[glyph] = int(x)
 
     def __len__(self):
+        # number of entries currently held in the map
         return len(self._map)
 
     def pick(self, unicode: int):
-        if not unicode in self._map:
+        # resolve the unicode value to a glyph, then return the value
+        # pick_glyph() gives for it (get_glyph() may return None)
+        glyph = self.get_glyph(unicode=unicode)
+        return self.pick_glyph(glyph)
+
+    def pick_glyph(self, glyph):
+        # Return the value mapped to `glyph`; a truthy glyph that is not mapped yet
+        # first gets the next value that is not in the reserved list.
+        # A falsy glyph is never added: the final lookup is then returned as is.
+        if (glyph) and (glyph not in self._map):
+            # skip over reserved values, dropping the reservations already passed
             while self._next in self._reserved:
                 self._next += 1
                 if self._next > self._reserved[0]:
                     del self._reserved[0]
-
-            self._map[unicode] = self._next
+            self._map[glyph] = self._next
             self._next += 1
 
-        return self._map.get(unicode)
+        return self._map.get(glyph)
 
     def dict(self):
+        # shallow copy, so that callers cannot alter the internal map
         return self._map.copy()
 
+    def get_glyph(self, glyph=None, unicode=None) -> Glyph:
+        """
+        Build the `Glyph` matching the arguments, or return None.
+
+        glyph: a glyph id; when given, `unicode` must be an iterable of the
+            unicode values represented by that glyph
+        unicode: a single unicode value, looked up in `self.font.glyph_ids`
+            when no glyph id is given
+        """
+        # compare with None: 0 is a valid glyph id and must not be
+        # mistaken for "no glyph id given"
+        if glyph is not None:
+            return Glyph(glyph, tuple(unicode))
+        if unicode in self.font.glyph_ids:
+            return Glyph(self.font.glyph_ids[unicode], (unicode,))
+        if unicode == 0x00:
+            # NOTE(review): font.cmap appears to hold unicode values, so cmap[0]
+            # is presumably not a glyph id - confirm
+            return Glyph(self.font.cmap[0], (0x00,))
+        return None
+
+    def get_glyph_by_id(self, cid) -> Glyph:
+        """
+        Return the first glyph of the map whose `glyph_id` equals `cid`,
+        or None when there is no such glyph.
+        """
+        matches = (candidate for candidate in self._map if candidate.glyph_id == cid)
+        return next(matches, None)
+
+    def get_glyph_by_unicode(self, cid) -> Glyph:
+        """
+        Return the first glyph of the map whose first unicode value equals `cid`,
+        or None when there is no such glyph.
+        """
+        matches = (candidate for candidate in self._map if candidate.unicode[0] == cid)
+        return next(matches, None)
+
 
 # Standard fonts
 CORE_FONTS = {

@@ -311,6 +311,7 @@ def __init__(
         self.current_font = (
             None  # current font, None or an instance of CoreFont or TTFFont
         )
+        self.text_shaping = False  # use text shaping engine (harbuzz) or not
         self.draw_color = self.DEFAULT_DRAW_COLOR
         self.fill_color = self.DEFAULT_FILL_COLOR
         self.text_color = self.DEFAULT_TEXT_COLOR
@@ -559,6 +560,12 @@ def set_display_mode(self, zoom, layout="continuous"):
             raise FPDFException(f"Incorrect zoom display mode: {zoom}")
         self.page_layout = LAYOUT_ALIASES.get(layout, layout)
 
+    def set_text_shaping(self, use_shaping_engine):
+        """
+        Enable or disable the text shaping engine used when rendering text.
+
+        Args:
+            use_shaping_engine (bool): True to enable text shaping, False to disable it
+        """
+        self.text_shaping = use_shaping_engine
+
     @property
     def page_layout(self):
         return self._page_layout
@@ -2298,20 +2305,10 @@ def text(self, x, y, txt=""):
         if not self.font_family:
             raise FPDFException("No font set, you need to call set_font() beforehand")
         txt = self.normalize_text(txt)
-        if self.is_ttf_font:
-            txt_mapped = ""
-            for char in txt:
-                uni = ord(char)
-                # Instead of adding the actual character to the stream its code is
-                # mapped to a position in the font's subset
-                txt_mapped += chr(self.current_font.subset.pick(uni))
-            txt2 = escape_parens(txt_mapped.encode("utf-16-be").decode("latin-1"))
-        else:
-            txt2 = escape_parens(txt)
         sl = [f"BT {x * self.k:.2f} {(self.h - y) * self.k:.2f} Td"]
         if self.text_mode != TextMode.FILL:
             sl.append(f" {self.text_mode} Tr {self.line_width:.2f} w")
-        sl.append(f"({txt2}) Tj ET")
+        sl.append(f"{self.current_font.convert_pdf_text(txt)} ET")
         if (self.underline and txt != "") or self._record_text_quad_points:
             w = self.get_string_width(txt, normalized=True, markdown=False)
             if self.underline and txt != "":
@@ -2851,8 +2848,6 @@ def _render_styled_text_line(
             if self.fill_color != self.text_color:
                 sl.append(self.text_color.serialize().lower())
 
-            # do this once in advance
-            u_space = escape_parens(" ".encode("utf-16-be").decode("latin-1"))
             word_spacing = 0
             if text_line.justify:
                 # Don't rely on align==Align.J here.
@@ -2891,43 +2886,15 @@ def _render_styled_text_line(
                 ):
                     current_text_mode = frag.text_mode
                     sl.append(f"{frag.text_mode} Tr {frag.line_width:.2f} w")
-
-                if frag.is_ttf_font:
-                    mapped_text = ""
-                    for char in frag.string:
-                        uni = ord(char)
-                        mapped_text += chr(frag.font.subset.pick(uni))
-                    if word_spacing:
-                        # "Tw" only has an effect on the ASCII space character and ignores
-                        # space characters from unicode (TTF) fonts. As a workaround,
-                        # we do word spacing using an adjustment before each space.
-                        # Determine the index of the space character (" ") in the current
-                        # subset and split words whenever this mapping code is found
-                        words = mapped_text.split(chr(frag.font.subset.pick(ord(" "))))
-                        words_strl = []
-                        for word_i, word in enumerate(words):
-                            # pylint: disable=redefined-loop-name
-                            word = escape_parens(
-                                word.encode("utf-16-be").decode("latin-1")
-                            )
-                            if word_i == 0:
-                                words_strl.append(f"({word})")
-                            else:
-                                adj = -(frag_ws * frag.k) * 1000 / frag.font_size_pt
-                                words_strl.append(f"{adj:.3f}({u_space}{word})")
-                        escaped_text = " ".join(words_strl)
-                        sl.append(f"[{escaped_text}] TJ")
-                    else:
-                        escaped_text = escape_parens(
-                            mapped_text.encode("utf-16-be").decode("latin-1")
-                        )
-                        sl.append(f"({escaped_text}) Tj")
-                else:  # core fonts
-                    if frag_ws != current_ws:
-                        sl.append(f"{frag_ws * frag.k:.3f} Tw")
-                        current_ws = frag_ws
-                    escaped_text = escape_parens(frag.string)
-                    sl.append(f"({escaped_text}) Tj")
+                adjust_x = dx
+                adjust_y = 0.5 * h + 0.3 * max_font_size
+                r_text = frag.render_pdf_text(
+                    frag_ws, current_ws, word_spacing, adjust_x, adjust_y, self
+                )
+                if r_text:
+                    sl.append(r_text)
+                if not frag.is_ttf_font:
+                    current_ws = frag_ws
                 frag_width = frag.get_width(
                     initial_cs=i != 0
                 ) + word_spacing * frag.characters.count(" ")