Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Text shaping #820

Merged
merged 40 commits into from
Aug 2, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
331f6a0
draft of text shaping
andersonhc Jun 14, 2023
9a8c68a
Merge branch 'PyFPDF:master' into master
andersonhc Jun 14, 2023
df321a1
Fix font fallback, missing glyphs, and descriptor
andersonhc Jun 16, 2023
9701b32
fix pylint alarms
andersonhc Jun 16, 2023
c04caee
add uharfbuzz on test/requirements.txt
andersonhc Jun 16, 2023
8118d7f
disable pylint check on a function
andersonhc Jun 16, 2023
431aeea
regenerate reference PDFs - requirements.txt chang
andersonhc Jun 16, 2023
be06b8e
implement char stretching, char spacing
andersonhc Jun 17, 2023
de3e4ed
Update test_text_shaping.py
andersonhc Jun 17, 2023
714b7c7
fix not calling render with length 0
andersonhc Jun 18, 2023
cd2533e
Merge branch 'master' of https://github.com/andersonhc/fpdf2
andersonhc Jun 18, 2023
40721a5
fix glyph with different widths
andersonhc Jun 19, 2023
a84dbbb
Merge branch 'PyFPDF:master' into master
andersonhc Jun 21, 2023
9e4d0aa
add documentation and parameters
andersonhc Jun 24, 2023
9bc083c
add test for features
andersonhc Jun 28, 2023
23eadff
format with black
andersonhc Jun 28, 2023
ae5f9c3
remove space between ligature codes to please Vera
andersonhc Jun 29, 2023
23f68d6
fix surogate pair bug
andersonhc Jun 29, 2023
eb19226
fix test with bad surrogate pair
andersonhc Jun 29, 2023
cc29efe
update docs, changelog and vera ignore codes
andersonhc Jun 29, 2023
50d1b03
Update CHANGELOG.md
andersonhc Jun 29, 2023
3cac887
Create close() method on TTFFont class
andersonhc Jul 24, 2023
a3350af
Merge branch 'master' into master
andersonhc Jul 24, 2023
93f3a2d
add __repr__
andersonhc Jul 24, 2023
99be58b
check if "meta" table exclusion is breaking tests
andersonhc Jul 24, 2023
f0f1ab8
reinstate meta exclusion
andersonhc Jul 24, 2023
b3c6802
test if new fonttools is causing the otf problem
andersonhc Jul 24, 2023
251732d
.
andersonhc Jul 24, 2023
1900a01
'rebase'
Jul 25, 2023
32b4c57
undo setup.cfg change
andersonhc Jul 25, 2023
2c02bcb
update tests for new fonttools
andersonhc Jul 25, 2023
62f6ea5
-a
Jul 26, 2023
c383483
Merge branch 'PyFPDF-master'
Jul 26, 2023
29d01fa
update test file
andersonhc Jul 26, 2023
04a1807
Merge pull request #4 from PyFPDF/master
andersonhc Aug 2, 2023
ac53867
Update docs/TextShaping.md
andersonhc Aug 2, 2023
f42ee3e
Update fpdf/fonts.py
andersonhc Aug 2, 2023
64d5481
Update fpdf/fonts.py
andersonhc Aug 2, 2023
acdc78e
implement reviewer suggestions
andersonhc Aug 2, 2023
5bb3c5e
Merge branch 'master' of https://github.com/andersonhc/fpdf2
andersonhc Aug 2, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 146 additions & 19 deletions fpdf/fonts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,20 @@

from collections import defaultdict
from dataclasses import dataclass, replace
from typing import List, Optional, Union
from typing import List, Optional, Tuple, Union

from fontTools import ttLib

# uharfbuzz is an optional dependency: it is only needed for text shaping,
# so a failed import must not prevent this module from loading.
# Only ImportError is caught, so that unrelated failures are not hidden.
try:
    import uharfbuzz as hb
except ImportError:
    hb = None

from .drawing import DeviceGray, DeviceRGB
from .enums import FontDescriptorFlags, TextEmphasis
from .errors import FPDFException
from .syntax import Name, PDFObject
from .util import escape_parens


@dataclass
Expand Down Expand Up @@ -61,19 +68,28 @@ def __init__(self, fpdf, fontkey, style):
self.fontkey = fontkey
self.emphasis = TextEmphasis.coerce(style)

def get_text_width(self, text, font_size_pt):
    """
    Measure `text` using the per-character widths stored in `self.cw`.

    Returns a `(character_count, width)` tuple: the width is the sum of the
    `self.cw` entry of every character, multiplied by `font_size_pt` and by 0.001.
    """
    total_cw = sum(self.cw[char] for char in text)
    return (len(text), total_cw * font_size_pt * 0.001)

def convert_pdf_text(self, text):
    """
    Build the PDF `Tj` operator for `text`: the string is passed through
    `escape_parens()` and wrapped in parentheses.
    """
    escaped = escape_parens(text)
    return f"({escaped}) Tj"


class TTFFont:
__slots__ = (
"i",
"type",
"name",
"desc",
"glyph_ids",
"hbfont",
"up",
"ut",
"cw",
"ttffile",
"fontkey",
"emphasis",
"scale",
"subset",
"cmap",
)
Expand All @@ -86,8 +102,8 @@ def __init__(self, fpdf, font_file_path, fontkey, style):

font = ttLib.TTFont(self.ttffile, fontNumber=0, lazy=True)

scale = 1000 / font["head"].unitsPerEm
default_width = round(scale * font["hmtx"].metrics[".notdef"][0])
self.scale = 1000 / font["head"].unitsPerEm
default_width = round(self.scale * font["hmtx"].metrics[".notdef"][0])

try:
cap_height = font["OS/2"].sCapHeight
Expand All @@ -104,13 +120,13 @@ def __init__(self, fpdf, font_file_path, fontkey, style):
flags |= FontDescriptorFlags.FORCE_BOLD

self.desc = PDFFontDescriptor(
ascent=round(font["hhea"].ascent * scale),
descent=round(font["hhea"].descent * scale),
cap_height=round(cap_height * scale),
ascent=round(font["hhea"].ascent * self.scale),
descent=round(font["hhea"].descent * self.scale),
cap_height=round(cap_height * self.scale),
flags=flags,
font_b_box=(
f"[{font['head'].xMin * scale:.0f} {font['head'].yMin * scale:.0f}"
f" {font['head'].xMax * scale:.0f} {font['head'].yMax * scale:.0f}]"
f"[{font['head'].xMin * self.scale:.0f} {font['head'].yMin * self.scale:.0f}"
f" {font['head'].xMax * self.scale:.0f} {font['head'].yMax * self.scale:.0f}]"
),
italic_angle=int(font["post"].italicAngle),
stem_v=round(50 + int(pow((font["OS/2"].usWeightClass / 65), 2))),
Expand All @@ -119,7 +135,16 @@ def __init__(self, fpdf, font_file_path, fontkey, style):

# a map unicode_char -> char_width
self.cw = defaultdict(lambda: default_width)

# fontTools' best cmap maps each unicode character to a glyph name;
# we keep only its keys, as a tuple of
# the unicode characters available in the font
self.cmap = tuple(font.getBestCmap().keys())

# map each unicode character to its glyph id, so that the font
# can be subset by unicode (regular text) or by glyph
# (text shaped with HarfBuzz)
self.glyph_ids = {}
for char in self.cmap:
# take glyph associated to char
glyph = font.getBestCmap()[char]
Expand All @@ -131,20 +156,81 @@ def __init__(self, fpdf, font_file_path, fontkey, style):
if w == 65535:
w = 0

self.cw[char] = round(scale * w + 0.001) # ROUND_HALF_UP
self.cw[char] = round(self.scale * w + 0.001) # ROUND_HALF_UP

self.glyph_ids[char] = font.getGlyphID(glyph)

# include numbers in the subset! (if alias present)
# ensure that alias is mapped 1-by-1 additionally (must be replaceable)
sbarr = "\x00 "
sbarr = "\x00 \n\r"
if fpdf.str_alias_nb_pages:
sbarr += "0123456789"
sbarr += fpdf.str_alias_nb_pages

self.name = re.sub("[ ()]", "", font["name"].getBestFullName())
self.up = round(font["post"].underlinePosition * scale)
self.ut = round(font["post"].underlineThickness * scale)
self.up = round(font["post"].underlinePosition * self.scale)
self.ut = round(font["post"].underlineThickness * self.scale)
self.emphasis = TextEmphasis.coerce(style)
self.subset = SubsetMap([ord(char) for char in sbarr])
self.subset = SubsetMap(self, [ord(char) for char in sbarr])

def get_text_width(self, text, font_size_pt, text_shaping):
    """
    Measure `text`, with or without the text shaping engine.

    When `text_shaping` is truthy the work is delegated to
    `shaped_text_width()`. Otherwise a `(character_count, width)` tuple is
    returned, the width being the sum of the `self.cw` entry of every
    character code, multiplied by `font_size_pt` and by 0.001.
    """
    if not text_shaping:
        total_cw = sum(self.cw[ord(char)] for char in text)
        return (len(text), total_cw * font_size_pt * 0.001)
    return self.shaped_text_width(text, font_size_pt)

def shaped_text_width(self, text, font_size_pt):
    """
    Shape `text` with HarfBuzz and measure the result.

    Returns:
        A `(glyph_count, width)` tuple, where the width is the sum of the
        `x_advance` and `x_offset` of every glyph position, as reported
        by HarfBuzz.

    Raises:
        FPDFException: if uharfbuzz could not be imported.
    """
    if hb is None:
        raise FPDFException(
            'uharfbuzz wasn\'t imported. Use "pip install uharfbuzz"'
        )
    # The HarfBuzz font is created lazily on first use, then reused
    if not hasattr(self, "hbfont"):
        self.hbfont = hb.Font(hb.Face(hb.Blob.from_file_path(self.ttffile)))
    self.hbfont.ptem = font_size_pt
    buf = hb.Buffer()
    # join() allows `text` to be a string or a sequence of strings
    buf.add_str("".join(text))
    buf.guess_segment_properties()
    features = {"kern": True, "liga": True}
    hb.shape(self.hbfont, buf, features)
    positions = buf.glyph_positions
    # NOTE(review): HarfBuzz documents x_offset as a drawing displacement that
    # does not affect the line advance - confirm it should be counted here.
    text_width = sum(pos.x_advance + pos.x_offset for pos in positions)
    return (len(positions), text_width)

def convert_pdf_text(self, text):
    """
    Build the PDF `Tj` operator for `text`.

    Every character is replaced by the character whose code is the value
    returned by `self.subset.pick()`; the result is encoded as UTF-16-BE,
    escaped with `escape_parens()` and wrapped in parentheses.
    """
    # Instead of adding the actual characters to the stream, each code is
    # mapped to a position in the font's subset
    mapped = "".join(chr(self.subset.pick(ord(char))) for char in text)
    escaped = escape_parens(mapped.encode("utf-16-be").decode("latin-1"))
    return f"({escaped}) Tj"

def shape_text(self, text, font_size_pt):
    """
    Shape `text` with HarfBuzz and register the resulting glyphs in the subset.

    Returns:
        An iterator of `(subset_code, glyph_position)` pairs, one per shaped
        glyph: `subset_code` is the value returned by
        `self.subset.pick_glyph()` and `glyph_position` is the matching
        entry of the HarfBuzz buffer.

    Raises:
        FPDFException: if uharfbuzz could not be imported.
    """
    if hb is None:
        raise FPDFException(
            'uharfbuzz wasn\'t imported. Use "pip install uharfbuzz"'
        )
    # The HarfBuzz font is created lazily on first use, then reused
    if not hasattr(self, "hbfont"):
        self.hbfont = hb.Font(hb.Face(hb.Blob.from_file_path(self.ttffile)))
    self.hbfont.ptem = font_size_pt
    buf = hb.Buffer()
    buf.add_str(text)
    buf.guess_segment_properties()
    features = {"kern": True, "liga": True}
    hb.shape(self.hbfont, buf, features)
    # TODO: find cluster gaps
    # Ex: text = "ABCD"
    # glyph infos have clusters 0, 2, 3 - it means A and B are together on the
    # first glyph (ligature or substitution) - the glyph should have both
    # unicodes and it should be translated properly on the CID to GID mapping
    char_mapped = []
    for info in buf.glyph_infos:
        # After shaping, `codepoint` holds a glyph id. The cluster value is
        # used as an index into `text`, so only one character of the
        # cluster is recorded (see TODO above).
        glyph = self.subset.get_glyph(
            glyph=info.codepoint, unicode=[ord(text[info.cluster])]
        )
        char_mapped.append(self.subset.pick_glyph(glyph))
    return zip(char_mapped, buf.glyph_positions)


class PDFFontDescriptor(PDFObject):
Expand Down Expand Up @@ -172,6 +258,18 @@ def __init__(
self.font_name = None


@dataclass(order=True, frozen=True)
class Glyph:
    """
    A single glyph of a font.

    `unicode` is a tuple rather than a single value because ligatures and
    character substitutions can map a sequence of unicode characters
    to one glyph.
    """

    # id of the glyph in the font
    glyph_id: int
    # unicode character(s) represented by this glyph
    unicode: Tuple


class SubsetMap:
"""Holds a mapping of used characters and their position in the font's subset

Expand All @@ -183,35 +281,64 @@ class SubsetMap:
the lowest possible representation.
"""

def __init__(self, identities: List[int]):
def __init__(self, font: TTFFont, identities: List[int]):
super().__init__()
self._next = 0
self.font = font

# sort list to ease deletion once _next
# becomes higher than first reservation
self._reserved = sorted(identities)

# int(x) to ensure values are integers
self._map = {x: int(x) for x in self._reserved}
self._map = {}
for x in self._reserved:
glyph = self.get_glyph(unicode=x)
if glyph:
self._map[glyph] = int(x)

def __len__(self):
return len(self._map)

def pick(self, unicode: int):
if not unicode in self._map:
glyph = self.get_glyph(unicode=unicode)
return self.pick_glyph(glyph)

def pick_glyph(self, glyph):
if (glyph) and (glyph not in self._map):
while self._next in self._reserved:
self._next += 1
if self._next > self._reserved[0]:
del self._reserved[0]

self._map[unicode] = self._next
self._map[glyph] = self._next
self._next += 1

return self._map.get(unicode)
return self._map.get(glyph)

def dict(self):
return self._map.copy()

def get_glyph(self, glyph=None, unicode=None) -> Optional[Glyph]:
    """
    Build the `Glyph` matching a glyph id or a unicode code point.

    Args:
        glyph: glyph id, used when given together with a sequence of
            code points in `unicode`.
        unicode: a single code point (int) to look up in
            `self.font.glyph_ids`, or a sequence of code points
            when `glyph` is given.

    Returns:
        A `Glyph`, or None when no glyph could be determined.
    """
    # Test against None instead of relying on truthiness: 0 is a possible
    # glyph id, and a falsy check would send it, together with its
    # (unhashable) list of code points, to the dict lookup below.
    if glyph is not None and unicode is not None and not isinstance(unicode, int):
        return Glyph(glyph, tuple(unicode))
    if unicode in self.font.glyph_ids:
        return Glyph(self.font.glyph_ids[unicode], (unicode,))
    if unicode == 0x00:
        # NOTE(review): the first entry of `font.cmap` is stored as glyph id
        # for the NUL character - confirm this is the intended glyph.
        return Glyph(self.font.cmap[0], (0x00,))
    return None

def get_glyph_by_id(self, cid) -> Glyph:
    """
    Return the first glyph registered in this subset whose `glyph_id`
    equals `cid`, or None if there is none.
    """
    matches = (glyph for glyph in self._map if glyph.glyph_id == cid)
    return next(matches, None)

def get_glyph_by_unicode(self, cid) -> Glyph:
    """
    Return the first glyph registered in this subset whose first unicode
    value equals `cid`, or None if there is none.
    """
    matches = (glyph for glyph in self._map if glyph.unicode[0] == cid)
    return next(matches, None)


# Standard fonts
CORE_FONTS = {
Expand Down
67 changes: 17 additions & 50 deletions fpdf/fpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ def __init__(
self.current_font = (
None # current font, None or an instance of CoreFont or TTFFont
)
self.text_shaping = False # use text shaping engine (harbuzz) or not
self.draw_color = self.DEFAULT_DRAW_COLOR
self.fill_color = self.DEFAULT_FILL_COLOR
self.text_color = self.DEFAULT_TEXT_COLOR
Expand Down Expand Up @@ -559,6 +560,12 @@ def set_display_mode(self, zoom, layout="continuous"):
raise FPDFException(f"Incorrect zoom display mode: {zoom}")
self.page_layout = LAYOUT_ALIASES.get(layout, layout)

def set_text_shaping(self, use_shaping_engine):
    """
    Enable or disable the text shaping engine used when rendering text.

    Args:
        use_shaping_engine: True to enable text shaping, False to disable it.
            The value is stored as-is in `self.text_shaping`.
    """
    self.text_shaping = use_shaping_engine
andersonhc marked this conversation as resolved.
Show resolved Hide resolved

@property
def page_layout(self):
return self._page_layout
Expand Down Expand Up @@ -2298,20 +2305,10 @@ def text(self, x, y, txt=""):
if not self.font_family:
raise FPDFException("No font set, you need to call set_font() beforehand")
txt = self.normalize_text(txt)
if self.is_ttf_font:
txt_mapped = ""
for char in txt:
uni = ord(char)
# Instead of adding the actual character to the stream its code is
# mapped to a position in the font's subset
txt_mapped += chr(self.current_font.subset.pick(uni))
txt2 = escape_parens(txt_mapped.encode("utf-16-be").decode("latin-1"))
else:
txt2 = escape_parens(txt)
sl = [f"BT {x * self.k:.2f} {(self.h - y) * self.k:.2f} Td"]
if self.text_mode != TextMode.FILL:
sl.append(f" {self.text_mode} Tr {self.line_width:.2f} w")
sl.append(f"({txt2}) Tj ET")
sl.append(f"{self.current_font.convert_pdf_text(txt)} ET")
if (self.underline and txt != "") or self._record_text_quad_points:
w = self.get_string_width(txt, normalized=True, markdown=False)
if self.underline and txt != "":
Expand Down Expand Up @@ -2851,8 +2848,6 @@ def _render_styled_text_line(
if self.fill_color != self.text_color:
sl.append(self.text_color.serialize().lower())

# do this once in advance
u_space = escape_parens(" ".encode("utf-16-be").decode("latin-1"))
word_spacing = 0
if text_line.justify:
# Don't rely on align==Align.J here.
Expand Down Expand Up @@ -2891,43 +2886,15 @@ def _render_styled_text_line(
):
current_text_mode = frag.text_mode
sl.append(f"{frag.text_mode} Tr {frag.line_width:.2f} w")

if frag.is_ttf_font:
mapped_text = ""
for char in frag.string:
uni = ord(char)
mapped_text += chr(frag.font.subset.pick(uni))
if word_spacing:
# "Tw" only has an effect on the ASCII space character and ignores
# space characters from unicode (TTF) fonts. As a workaround,
# we do word spacing using an adjustment before each space.
# Determine the index of the space character (" ") in the current
# subset and split words whenever this mapping code is found
words = mapped_text.split(chr(frag.font.subset.pick(ord(" "))))
words_strl = []
for word_i, word in enumerate(words):
# pylint: disable=redefined-loop-name
word = escape_parens(
word.encode("utf-16-be").decode("latin-1")
)
if word_i == 0:
words_strl.append(f"({word})")
else:
adj = -(frag_ws * frag.k) * 1000 / frag.font_size_pt
words_strl.append(f"{adj:.3f}({u_space}{word})")
escaped_text = " ".join(words_strl)
sl.append(f"[{escaped_text}] TJ")
else:
escaped_text = escape_parens(
mapped_text.encode("utf-16-be").decode("latin-1")
)
sl.append(f"({escaped_text}) Tj")
else: # core fonts
if frag_ws != current_ws:
sl.append(f"{frag_ws * frag.k:.3f} Tw")
current_ws = frag_ws
escaped_text = escape_parens(frag.string)
sl.append(f"({escaped_text}) Tj")
adjust_x = dx
adjust_y = 0.5 * h + 0.3 * max_font_size
r_text = frag.render_pdf_text(
frag_ws, current_ws, word_spacing, adjust_x, adjust_y, self
)
if r_text:
sl.append(r_text)
if not frag.is_ttf_font:
current_ws = frag_ws
frag_width = frag.get_width(
initial_cs=i != 0
) + word_spacing * frag.characters.count(" ")
Expand Down
Loading