Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Let string splitters respect East_Asian_Width property #3445

Merged
merged 6 commits into from
Mar 19, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
entry (#3393)
- `with` statements that contain two context managers will be consistently wrapped in
parentheses (#3589)
- Let string splitters respect [East Asian Width](https://www.unicode.org/reports/tr11/)
(#3445)
- Now long string literals can be split after East Asian commas and periods (`、` U+3001
IDEOGRAPHIC COMMA, `。` U+3002 IDEOGRAPHIC FULL STOP, & `,` U+FF0C FULLWIDTH COMMA)
besides before spaces (#3445)

### Configuration

Expand Down
73 changes: 73 additions & 0 deletions scripts/make_width_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""Generates a width table for Unicode characters.

This script generates a width table for Unicode characters that are not
narrow (width 1). The table is written to src/black/_width_table.py (note
that although this file is generated, it is checked into Git) and is used
by the char_width() function in src/black/strings.py.

You should run this script when you upgrade wcwidth, which is expected to
happen when a new Unicode version is released. The generated table contains
the version of wcwidth and Unicode that it was generated for.

In order to run this script, you need to install the latest version of wcwidth.
You can do this by running:

pip install -U wcwidth

"""
import sys
JelleZijlstra marked this conversation as resolved.
Show resolved Hide resolved
from os.path import basename, dirname, join
from typing import Iterable, Tuple

import wcwidth


def make_width_table() -> Iterable[Tuple[int, int, int]]:
start_codepoint = -1
end_codepoint = -1
range_width = -2
for codepoint in range(0, sys.maxunicode + 1):
width = wcwidth.wcwidth(chr(codepoint))
if width <= 1:
# Ignore narrow characters along with zero-width characters so that
# they are treated as single-width. Note that treating zero-width
# characters as single-width is consistent with the heuristics built
# on top of str.isascii() in the str_width() function in strings.py.
continue
if start_codepoint < 0:
start_codepoint = codepoint
range_width = width
elif width != range_width or codepoint != end_codepoint + 1:
yield (start_codepoint, end_codepoint, range_width)
start_codepoint = codepoint
range_width = width
end_codepoint = codepoint
if start_codepoint >= 0:
yield (start_codepoint, end_codepoint, range_width)


def main() -> None:
table_path = join(dirname(__file__), "..", "src", "black", "_width_table.py")
with open(table_path, "w") as f:
f.write(
f"""# Generated by {basename(__file__)}
# wcwidth {wcwidth.__version__}
# Unicode {wcwidth.list_versions()[-1]}
import sys
JelleZijlstra marked this conversation as resolved.
Show resolved Hide resolved
from typing import List, Tuple

if sys.version_info < (3, 8):
from typing_extensions import Final
else:
from typing import Final

WIDTH_TABLE: Final[List[Tuple[int, int, int]]] = [
"""
)
for triple in make_width_table():
f.write(f" {triple!r},\n")
f.write("]\n")


if __name__ == "__main__":
main()
Loading