Let string splitters respect `East_Asian_Width` property (#3445)

author Hong Minhee (洪民憙) <hong@minhee.org>

Sun, 19 Mar 2023 23:09:57 +0000 (08:09 +0900)

committer GitHub <noreply@github.com>

Sun, 19 Mar 2023 23:09:57 +0000 (19:09 -0400)
author Hong Minhee (洪民憙) <hong@minhee.org>
Sun, 19 Mar 2023 23:09:57 +0000 (08:09 +0900)
committer GitHub <noreply@github.com>
Sun, 19 Mar 2023 23:09:57 +0000 (19:09 -0400)
diff --git a/CHANGES.md b/CHANGES.md

index f5c039f6509aa05226890f3152ed9babb26dd8bc..a429e32c8bd65ad53f5c5dc59e993652127c6c63 100644 (file)
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -23,6 +23,11 @@
    compared to their non-async version. (#3609)
  - `with` statements that contain two context managers will be consistently wrapped in
    parentheses (#3589)
+- Let string splitters respect [East Asian Width](https://www.unicode.org/reports/tr11/)
+  (#3445)
+- Now long string literals can be split after East Asian commas and periods (`、` U+3001
+  IDEOGRAPHIC COMMA, `。` U+3002 IDEOGRAPHIC FULL STOP, & `，` U+FF0C FULLWIDTH COMMA)
+  besides before spaces (#3445)
  - For stubs, enforce one blank line after a nested class with a body other than just
    `...` (#3564)
  
diff --git a/scripts/make_width_table.py b/scripts/make_width_table.py

new file mode 100644 (file)

index 0000000..09aca9c
--- /dev/null
+++ b/scripts/make_width_table.py
@@ -0,0 +1,73 @@
+"""Generates a width table for Unicode characters.
+
+This script generates a width table for Unicode characters that are not
+narrow (width 1). The table is written to src/black/_width_table.py (note
+that although this file is generated, it is checked into Git) and is used
+by the char_width() function in src/black/strings.py.
+
+You should run this script when you upgrade wcwidth, which is expected to
+happen when a new Unicode version is released. The generated table contains
+the version of wcwidth and Unicode that it was generated for.
+
+In order to run this script, you need to install the latest version of wcwidth.
+You can do this by running:
+
+    pip install -U wcwidth
+
+"""
+import sys
+from os.path import basename, dirname, join
+from typing import Iterable, Tuple
+
+import wcwidth
+
+
+def make_width_table() -> Iterable[Tuple[int, int, int]]:
+    start_codepoint = -1
+    end_codepoint = -1
+    range_width = -2
+    for codepoint in range(0, sys.maxunicode + 1):
+        width = wcwidth.wcwidth(chr(codepoint))
+        if width <= 1:
+            # Ignore narrow characters along with zero-width characters so that
+            # they are treated as single-width.  Note that treating zero-width
+            # characters as single-width is consistent with the heuristics built
+            # on top of str.isascii() in the str_width() function in strings.py.
+            continue
+        if start_codepoint < 0:
+            start_codepoint = codepoint
+            range_width = width
+        elif width != range_width or codepoint != end_codepoint + 1:
+            yield (start_codepoint, end_codepoint, range_width)
+            start_codepoint = codepoint
+            range_width = width
+        end_codepoint = codepoint
+    if start_codepoint >= 0:
+        yield (start_codepoint, end_codepoint, range_width)
+
+
+def main() -> None:
+    table_path = join(dirname(__file__), "..", "src", "black", "_width_table.py")
+    with open(table_path, "w") as f:
+        f.write(
+            f"""# Generated by {basename(__file__)}
+# wcwidth {wcwidth.__version__}
+# Unicode {wcwidth.list_versions()[-1]}
+import sys
+from typing import List, Tuple
+
+if sys.version_info < (3, 8):
+    from typing_extensions import Final
+else:
+    from typing import Final
+
+WIDTH_TABLE: Final[List[Tuple[int, int, int]]] = [
+"""
+        )
+        for triple in make_width_table():
+            f.write(f"    {triple!r},\n")
+        f.write("]\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/black/_width_table.py b/src/black/_width_table.py

new file mode 100644 (file)

index 0000000..6923f59
--- /dev/null
+++ b/src/black/_width_table.py
@@ -0,0 +1,484 @@
+# Generated by make_width_table.py
+# wcwidth 0.2.6
+# Unicode 15.0.0
+import sys
+from typing import List, Tuple
+
+if sys.version_info < (3, 8):
+    from typing_extensions import Final
+else:
+    from typing import Final
+
+WIDTH_TABLE: Final[List[Tuple[int, int, int]]] = [
+    (0, 0, 0),
+    (1, 31, -1),
+    (127, 159, -1),
+    (768, 879, 0),
+    (1155, 1161, 0),
+    (1425, 1469, 0),
+    (1471, 1471, 0),
+    (1473, 1474, 0),
+    (1476, 1477, 0),
+    (1479, 1479, 0),
+    (1552, 1562, 0),
+    (1611, 1631, 0),
+    (1648, 1648, 0),
+    (1750, 1756, 0),
+    (1759, 1764, 0),
+    (1767, 1768, 0),
+    (1770, 1773, 0),
+    (1809, 1809, 0),
+    (1840, 1866, 0),
+    (1958, 1968, 0),
+    (2027, 2035, 0),
+    (2045, 2045, 0),
+    (2070, 2073, 0),
+    (2075, 2083, 0),
+    (2085, 2087, 0),
+    (2089, 2093, 0),
+    (2137, 2139, 0),
+    (2200, 2207, 0),
+    (2250, 2273, 0),
+    (2275, 2306, 0),
+    (2362, 2362, 0),
+    (2364, 2364, 0),
+    (2369, 2376, 0),
+    (2381, 2381, 0),
+    (2385, 2391, 0),
+    (2402, 2403, 0),
+    (2433, 2433, 0),
+    (2492, 2492, 0),
+    (2497, 2500, 0),
+    (2509, 2509, 0),
+    (2530, 2531, 0),
+    (2558, 2558, 0),
+    (2561, 2562, 0),
+    (2620, 2620, 0),
+    (2625, 2626, 0),
+    (2631, 2632, 0),
+    (2635, 2637, 0),
+    (2641, 2641, 0),
+    (2672, 2673, 0),
+    (2677, 2677, 0),
+    (2689, 2690, 0),
+    (2748, 2748, 0),
+    (2753, 2757, 0),
+    (2759, 2760, 0),
+    (2765, 2765, 0),
+    (2786, 2787, 0),
+    (2810, 2815, 0),
+    (2817, 2817, 0),
+    (2876, 2876, 0),
+    (2879, 2879, 0),
+    (2881, 2884, 0),
+    (2893, 2893, 0),
+    (2901, 2902, 0),
+    (2914, 2915, 0),
+    (2946, 2946, 0),
+    (3008, 3008, 0),
+    (3021, 3021, 0),
+    (3072, 3072, 0),
+    (3076, 3076, 0),
+    (3132, 3132, 0),
+    (3134, 3136, 0),
+    (3142, 3144, 0),
+    (3146, 3149, 0),
+    (3157, 3158, 0),
+    (3170, 3171, 0),
+    (3201, 3201, 0),
+    (3260, 3260, 0),
+    (3263, 3263, 0),
+    (3270, 3270, 0),
+    (3276, 3277, 0),
+    (3298, 3299, 0),
+    (3328, 3329, 0),
+    (3387, 3388, 0),
+    (3393, 3396, 0),
+    (3405, 3405, 0),
+    (3426, 3427, 0),
+    (3457, 3457, 0),
+    (3530, 3530, 0),
+    (3538, 3540, 0),
+    (3542, 3542, 0),
+    (3633, 3633, 0),
+    (3636, 3642, 0),
+    (3655, 3662, 0),
+    (3761, 3761, 0),
+    (3764, 3772, 0),
+    (3784, 3790, 0),
+    (3864, 3865, 0),
+    (3893, 3893, 0),
+    (3895, 3895, 0),
+    (3897, 3897, 0),
+    (3953, 3966, 0),
+    (3968, 3972, 0),
+    (3974, 3975, 0),
+    (3981, 3991, 0),
+    (3993, 4028, 0),
+    (4038, 4038, 0),
+    (4141, 4144, 0),
+    (4146, 4151, 0),
+    (4153, 4154, 0),
+    (4157, 4158, 0),
+    (4184, 4185, 0),
+    (4190, 4192, 0),
+    (4209, 4212, 0),
+    (4226, 4226, 0),
+    (4229, 4230, 0),
+    (4237, 4237, 0),
+    (4253, 4253, 0),
+    (4352, 4447, 2),
+    (4957, 4959, 0),
+    (5906, 5908, 0),
+    (5938, 5939, 0),
+    (5970, 5971, 0),
+    (6002, 6003, 0),
+    (6068, 6069, 0),
+    (6071, 6077, 0),
+    (6086, 6086, 0),
+    (6089, 6099, 0),
+    (6109, 6109, 0),
+    (6155, 6157, 0),
+    (6159, 6159, 0),
+    (6277, 6278, 0),
+    (6313, 6313, 0),
+    (6432, 6434, 0),
+    (6439, 6440, 0),
+    (6450, 6450, 0),
+    (6457, 6459, 0),
+    (6679, 6680, 0),
+    (6683, 6683, 0),
+    (6742, 6742, 0),
+    (6744, 6750, 0),
+    (6752, 6752, 0),
+    (6754, 6754, 0),
+    (6757, 6764, 0),
+    (6771, 6780, 0),
+    (6783, 6783, 0),
+    (6832, 6862, 0),
+    (6912, 6915, 0),
+    (6964, 6964, 0),
+    (6966, 6970, 0),
+    (6972, 6972, 0),
+    (6978, 6978, 0),
+    (7019, 7027, 0),
+    (7040, 7041, 0),
+    (7074, 7077, 0),
+    (7080, 7081, 0),
+    (7083, 7085, 0),
+    (7142, 7142, 0),
+    (7144, 7145, 0),
+    (7149, 7149, 0),
+    (7151, 7153, 0),
+    (7212, 7219, 0),
+    (7222, 7223, 0),
+    (7376, 7378, 0),
+    (7380, 7392, 0),
+    (7394, 7400, 0),
+    (7405, 7405, 0),
+    (7412, 7412, 0),
+    (7416, 7417, 0),
+    (7616, 7679, 0),
+    (8203, 8207, 0),
+    (8232, 8238, 0),
+    (8288, 8291, 0),
+    (8400, 8432, 0),
+    (8986, 8987, 2),
+    (9001, 9002, 2),
+    (9193, 9196, 2),
+    (9200, 9200, 2),
+    (9203, 9203, 2),
+    (9725, 9726, 2),
+    (9748, 9749, 2),
+    (9800, 9811, 2),
+    (9855, 9855, 2),
+    (9875, 9875, 2),
+    (9889, 9889, 2),
+    (9898, 9899, 2),
+    (9917, 9918, 2),
+    (9924, 9925, 2),
+    (9934, 9934, 2),
+    (9940, 9940, 2),
+    (9962, 9962, 2),
+    (9970, 9971, 2),
+    (9973, 9973, 2),
+    (9978, 9978, 2),
+    (9981, 9981, 2),
+    (9989, 9989, 2),
+    (9994, 9995, 2),
+    (10024, 10024, 2),
+    (10060, 10060, 2),
+    (10062, 10062, 2),
+    (10067, 10069, 2),
+    (10071, 10071, 2),
+    (10133, 10135, 2),
+    (10160, 10160, 2),
+    (10175, 10175, 2),
+    (11035, 11036, 2),
+    (11088, 11088, 2),
+    (11093, 11093, 2),
+    (11503, 11505, 0),
+    (11647, 11647, 0),
+    (11744, 11775, 0),
+    (11904, 11929, 2),
+    (11931, 12019, 2),
+    (12032, 12245, 2),
+    (12272, 12283, 2),
+    (12288, 12329, 2),
+    (12330, 12333, 0),
+    (12334, 12350, 2),
+    (12353, 12438, 2),
+    (12441, 12442, 0),
+    (12443, 12543, 2),
+    (12549, 12591, 2),
+    (12593, 12686, 2),
+    (12688, 12771, 2),
+    (12784, 12830, 2),
+    (12832, 12871, 2),
+    (12880, 19903, 2),
+    (19968, 42124, 2),
+    (42128, 42182, 2),
+    (42607, 42610, 0),
+    (42612, 42621, 0),
+    (42654, 42655, 0),
+    (42736, 42737, 0),
+    (43010, 43010, 0),
+    (43014, 43014, 0),
+    (43019, 43019, 0),
+    (43045, 43046, 0),
+    (43052, 43052, 0),
+    (43204, 43205, 0),
+    (43232, 43249, 0),
+    (43263, 43263, 0),
+    (43302, 43309, 0),
+    (43335, 43345, 0),
+    (43360, 43388, 2),
+    (43392, 43394, 0),
+    (43443, 43443, 0),
+    (43446, 43449, 0),
+    (43452, 43453, 0),
+    (43493, 43493, 0),
+    (43561, 43566, 0),
+    (43569, 43570, 0),
+    (43573, 43574, 0),
+    (43587, 43587, 0),
+    (43596, 43596, 0),
+    (43644, 43644, 0),
+    (43696, 43696, 0),
+    (43698, 43700, 0),
+    (43703, 43704, 0),
+    (43710, 43711, 0),
+    (43713, 43713, 0),
+    (43756, 43757, 0),
+    (43766, 43766, 0),
+    (44005, 44005, 0),
+    (44008, 44008, 0),
+    (44013, 44013, 0),
+    (44032, 55203, 2),
+    (63744, 64255, 2),
+    (64286, 64286, 0),
+    (65024, 65039, 0),
+    (65040, 65049, 2),
+    (65056, 65071, 0),
+    (65072, 65106, 2),
+    (65108, 65126, 2),
+    (65128, 65131, 2),
+    (65281, 65376, 2),
+    (65504, 65510, 2),
+    (66045, 66045, 0),
+    (66272, 66272, 0),
+    (66422, 66426, 0),
+    (68097, 68099, 0),
+    (68101, 68102, 0),
+    (68108, 68111, 0),
+    (68152, 68154, 0),
+    (68159, 68159, 0),
+    (68325, 68326, 0),
+    (68900, 68903, 0),
+    (69291, 69292, 0),
+    (69373, 69375, 0),
+    (69446, 69456, 0),
+    (69506, 69509, 0),
+    (69633, 69633, 0),
+    (69688, 69702, 0),
+    (69744, 69744, 0),
+    (69747, 69748, 0),
+    (69759, 69761, 0),
+    (69811, 69814, 0),
+    (69817, 69818, 0),
+    (69826, 69826, 0),
+    (69888, 69890, 0),
+    (69927, 69931, 0),
+    (69933, 69940, 0),
+    (70003, 70003, 0),
+    (70016, 70017, 0),
+    (70070, 70078, 0),
+    (70089, 70092, 0),
+    (70095, 70095, 0),
+    (70191, 70193, 0),
+    (70196, 70196, 0),
+    (70198, 70199, 0),
+    (70206, 70206, 0),
+    (70209, 70209, 0),
+    (70367, 70367, 0),
+    (70371, 70378, 0),
+    (70400, 70401, 0),
+    (70459, 70460, 0),
+    (70464, 70464, 0),
+    (70502, 70508, 0),
+    (70512, 70516, 0),
+    (70712, 70719, 0),
+    (70722, 70724, 0),
+    (70726, 70726, 0),
+    (70750, 70750, 0),
+    (70835, 70840, 0),
+    (70842, 70842, 0),
+    (70847, 70848, 0),
+    (70850, 70851, 0),
+    (71090, 71093, 0),
+    (71100, 71101, 0),
+    (71103, 71104, 0),
+    (71132, 71133, 0),
+    (71219, 71226, 0),
+    (71229, 71229, 0),
+    (71231, 71232, 0),
+    (71339, 71339, 0),
+    (71341, 71341, 0),
+    (71344, 71349, 0),
+    (71351, 71351, 0),
+    (71453, 71455, 0),
+    (71458, 71461, 0),
+    (71463, 71467, 0),
+    (71727, 71735, 0),
+    (71737, 71738, 0),
+    (71995, 71996, 0),
+    (71998, 71998, 0),
+    (72003, 72003, 0),
+    (72148, 72151, 0),
+    (72154, 72155, 0),
+    (72160, 72160, 0),
+    (72193, 72202, 0),
+    (72243, 72248, 0),
+    (72251, 72254, 0),
+    (72263, 72263, 0),
+    (72273, 72278, 0),
+    (72281, 72283, 0),
+    (72330, 72342, 0),
+    (72344, 72345, 0),
+    (72752, 72758, 0),
+    (72760, 72765, 0),
+    (72767, 72767, 0),
+    (72850, 72871, 0),
+    (72874, 72880, 0),
+    (72882, 72883, 0),
+    (72885, 72886, 0),
+    (73009, 73014, 0),
+    (73018, 73018, 0),
+    (73020, 73021, 0),
+    (73023, 73029, 0),
+    (73031, 73031, 0),
+    (73104, 73105, 0),
+    (73109, 73109, 0),
+    (73111, 73111, 0),
+    (73459, 73460, 0),
+    (73472, 73473, 0),
+    (73526, 73530, 0),
+    (73536, 73536, 0),
+    (73538, 73538, 0),
+    (78912, 78912, 0),
+    (78919, 78933, 0),
+    (92912, 92916, 0),
+    (92976, 92982, 0),
+    (94031, 94031, 0),
+    (94095, 94098, 0),
+    (94176, 94179, 2),
+    (94180, 94180, 0),
+    (94192, 94193, 2),
+    (94208, 100343, 2),
+    (100352, 101589, 2),
+    (101632, 101640, 2),
+    (110576, 110579, 2),
+    (110581, 110587, 2),
+    (110589, 110590, 2),
+    (110592, 110882, 2),
+    (110898, 110898, 2),
+    (110928, 110930, 2),
+    (110933, 110933, 2),
+    (110948, 110951, 2),
+    (110960, 111355, 2),
+    (113821, 113822, 0),
+    (118528, 118573, 0),
+    (118576, 118598, 0),
+    (119143, 119145, 0),
+    (119163, 119170, 0),
+    (119173, 119179, 0),
+    (119210, 119213, 0),
+    (119362, 119364, 0),
+    (121344, 121398, 0),
+    (121403, 121452, 0),
+    (121461, 121461, 0),
+    (121476, 121476, 0),
+    (121499, 121503, 0),
+    (121505, 121519, 0),
+    (122880, 122886, 0),
+    (122888, 122904, 0),
+    (122907, 122913, 0),
+    (122915, 122916, 0),
+    (122918, 122922, 0),
+    (123023, 123023, 0),
+    (123184, 123190, 0),
+    (123566, 123566, 0),
+    (123628, 123631, 0),
+    (124140, 124143, 0),
+    (125136, 125142, 0),
+    (125252, 125258, 0),
+    (126980, 126980, 2),
+    (127183, 127183, 2),
+    (127374, 127374, 2),
+    (127377, 127386, 2),
+    (127488, 127490, 2),
+    (127504, 127547, 2),
+    (127552, 127560, 2),
+    (127568, 127569, 2),
+    (127584, 127589, 2),
+    (127744, 127776, 2),
+    (127789, 127797, 2),
+    (127799, 127868, 2),
+    (127870, 127891, 2),
+    (127904, 127946, 2),
+    (127951, 127955, 2),
+    (127968, 127984, 2),
+    (127988, 127988, 2),
+    (127992, 128062, 2),
+    (128064, 128064, 2),
+    (128066, 128252, 2),
+    (128255, 128317, 2),
+    (128331, 128334, 2),
+    (128336, 128359, 2),
+    (128378, 128378, 2),
+    (128405, 128406, 2),
+    (128420, 128420, 2),
+    (128507, 128591, 2),
+    (128640, 128709, 2),
+    (128716, 128716, 2),
+    (128720, 128722, 2),
+    (128725, 128727, 2),
+    (128732, 128735, 2),
+    (128747, 128748, 2),
+    (128756, 128764, 2),
+    (128992, 129003, 2),
+    (129008, 129008, 2),
+    (129292, 129338, 2),
+    (129340, 129349, 2),
+    (129351, 129535, 2),
+    (129648, 129660, 2),
+    (129664, 129672, 2),
+    (129680, 129725, 2),
+    (129727, 129733, 2),
+    (129742, 129755, 2),
+    (129760, 129768, 2),
+    (129776, 129784, 2),
+    (131072, 196605, 2),
+    (196608, 262141, 2),
+    (917760, 917999, 0),
+]
diff --git a/src/black/lines.py b/src/black/lines.py

index fb5933ecbfbf633dc1d701571d8abfa2ee2c6b58..bf4c12cb684dd160748ce5915e1424eda24bdd00 100644 (file)
--- a/src/black/lines.py
+++ b/src/black/lines.py
@@ -33,6 +33,7 @@ from black.nodes import (
      syms,
      whitespace,
  )
+from black.strings import str_width
  from blib2to3.pgen2 import token
  from blib2to3.pytree import Leaf, Node
  
@@ -759,9 +760,11 @@ def is_line_short_enough(  # noqa: C901
      if not line_str:
          line_str = line_to_string(line)
  
+    width = str_width if mode.preview else len
+
      if Preview.multiline_string_handling not in mode:
          return (
-            len(line_str) <= mode.line_length
+            width(line_str) <= mode.line_length
              and "\n" not in line_str  # multiline strings
              and not line.contains_standalone_comments()
          )
@@ -770,10 +773,10 @@ def is_line_short_enough(  # noqa: C901
          return False
      if "\n" not in line_str:
          # No multiline strings (MLS) present
-        return len(line_str) <= mode.line_length
+        return width(line_str) <= mode.line_length
  
      first, *_, last = line_str.split("\n")
-    if len(first) > mode.line_length or len(last) > mode.line_length:
+    if width(first) > mode.line_length or width(last) > mode.line_length:
          return False
  
      # Traverse the AST to examine the context of the multiline string (MLS),
diff --git a/src/black/strings.py b/src/black/strings.py

index 3e3bc12fe728cd4a23f3b19395506e88b109723c..ac18aef51ed7d7349f0f0bbb4e2dc3ce338c763b 100644 (file)
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -14,6 +14,7 @@ if sys.version_info < (3, 8):
  else:
      from typing import Final
  
+from black._width_table import WIDTH_TABLE
  
  STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
  STRING_PREFIX_RE: Final = re.compile(
@@ -278,3 +279,57 @@ def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
              return back_slashes + "N{" + groups["N"].upper() + "}"
  
      leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)
+
+
+@lru_cache(maxsize=4096)
+def char_width(char: str) -> int:
+    """Return the width of a single character as it would be displayed in a
+    terminal or editor (which respects Unicode East Asian Width).
+
+    Full width characters are counted as 2, while half width characters are
+    counted as 1.  Also control characters are counted as 0.
+    """
+    table = WIDTH_TABLE
+    codepoint = ord(char)
+    highest = len(table) - 1
+    lowest = 0
+    idx = highest // 2
+    while True:
+        start_codepoint, end_codepoint, width = table[idx]
+        if codepoint < start_codepoint:
+            highest = idx - 1
+        elif codepoint > end_codepoint:
+            lowest = idx + 1
+        else:
+            return 0 if width < 0 else width
+        if highest < lowest:
+            break
+        idx = (highest + lowest) // 2
+    return 1
+
+
+def str_width(line_str: str) -> int:
+    """Return the width of `line_str` as it would be displayed in a terminal
+    or editor (which respects Unicode East Asian Width).
+
+    You could utilize this function to determine, for example, if a string
+    is too wide to display in a terminal or editor.
+    """
+    if line_str.isascii():
+        # Fast path for a line consisting of only ASCII characters
+        return len(line_str)
+    return sum(map(char_width, line_str))
+
+
+def count_chars_in_width(line_str: str, max_width: int) -> int:
+    """Count the number of characters in `line_str` that would fit in a
+    terminal or editor of `max_width` (which respects Unicode East Asian
+    Width).
+    """
+    total_width = 0
+    for i, char in enumerate(line_str):
+        width = char_width(char)
+        if width + total_width > max_width:
+            return i
+        total_width += width
+    return len(line_str)
diff --git a/src/black/trans.py b/src/black/trans.py

index a6a416e71bc4c4e484e1a74c0949341e5908d67a..95695f32b14573138f1ccfdd02d2ce1dc3cb9d6b 100644 (file)
--- a/src/black/trans.py
+++ b/src/black/trans.py
@@ -48,9 +48,11 @@ from black.nodes import (
  from black.rusty import Err, Ok, Result
  from black.strings import (
      assert_is_leaf_string,
+    count_chars_in_width,
      get_string_prefix,
      has_triple_quotes,
      normalize_string_quotes,
+    str_width,
  )
  from blib2to3.pgen2 import token
  from blib2to3.pytree import Leaf, Node
@@ -71,6 +73,8 @@ StringID = int
  TResult = Result[T, CannotTransform]  # (T)ransform Result
  TMatchResult = TResult[List[Index]]
  
+SPLIT_SAFE_CHARS = frozenset(["\u3001", "\u3002", "\uff0c"])  # East Asian stops
+
  
  def TErr(err_msg: str) -> Err[CannotTransform]:
      """(T)ransform Err
@@ -1164,7 +1168,7 @@ class BaseStringSplitter(StringTransformer):
              # WMA4 the length of the inline comment.
              offset += len(comment_leaf.value)
  
-        max_string_length = self.line_length - offset
+        max_string_length = count_chars_in_width(str(line), self.line_length - offset)
          return max_string_length
  
      @staticmethod
@@ -1419,11 +1423,13 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
              is_valid_index(string_idx + 1) and LL[string_idx + 1].type == token.COMMA
          )
  
-        def max_last_string() -> int:
+        def max_last_string_column() -> int:
              """
              Returns:
-                The max allowed length of the string value used for the last
-                line we will construct.
+                The max allowed width of the string value used for the last
+                line we will construct.  Note that this value means the width
+                rather than the number of characters (e.g., many East Asian
+                characters expand to two columns).
              """
              result = self.line_length
              result -= line.depth * 4
@@ -1431,14 +1437,14 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
              result -= string_op_leaves_length
              return result
  
-        # --- Calculate Max Break Index (for string value)
+        # --- Calculate Max Break Width (for string value)
          # We start with the line length limit
-        max_break_idx = self.line_length
+        max_break_width = self.line_length
          # The last index of a string of length N is N-1.
-        max_break_idx -= 1
+        max_break_width -= 1
          # Leading whitespace is not present in the string value (e.g. Leaf.value).
-        max_break_idx -= line.depth * 4
-        if max_break_idx < 0:
+        max_break_width -= line.depth * 4
+        if max_break_width < 0:
              yield TErr(
                  f"Unable to split {LL[string_idx].value} at such high of a line depth:"
                  f" {line.depth}"
@@ -1451,7 +1457,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
          # line limit.
          use_custom_breakpoints = bool(
              custom_splits
-            and all(csplit.break_idx <= max_break_idx for csplit in custom_splits)
+            and all(csplit.break_idx <= max_break_width for csplit in custom_splits)
          )
  
          # Temporary storage for the remaining chunk of the string line that
@@ -1467,7 +1473,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
              if use_custom_breakpoints:
                  return len(custom_splits) > 1
              else:
-                return len(rest_value) > max_last_string()
+                return str_width(rest_value) > max_last_string_column()
  
          string_line_results: List[Ok[Line]] = []
          while more_splits_should_be_made():
@@ -1477,7 +1483,10 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
                  break_idx = csplit.break_idx
              else:
                  # Algorithmic Split (automatic)
-                max_bidx = max_break_idx - string_op_leaves_length
+                max_bidx = (
+                    count_chars_in_width(rest_value, max_break_width)
+                    - string_op_leaves_length
+                )
                  maybe_break_idx = self._get_break_idx(rest_value, max_bidx)
                  if maybe_break_idx is None:
                      # If we are unable to algorithmically determine a good split
@@ -1574,7 +1583,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
  
              # Try to fit them all on the same line with the last substring...
              if (
-                len(temp_value) <= max_last_string()
+                str_width(temp_value) <= max_last_string_column()
                  or LL[string_idx + 1].type == token.COMMA
              ):
                  last_line.append(rest_leaf)
@@ -1694,6 +1703,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
                  section of this classes' docstring would be be met by returning @i.
              """
              is_space = string[i] == " "
+            is_split_safe = is_valid_index(i - 1) and string[i - 1] in SPLIT_SAFE_CHARS
  
              is_not_escaped = True
              j = i - 1
@@ -1706,7 +1716,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
                  and len(string[:i]) >= self.MIN_SUBSTR_SIZE
              )
              return (
-                is_space
+                (is_space or is_split_safe)
                  and is_not_escaped
                  and is_big_enough
                  and not breaks_unsplittable_expression(i)
@@ -1851,11 +1861,13 @@ class StringParenWrapper(BaseStringSplitter, CustomSplitMapMixin):
  
          if string_idx is not None:
              string_value = line.leaves[string_idx].value
-            # If the string has no spaces...
-            if " " not in string_value:
+            # If the string has neither spaces nor East Asian stops...
+            if not any(
+                char == " " or char in SPLIT_SAFE_CHARS for char in string_value
+            ):
                  # And will still violate the line length limit when split...
-                max_string_length = self.line_length - ((line.depth + 1) * 4)
-                if len(string_value) > max_string_length:
+                max_string_width = self.line_length - ((line.depth + 1) * 4)
+                if str_width(string_value) > max_string_width:
                      # And has no associated custom splits...
                      if not self.has_custom_splits(string_value):
                          # Then we should NOT put this string on its own line.
diff --git a/tests/data/preview/long_strings__east_asian_width.py b/tests/data/preview/long_strings__east_asian_width.py

new file mode 100644 (file)

index 0000000..fb66a78
--- /dev/null
+++ b/tests/data/preview/long_strings__east_asian_width.py
@@ -0,0 +1,25 @@
+# The following strings do not have not-so-many chars, but are long enough\r
+# when these are rendered in a monospace font (if the renderer respects\r
+# Unicode East Asian Width properties).\r
+hangul = '코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이 필요한 문자열'\r
+hanzi = '中文測試：代碼點數量少，但在真正的終端模擬器或編輯器中呈現時太長，因此需要換行的字符串。'\r
+japanese = 'コードポイントの数は少ないが、実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、改行が要る文字列'\r
+\r
+# output\r
+\r
+# The following strings do not have not-so-many chars, but are long enough\r
+# when these are rendered in a monospace font (if the renderer respects\r
+# Unicode East Asian Width properties).\r
+hangul = (\r
+    "코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이"\r
+    " 필요한 문자열"\r
+)\r
+hanzi = (\r
+    "中文測試：代碼點數量少，但在真正的終端模擬器或編輯器中呈現時太長，"\r
+    "因此需要換行的字符串。"\r
+)\r
+japanese = (\r
+    "コードポイントの数は少ないが、"\r
+    "実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、"\r
+    "改行が要る文字列"\r
+)\r
author	Hong Minhee (洪民憙) <hong@minhee.org>
	Sun, 19 Mar 2023 23:09:57 +0000 (08:09 +0900)
committer	GitHub <noreply@github.com>
	Sun, 19 Mar 2023 23:09:57 +0000 (19:09 -0400)
CHANGES.md		patch | blob | history
scripts/make_width_table.py	[new file with mode: 0644]	patch | blob
src/black/_width_table.py	[new file with mode: 0644]	patch | blob
src/black/lines.py		patch | blob | history
src/black/strings.py		patch | blob | history
src/black/trans.py		patch | blob | history
tests/data/preview/long_strings__east_asian_width.py	[new file with mode: 0644]	patch | blob