Simple formatting on strings. Further string formatting code is in trans.py.
"""
-import regex as re
+import re
import sys
-from typing import List, Pattern
-
-
-STRING_PREFIX_CHARS = "furbFURB" # All possible string prefix characters.
+from functools import lru_cache
+from typing import Final, List, Match, Pattern
+
+from black._width_table import WIDTH_TABLE
+from blib2to3.pytree import Leaf
+
+STRING_PREFIX_CHARS: Final = "furbFURB" # All possible string prefix characters.
+STRING_PREFIX_RE: Final = re.compile(
+ r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
+)
+FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
+UNICODE_ESCAPE_RE: Final = re.compile(
+ r"(?P<backslashes>\\+)(?P<body>"
+ r"(u(?P<u>[a-fA-F0-9]{4}))" # Character with 16-bit hex value xxxx
+ r"|(U(?P<U>[a-fA-F0-9]{8}))" # Character with 32-bit hex value xxxxxxxx
+ r"|(x(?P<x>[a-fA-F0-9]{2}))" # Character with hex value hh
+ r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})" # Character named name in the Unicode database
+ r")",
+ re.VERBOSE,
+)
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
for line in s.splitlines():
# Find the index of the first non-whitespace character after a string of
# whitespace that includes at least one tab
- match = re.match(r"\s*\t+\s*(\S)", line)
+ match = FIRST_NON_WHITESPACE_RE.match(line)
if match:
first_non_whitespace_idx = match.start(1)
), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
def normalize_string_prefix(s: str) -> str:
    """Return the string literal `s` with its prefix normalized.

    "F" and "B" are lowercased and every "u"/"U" is removed. The case of an
    "r"/"R" is kept as written, but in a two-character prefix it is moved to
    the front. Everything after the prefix is returned unchanged.
    """
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    orig_prefix = match.group(1)
    new_prefix = (
        orig_prefix.replace("F", "f")
        .replace("B", "b")
        .replace("U", "")
        .replace("u", "")
    )

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and new_prefix[0].lower() != "r":
        new_prefix = new_prefix[::-1]
    return f"{new_prefix}{match.group(2)}"
+# Re(gex) does actually cache patterns internally but this still improves
+# performance on a long list literal of strings by 5-9% since lru_cache's
+# caching overhead is much lower.
+@lru_cache(maxsize=64)
+def _cached_compile(pattern: str) -> Pattern[str]:
+ return re.compile(pattern)
+
+
def normalize_string_quotes(s: str) -> str:
"""Prefer double quotes but only if it doesn't cause more escaping.
return s # There's an internal error
prefix = s[:first_quote_pos]
- unescaped_new_quote = re.compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
- escaped_new_quote = re.compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
- escaped_orig_quote = re.compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
+ unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
+ escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
+ escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
if "r" in prefix.casefold():
if unescaped_new_quote.search(body):
return s # Prefer double quotes
return f"{prefix}{new_quote}{new_body}{new_quote}"
+
+
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Normalize the case of escape sequences in the string literal `leaf`.

    Hex digits of `\\x`, `\\u` and `\\U` escapes are lowercased and the name
    inside `\\N{...}` is uppercased. `leaf.value` is updated in place.

    Raw literals are left untouched. In bytes literals only `\\x` is an
    escape sequence, so `\\u`, `\\U` and `\\N{...}` are left as written there:
    they are literal characters and recasing them would change the value.
    """
    text = leaf.value
    prefix = get_string_prefix(text).lower()
    if "r" in prefix:
        return
    is_bytes = "b" in prefix

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        # An even run of backslashes consists solely of escaped backslashes,
        # so what follows is plain text rather than an escape sequence.
        if len(back_slashes) % 2 == 0:
            return back_slashes + groups["body"]

        if groups["x"]:
            # \x
            return back_slashes + "x" + groups["x"].lower()
        if is_bytes:
            # \u, \U and \N{} are not escapes in bytes literals
            return back_slashes + groups["body"]
        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        if groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        assert groups["N"], f"Unexpected match: {m}"
        # \N{}
        return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = UNICODE_ESCAPE_RE.sub(replace, text)
+
+
@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return the width of a single character as it would be displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Full width characters are counted as 2, while half width characters are
    counted as 1. Also control characters are counted as 0.

    The width is looked up by binary search over the
    `(start_codepoint, end_codepoint, width)` entries of WIDTH_TABLE, which
    relies on the table being ordered by codepoint. A negative table width is
    reported as 0, and a codepoint covered by no entry has width 1.
    """
    codepoint = ord(char)
    lowest = 0
    highest = len(WIDTH_TABLE) - 1
    while lowest <= highest:
        idx = (lowest + highest) // 2
        start_codepoint, end_codepoint, width = WIDTH_TABLE[idx]
        if codepoint < start_codepoint:
            highest = idx - 1
        elif codepoint > end_codepoint:
            lowest = idx + 1
        else:
            return max(width, 0)
    return 1
+
+
def str_width(line_str: str) -> int:
    """Return the width of `line_str` as it would be displayed in a terminal
    or editor (which respects Unicode East Asian Width).

    You could utilize this function to determine, for example, if a string
    is too wide to display in a terminal or editor.

    An ASCII-only string is measured with `len()`; otherwise the result is
    the sum of `char_width()` over its characters.
    """
    if line_str.isascii():
        # Fast path for a line consisting of only ASCII characters
        return len(line_str)
    return sum(map(char_width, line_str))
+
+
def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Count the number of characters in `line_str` that would fit in a
    terminal or editor of `max_width` (which respects Unicode East Asian
    Width).

    The result is the index of the first character whose `char_width()`
    would push the running total past `max_width`, or `len(line_str)` when
    the whole string fits.
    """
    total_width = 0
    for i, char in enumerate(line_str):
        width = char_width(char)
        if total_width + width > max_width:
            return i
        total_width += width
    return len(line_str)