X-Git-Url: https://git.madduck.net/etc/vim.git/blobdiff_plain/8c04847aa22d14f01bb206cfc1b1e1cebd2ae538..3dcacdda0d7f69a1705f3e2a151c24a6cf004171:/src/black/strings.py?ds=sidebyside diff --git a/src/black/strings.py b/src/black/strings.py index d7b6c24..0d30f09 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -2,12 +2,28 @@ Simple formatting on strings. Further string formatting code is in trans.py. """ -import regex as re +import re import sys -from typing import List, Pattern - - -STRING_PREFIX_CHARS = "furbFURB" # All possible string prefix characters. +from functools import lru_cache +from typing import Final, List, Match, Pattern + +from black._width_table import WIDTH_TABLE +from blib2to3.pytree import Leaf + +STRING_PREFIX_CHARS: Final = "furbFURB" # All possible string prefix characters. +STRING_PREFIX_RE: Final = re.compile( + r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL +) +FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") +UNICODE_ESCAPE_RE: Final = re.compile( + r"(?P\\+)(?P" + r"(u(?P[a-fA-F0-9]{4}))" # Character with 16-bit hex value xxxx + r"|(U(?P[a-fA-F0-9]{8}))" # Character with 32-bit hex value xxxxxxxx + r"|(x(?P[a-fA-F0-9]{2}))" # Character with hex value hh + r"|(N\{(?P[a-zA-Z0-9 \-]{2,})\})" # Character named name in the Unicode database + r")", + re.VERBOSE, +) def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str: @@ -37,7 +53,7 @@ def lines_with_leading_tabs_expanded(s: str) -> List[str]: for line in s.splitlines(): # Find the index of the first non-whitespace character after a string of # whitespace that includes at least one tab - match = re.match(r"\s*\t+\s*(\S)", line) + match = FIRST_NON_WHITESPACE_RE.match(line) if match: first_non_whitespace_idx = match.start(1) @@ -128,20 +144,32 @@ def assert_is_leaf_string(string: str) -> None: ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}." -def normalize_string_prefix(s: str, remove_u_prefix: bool = False) -> str: - """Make all string prefixes lowercase. - - If remove_u_prefix is given, also removes any u prefix from the string. - """ - match = re.match(r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", s, re.DOTALL) +def normalize_string_prefix(s: str) -> str: + """Make all string prefixes lowercase.""" + match = STRING_PREFIX_RE.match(s) assert match is not None, f"failed to match string {s!r}" orig_prefix = match.group(1) - new_prefix = orig_prefix.replace("F", "f").replace("B", "b").replace("U", "u") - if remove_u_prefix: - new_prefix = new_prefix.replace("u", "") + new_prefix = ( + orig_prefix.replace("F", "f") + .replace("B", "b") + .replace("U", "") + .replace("u", "") + ) + + # Python syntax guarantees max 2 prefixes and that one of them is "r" + if len(new_prefix) == 2 and "r" != new_prefix[0].lower(): + new_prefix = new_prefix[::-1] return f"{new_prefix}{match.group(2)}" +# Re(gex) does actually cache patterns internally but this still improves +# performance on a long list literal of strings by 5-9% since lru_cache's +# caching overhead is much lower. +@lru_cache(maxsize=64) +def _cached_compile(pattern: str) -> Pattern[str]: + return re.compile(pattern) + + def normalize_string_quotes(s: str) -> str: """Prefer double quotes but only if it doesn't cause more escaping. @@ -166,9 +194,9 @@ def normalize_string_quotes(s: str) -> str: return s # There's an internal error prefix = s[:first_quote_pos] - unescaped_new_quote = re.compile(rf"(([^\\]|^)(\\\\)*){new_quote}") - escaped_new_quote = re.compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}") - escaped_orig_quote = re.compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}") + unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}") + escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}") + escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}") body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)] if "r" in prefix.casefold(): if unescaped_new_quote.search(body): @@ -214,3 +242,88 @@ def normalize_string_quotes(s: str) -> str: return s # Prefer double quotes return f"{prefix}{new_quote}{new_body}{new_quote}" + + +def normalize_unicode_escape_sequences(leaf: Leaf) -> None: + """Replace hex codes in Unicode escape sequences with lowercase representation.""" + text = leaf.value + prefix = get_string_prefix(text) + if "r" in prefix.lower(): + return + + def replace(m: Match[str]) -> str: + groups = m.groupdict() + back_slashes = groups["backslashes"] + + if len(back_slashes) % 2 == 0: + return back_slashes + groups["body"] + + if groups["u"]: + # \u + return back_slashes + "u" + groups["u"].lower() + elif groups["U"]: + # \U + return back_slashes + "U" + groups["U"].lower() + elif groups["x"]: + # \x + return back_slashes + "x" + groups["x"].lower() + else: + assert groups["N"], f"Unexpected match: {m}" + # \N{} + return back_slashes + "N{" + groups["N"].upper() + "}" + + leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text) + + +@lru_cache(maxsize=4096) +def char_width(char: str) -> int: + """Return the width of a single character as it would be displayed in a + terminal or editor (which respects Unicode East Asian Width). + + Full width characters are counted as 2, while half width characters are + counted as 1. Also control characters are counted as 0. + """ + table = WIDTH_TABLE + codepoint = ord(char) + highest = len(table) - 1 + lowest = 0 + idx = highest // 2 + while True: + start_codepoint, end_codepoint, width = table[idx] + if codepoint < start_codepoint: + highest = idx - 1 + elif codepoint > end_codepoint: + lowest = idx + 1 + else: + return 0 if width < 0 else width + if highest < lowest: + break + idx = (highest + lowest) // 2 + return 1 + + +def str_width(line_str: str) -> int: + """Return the width of `line_str` as it would be displayed in a terminal + or editor (which respects Unicode East Asian Width). + + You could utilize this function to determine, for example, if a string + is too wide to display in a terminal or editor. + """ + if line_str.isascii(): + # Fast path for a line consisting of only ASCII characters + return len(line_str) + return sum(map(char_width, line_str)) + + +def count_chars_in_width(line_str: str, max_width: int) -> int: + """Count the number of characters in `line_str` that would fit in a + terminal or editor of `max_width` (which respects Unicode East Asian + Width). + """ + total_width = 0 + for i, char in enumerate(line_str): + width = char_width(char) + if width + total_width > max_width: + return i + total_width += width + return len(line_str)