src/black/strings.py

   1 """
   2 Simple formatting on strings. Further string formatting code is in trans.py.
   3 """
   4
   5 import re
   6 import sys
   7 from functools import lru_cache
   8 from typing import Final, List, Match, Pattern
   9
  10 from black._width_table import WIDTH_TABLE
  11 from blib2to3.pytree import Leaf
  12
  13 STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
  14 STRING_PREFIX_RE: Final = re.compile(
  15     r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
  16 )
  17 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
  18 UNICODE_ESCAPE_RE: Final = re.compile(
  19     r"(?P<backslashes>\\+)(?P<body>"
  20     r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
  21     r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
  22     r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
  23     r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
  24     r")",
  25     re.VERBOSE,
  26 )
  27
  28
  29 def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
  30     """Replace `regex` with `replacement` twice on `original`.
  31
  32     This is used by string normalization to perform replaces on
  33     overlapping matches.
  34     """
  35     return regex.sub(replacement, regex.sub(replacement, original))
  36
  37
  38 def has_triple_quotes(string: str) -> bool:
  39     """
  40     Returns:
  41         True iff @string starts with three quotation characters.
  42     """
  43     raw_string = string.lstrip(STRING_PREFIX_CHARS)
  44     return raw_string[:3] in {'"""', "'''"}
  45
  46
  47 def lines_with_leading_tabs_expanded(s: str) -> List[str]:
  48     """
  49     Splits string into lines and expands only leading tabs (following the normal
  50     Python rules)
  51     """
  52     lines = []
  53     for line in s.splitlines():
  54         # Find the index of the first non-whitespace character after a string of
  55         # whitespace that includes at least one tab
  56         match = FIRST_NON_WHITESPACE_RE.match(line)
  57         if match:
  58             first_non_whitespace_idx = match.start(1)
  59
  60             lines.append(
  61                 line[:first_non_whitespace_idx].expandtabs()
  62                 + line[first_non_whitespace_idx:]
  63             )
  64         else:
  65             lines.append(line)
  66     return lines
  67
  68
  69 def fix_docstring(docstring: str, prefix: str) -> str:
  70     # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
  71     if not docstring:
  72         return ""
  73     lines = lines_with_leading_tabs_expanded(docstring)
  74     # Determine minimum indentation (first line doesn't count):
  75     indent = sys.maxsize
  76     for line in lines[1:]:
  77         stripped = line.lstrip()
  78         if stripped:
  79             indent = min(indent, len(line) - len(stripped))
  80     # Remove indentation (first line is special):
  81     trimmed = [lines[0].strip()]
  82     if indent < sys.maxsize:
  83         last_line_idx = len(lines) - 2
  84         for i, line in enumerate(lines[1:]):
  85             stripped_line = line[indent:].rstrip()
  86             if stripped_line or i == last_line_idx:
  87                 trimmed.append(prefix + stripped_line)
  88             else:
  89                 trimmed.append("")
  90     return "\n".join(trimmed)
  91
  92
  93 def get_string_prefix(string: str) -> str:
  94     """
  95     Pre-conditions:
  96         * assert_is_leaf_string(@string)
  97
  98     Returns:
  99         @string's prefix (e.g. '', 'r', 'f', or 'rf').
 100     """
 101     assert_is_leaf_string(string)
 102
 103     prefix = ""
 104     prefix_idx = 0
 105     while string[prefix_idx] in STRING_PREFIX_CHARS:
 106         prefix += string[prefix_idx]
 107         prefix_idx += 1
 108
 109     return prefix
 110
 111
 112 def assert_is_leaf_string(string: str) -> None:
 113     """
 114     Checks the pre-condition that @string has the format that you would expect
 115     of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
 116     token.STRING`. A more precise description of the pre-conditions that are
 117     checked are listed below.
 118
 119     Pre-conditions:
 120         * @string starts with either ', ", <prefix>', or <prefix>" where
 121         `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
 122         * @string ends with a quote character (' or ").
 123
 124     Raises:
 125         AssertionError(...) if the pre-conditions listed above are not
 126         satisfied.
 127     """
 128     dquote_idx = string.find('"')
 129     squote_idx = string.find("'")
 130     if -1 in [dquote_idx, squote_idx]:
 131         quote_idx = max(dquote_idx, squote_idx)
 132     else:
 133         quote_idx = min(squote_idx, dquote_idx)
 134
 135     assert (
 136         0 <= quote_idx < len(string) - 1
 137     ), f"{string!r} is missing a starting quote character (' or \")."
 138     assert string[-1] in (
 139         "'",
 140         '"',
 141     ), f"{string!r} is missing an ending quote character (' or \")."
 142     assert set(string[:quote_idx]).issubset(
 143         set(STRING_PREFIX_CHARS)
 144     ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
 145
 146
 147 def normalize_string_prefix(s: str) -> str:
 148     """Make all string prefixes lowercase."""
 149     match = STRING_PREFIX_RE.match(s)
 150     assert match is not None, f"failed to match string {s!r}"
 151     orig_prefix = match.group(1)
 152     new_prefix = (
 153         orig_prefix.replace("F", "f")
 154         .replace("B", "b")
 155         .replace("U", "")
 156         .replace("u", "")
 157     )
 158
 159     # Python syntax guarantees max 2 prefixes and that one of them is "r"
 160     if len(new_prefix) == 2 and "r" != new_prefix[0].lower():
 161         new_prefix = new_prefix[::-1]
 162     return f"{new_prefix}{match.group(2)}"
 163
 164
# Re(gex) does actually cache patterns internally but this still improves
# performance on a long list literal of strings by 5-9% since lru_cache's
# caching overhead is much lower.
@lru_cache(maxsize=64)
def _cached_compile(pattern: str) -> Pattern[str]:
    """Compile `pattern` with `re.compile`, memoizing up to 64 distinct patterns."""
    return re.compile(pattern)
 171
 172
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.

    A string that is already double-quoted is only switched to single quotes
    when doing so strictly lowers the number of backslashes in its body.

    Returns:
        The string literal with the other quote style when the conversion is
        possible and worthwhile; otherwise @s, possibly with unnecessary
        escapes of the other quote character removed.
    """
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        # Already uses the preferred triple quotes.
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    # new_quote preceded by an even number of backslashes (possibly zero).
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    # new_quote / orig_quote preceded by an odd number of backslashes.
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    # Everything between the opening and the closing quote.
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        # Build the candidate body: orig_quote no longer needs escaping, while
        # every bare new_quote now does.
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        # Collect the text of each `{...}` replacement field in the candidate
        # body, skipping doubled braces.
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a body ending in `"` would run into the closing `"""`,
        # so escape that final quote.
        new_body = new_body[:-1] + '\\"'
    # Compare the number of backslashes before and after the conversion.
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
 245
 246
 247 def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
 248     """Replace hex codes in Unicode escape sequences with lowercase representation."""
 249     text = leaf.value
 250     prefix = get_string_prefix(text)
 251     if "r" in prefix.lower():
 252         return
 253
 254     def replace(m: Match[str]) -> str:
 255         groups = m.groupdict()
 256         back_slashes = groups["backslashes"]
 257
 258         if len(back_slashes) % 2 == 0:
 259             return back_slashes + groups["body"]
 260
 261         if groups["u"]:
 262             # \u
 263             return back_slashes + "u" + groups["u"].lower()
 264         elif groups["U"]:
 265             # \U
 266             return back_slashes + "U" + groups["U"].lower()
 267         elif groups["x"]:
 268             # \x
 269             return back_slashes + "x" + groups["x"].lower()
 270         else:
 271             assert groups["N"], f"Unexpected match: {m}"
 272             # \N{}
 273             return back_slashes + "N{" + groups["N"].upper() + "}"
 274
 275     leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)
 276
 277
 278 @lru_cache(maxsize=4096)
 279 def char_width(char: str) -> int:
 280     """Return the width of a single character as it would be displayed in a
 281     terminal or editor (which respects Unicode East Asian Width).
 282
 283     Full width characters are counted as 2, while half width characters are
 284     counted as 1.  Also control characters are counted as 0.
 285     """
 286     table = WIDTH_TABLE
 287     codepoint = ord(char)
 288     highest = len(table) - 1
 289     lowest = 0
 290     idx = highest // 2
 291     while True:
 292         start_codepoint, end_codepoint, width = table[idx]
 293         if codepoint < start_codepoint:
 294             highest = idx - 1
 295         elif codepoint > end_codepoint:
 296             lowest = idx + 1
 297         else:
 298             return 0 if width < 0 else width
 299         if highest < lowest:
 300             break
 301         idx = (highest + lowest) // 2
 302     return 1
 303
 304
 305 def str_width(line_str: str) -> int:
 306     """Return the width of `line_str` as it would be displayed in a terminal
 307     or editor (which respects Unicode East Asian Width).
 308
 309     You could utilize this function to determine, for example, if a string
 310     is too wide to display in a terminal or editor.
 311     """
 312     if line_str.isascii():
 313         # Fast path for a line consisting of only ASCII characters
 314         return len(line_str)
 315     return sum(map(char_width, line_str))
 316
 317
 318 def count_chars_in_width(line_str: str, max_width: int) -> int:
 319     """Count the number of characters in `line_str` that would fit in a
 320     terminal or editor of `max_width` (which respects Unicode East Asian
 321     Width).
 322     """
 323     total_width = 0
 324     for i, char in enumerate(line_str):
 325         width = char_width(char)
 326         if width + total_width > max_width:
 327             return i
 328         total_width += width
 329     return len(line_str)