src/black/strings.py

   1 """
   2 Simple formatting on strings. Further string formatting code is in trans.py.
   3 """
   4
   5 import re
   6 import sys
   7 from functools import lru_cache
   8 from typing import List, Match, Pattern
   9
  10 from blib2to3.pytree import Leaf
  11
  12 if sys.version_info < (3, 8):
  13     from typing_extensions import Final
  14 else:
  15     from typing import Final
  16
  17 from black._width_table import WIDTH_TABLE
  18
# Characters that may appear in a string literal prefix, in both cases.
STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
# Splits a string token in two: group 1 is the (possibly empty) run of prefix
# characters, group 2 is everything after it. DOTALL lets group 2 span
# newlines.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Matches a run of whitespace that contains at least one tab; group 1 is the
# first non-whitespace character that follows that run.
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
# Matches one or more backslashes followed by the body of a \u, \U, \x or
# \N{...} escape. The number of backslashes is captured separately in the
# "backslashes" group, and the named groups u, U, x and N hold the hex digits
# (or the character name) of whichever alternative matched.
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)
  33
  34
  35 def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
  36     """Replace `regex` with `replacement` twice on `original`.
  37
  38     This is used by string normalization to perform replaces on
  39     overlapping matches.
  40     """
  41     return regex.sub(replacement, regex.sub(replacement, original))
  42
  43
  44 def has_triple_quotes(string: str) -> bool:
  45     """
  46     Returns:
  47         True iff @string starts with three quotation characters.
  48     """
  49     raw_string = string.lstrip(STRING_PREFIX_CHARS)
  50     return raw_string[:3] in {'"""', "'''"}
  51
  52
  53 def lines_with_leading_tabs_expanded(s: str) -> List[str]:
  54     """
  55     Splits string into lines and expands only leading tabs (following the normal
  56     Python rules)
  57     """
  58     lines = []
  59     for line in s.splitlines():
  60         # Find the index of the first non-whitespace character after a string of
  61         # whitespace that includes at least one tab
  62         match = FIRST_NON_WHITESPACE_RE.match(line)
  63         if match:
  64             first_non_whitespace_idx = match.start(1)
  65
  66             lines.append(
  67                 line[:first_non_whitespace_idx].expandtabs()
  68                 + line[first_non_whitespace_idx:]
  69             )
  70         else:
  71             lines.append(line)
  72     return lines
  73
  74
  75 def fix_docstring(docstring: str, prefix: str) -> str:
  76     # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
  77     if not docstring:
  78         return ""
  79     lines = lines_with_leading_tabs_expanded(docstring)
  80     # Determine minimum indentation (first line doesn't count):
  81     indent = sys.maxsize
  82     for line in lines[1:]:
  83         stripped = line.lstrip()
  84         if stripped:
  85             indent = min(indent, len(line) - len(stripped))
  86     # Remove indentation (first line is special):
  87     trimmed = [lines[0].strip()]
  88     if indent < sys.maxsize:
  89         last_line_idx = len(lines) - 2
  90         for i, line in enumerate(lines[1:]):
  91             stripped_line = line[indent:].rstrip()
  92             if stripped_line or i == last_line_idx:
  93                 trimmed.append(prefix + stripped_line)
  94             else:
  95                 trimmed.append("")
  96     return "\n".join(trimmed)
  97
  98
  99 def get_string_prefix(string: str) -> str:
 100     """
 101     Pre-conditions:
 102         * assert_is_leaf_string(@string)
 103
 104     Returns:
 105         @string's prefix (e.g. '', 'r', 'f', or 'rf').
 106     """
 107     assert_is_leaf_string(string)
 108
 109     prefix = ""
 110     prefix_idx = 0
 111     while string[prefix_idx] in STRING_PREFIX_CHARS:
 112         prefix += string[prefix_idx]
 113         prefix_idx += 1
 114
 115     return prefix
 116
 117
 118 def assert_is_leaf_string(string: str) -> None:
 119     """
 120     Checks the pre-condition that @string has the format that you would expect
 121     of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
 122     token.STRING`. A more precise description of the pre-conditions that are
 123     checked are listed below.
 124
 125     Pre-conditions:
 126         * @string starts with either ', ", <prefix>', or <prefix>" where
 127         `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
 128         * @string ends with a quote character (' or ").
 129
 130     Raises:
 131         AssertionError(...) if the pre-conditions listed above are not
 132         satisfied.
 133     """
 134     dquote_idx = string.find('"')
 135     squote_idx = string.find("'")
 136     if -1 in [dquote_idx, squote_idx]:
 137         quote_idx = max(dquote_idx, squote_idx)
 138     else:
 139         quote_idx = min(squote_idx, dquote_idx)
 140
 141     assert (
 142         0 <= quote_idx < len(string) - 1
 143     ), f"{string!r} is missing a starting quote character (' or \")."
 144     assert string[-1] in (
 145         "'",
 146         '"',
 147     ), f"{string!r} is missing an ending quote character (' or \")."
 148     assert set(string[:quote_idx]).issubset(
 149         set(STRING_PREFIX_CHARS)
 150     ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
 151
 152
 153 def normalize_string_prefix(s: str) -> str:
 154     """Make all string prefixes lowercase."""
 155     match = STRING_PREFIX_RE.match(s)
 156     assert match is not None, f"failed to match string {s!r}"
 157     orig_prefix = match.group(1)
 158     new_prefix = (
 159         orig_prefix.replace("F", "f")
 160         .replace("B", "b")
 161         .replace("U", "")
 162         .replace("u", "")
 163     )
 164
 165     # Python syntax guarantees max 2 prefixes and that one of them is "r"
 166     if len(new_prefix) == 2 and "r" != new_prefix[0].lower():
 167         new_prefix = new_prefix[::-1]
 168     return f"{new_prefix}{match.group(2)}"
 169
 170
 171 # Re(gex) does actually cache patterns internally but this still improves
 172 # performance on a long list literal of strings by 5-9% since lru_cache's
 173 # caching overhead is much lower.
 174 @lru_cache(maxsize=64)
 175 def _cached_compile(pattern: str) -> Pattern[str]:
 176     return re.compile(pattern)
 177
 178
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.

    Args:
        s: Source text of a string literal, including any prefix characters
            and the surrounding quotes.

    Returns:
        The literal rewritten with the other quote style when doing so does
        not increase the number of backslashes in the body; otherwise the
        literal in its current quote style. For non-raw strings the returned
        text may still differ from the input, because unnecessary escapes of
        the other quote character are removed first.
    """
    # Look past the prefix to find out which quote style is currently in use.
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        # Already using the preferred triple quote; nothing to do.
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    # new_quote preceded by an even number (possibly zero) of backslashes.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    # new_quote / orig_quote preceded by an odd number of backslashes.
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    # Text between the opening and closing quotes.
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        # Drop the backslash in front of escaped orig_quote characters, then
        # add one in front of every unescaped new_quote character.
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        # Collect the contents of every single-brace {...} replacement field
        # in the candidate body.
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a body ending in " would run into the closing """, so
        # the final quote character gets escaped.
        new_body = new_body[:-1] + '\\"'
    # Compare how many backslashes each variant of the body contains.
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
 251
 252
 253 def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
 254     """Replace hex codes in Unicode escape sequences with lowercase representation."""
 255     text = leaf.value
 256     prefix = get_string_prefix(text)
 257     if "r" in prefix.lower():
 258         return
 259
 260     def replace(m: Match[str]) -> str:
 261         groups = m.groupdict()
 262         back_slashes = groups["backslashes"]
 263
 264         if len(back_slashes) % 2 == 0:
 265             return back_slashes + groups["body"]
 266
 267         if groups["u"]:
 268             # \u
 269             return back_slashes + "u" + groups["u"].lower()
 270         elif groups["U"]:
 271             # \U
 272             return back_slashes + "U" + groups["U"].lower()
 273         elif groups["x"]:
 274             # \x
 275             return back_slashes + "x" + groups["x"].lower()
 276         else:
 277             assert groups["N"], f"Unexpected match: {m}"
 278             # \N{}
 279             return back_slashes + "N{" + groups["N"].upper() + "}"
 280
 281     leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)
 282
 283
 284 @lru_cache(maxsize=4096)
 285 def char_width(char: str) -> int:
 286     """Return the width of a single character as it would be displayed in a
 287     terminal or editor (which respects Unicode East Asian Width).
 288
 289     Full width characters are counted as 2, while half width characters are
 290     counted as 1.  Also control characters are counted as 0.
 291     """
 292     table = WIDTH_TABLE
 293     codepoint = ord(char)
 294     highest = len(table) - 1
 295     lowest = 0
 296     idx = highest // 2
 297     while True:
 298         start_codepoint, end_codepoint, width = table[idx]
 299         if codepoint < start_codepoint:
 300             highest = idx - 1
 301         elif codepoint > end_codepoint:
 302             lowest = idx + 1
 303         else:
 304             return 0 if width < 0 else width
 305         if highest < lowest:
 306             break
 307         idx = (highest + lowest) // 2
 308     return 1
 309
 310
 311 def str_width(line_str: str) -> int:
 312     """Return the width of `line_str` as it would be displayed in a terminal
 313     or editor (which respects Unicode East Asian Width).
 314
 315     You could utilize this function to determine, for example, if a string
 316     is too wide to display in a terminal or editor.
 317     """
 318     if line_str.isascii():
 319         # Fast path for a line consisting of only ASCII characters
 320         return len(line_str)
 321     return sum(map(char_width, line_str))
 322
 323
 324 def count_chars_in_width(line_str: str, max_width: int) -> int:
 325     """Count the number of characters in `line_str` that would fit in a
 326     terminal or editor of `max_width` (which respects Unicode East Asian
 327     Width).
 328     """
 329     total_width = 0
 330     for i, char in enumerate(line_str):
 331         width = char_width(char)
 332         if width + total_width > max_width:
 333             return i
 334         total_width += width
 335     return len(line_str)