src/black/strings.py

   1 """
   2 Simple formatting on strings. Further string formatting code is in trans.py.
   3 """
   4
   5 import re
   6 import sys
   7 from functools import lru_cache
   8 from typing import List, Match, Pattern
   9
  10 from blib2to3.pytree import Leaf
  11
  12 if sys.version_info < (3, 8):
  13     from typing_extensions import Final
  14 else:
  15     from typing import Final
  16
  17
STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
# Splits a string literal into group 1, the (possibly empty) run of prefix
# characters, and group 2, everything after it. DOTALL lets group 2 span newlines.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Matches a run of whitespace containing at least one tab and captures the first
# non-whitespace character that follows it as group 1.
# NOTE(review): the adjacent `\s*`, `\t+` and `\s*` quantifiers overlap (a tab can be
# consumed by any of them), so a long whitespace-only input can backtrack heavily
# before the match fails.
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
# Matches a run of backslashes followed by the body of a \u, \U, \x or \N{...}
# escape. Group "backslashes" is the whole run and group "body" is the text after
# it; the hex digits (or the character name) are exposed as group "u", "U", "x"
# or "N", depending on which alternative matched.
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)
  32
  33
  34 def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
  35     """Replace `regex` with `replacement` twice on `original`.
  36
  37     This is used by string normalization to perform replaces on
  38     overlapping matches.
  39     """
  40     return regex.sub(replacement, regex.sub(replacement, original))
  41
  42
  43 def has_triple_quotes(string: str) -> bool:
  44     """
  45     Returns:
  46         True iff @string starts with three quotation characters.
  47     """
  48     raw_string = string.lstrip(STRING_PREFIX_CHARS)
  49     return raw_string[:3] in {'"""', "'''"}
  50
  51
  52 def lines_with_leading_tabs_expanded(s: str) -> List[str]:
  53     """
  54     Splits string into lines and expands only leading tabs (following the normal
  55     Python rules)
  56     """
  57     lines = []
  58     for line in s.splitlines():
  59         # Find the index of the first non-whitespace character after a string of
  60         # whitespace that includes at least one tab
  61         match = FIRST_NON_WHITESPACE_RE.match(line)
  62         if match:
  63             first_non_whitespace_idx = match.start(1)
  64
  65             lines.append(
  66                 line[:first_non_whitespace_idx].expandtabs()
  67                 + line[first_non_whitespace_idx:]
  68             )
  69         else:
  70             lines.append(line)
  71     return lines
  72
  73
  74 def fix_docstring(docstring: str, prefix: str) -> str:
  75     # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
  76     if not docstring:
  77         return ""
  78     lines = lines_with_leading_tabs_expanded(docstring)
  79     # Determine minimum indentation (first line doesn't count):
  80     indent = sys.maxsize
  81     for line in lines[1:]:
  82         stripped = line.lstrip()
  83         if stripped:
  84             indent = min(indent, len(line) - len(stripped))
  85     # Remove indentation (first line is special):
  86     trimmed = [lines[0].strip()]
  87     if indent < sys.maxsize:
  88         last_line_idx = len(lines) - 2
  89         for i, line in enumerate(lines[1:]):
  90             stripped_line = line[indent:].rstrip()
  91             if stripped_line or i == last_line_idx:
  92                 trimmed.append(prefix + stripped_line)
  93             else:
  94                 trimmed.append("")
  95     return "\n".join(trimmed)
  96
  97
  98 def get_string_prefix(string: str) -> str:
  99     """
 100     Pre-conditions:
 101         * assert_is_leaf_string(@string)
 102
 103     Returns:
 104         @string's prefix (e.g. '', 'r', 'f', or 'rf').
 105     """
 106     assert_is_leaf_string(string)
 107
 108     prefix = ""
 109     prefix_idx = 0
 110     while string[prefix_idx] in STRING_PREFIX_CHARS:
 111         prefix += string[prefix_idx]
 112         prefix_idx += 1
 113
 114     return prefix
 115
 116
 117 def assert_is_leaf_string(string: str) -> None:
 118     """
 119     Checks the pre-condition that @string has the format that you would expect
 120     of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
 121     token.STRING`. A more precise description of the pre-conditions that are
 122     checked are listed below.
 123
 124     Pre-conditions:
 125         * @string starts with either ', ", <prefix>', or <prefix>" where
 126         `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
 127         * @string ends with a quote character (' or ").
 128
 129     Raises:
 130         AssertionError(...) if the pre-conditions listed above are not
 131         satisfied.
 132     """
 133     dquote_idx = string.find('"')
 134     squote_idx = string.find("'")
 135     if -1 in [dquote_idx, squote_idx]:
 136         quote_idx = max(dquote_idx, squote_idx)
 137     else:
 138         quote_idx = min(squote_idx, dquote_idx)
 139
 140     assert (
 141         0 <= quote_idx < len(string) - 1
 142     ), f"{string!r} is missing a starting quote character (' or \")."
 143     assert string[-1] in (
 144         "'",
 145         '"',
 146     ), f"{string!r} is missing an ending quote character (' or \")."
 147     assert set(string[:quote_idx]).issubset(
 148         set(STRING_PREFIX_CHARS)
 149     ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
 150
 151
 152 def normalize_string_prefix(s: str) -> str:
 153     """Make all string prefixes lowercase."""
 154     match = STRING_PREFIX_RE.match(s)
 155     assert match is not None, f"failed to match string {s!r}"
 156     orig_prefix = match.group(1)
 157     new_prefix = (
 158         orig_prefix.replace("F", "f")
 159         .replace("B", "b")
 160         .replace("U", "")
 161         .replace("u", "")
 162     )
 163
 164     # Python syntax guarantees max 2 prefixes and that one of them is "r"
 165     if len(new_prefix) == 2 and "r" != new_prefix[0].lower():
 166         new_prefix = new_prefix[::-1]
 167     return f"{new_prefix}{match.group(2)}"
 168
 169
 170 # Re(gex) does actually cache patterns internally but this still improves
 171 # performance on a long list literal of strings by 5-9% since lru_cache's
 172 # caching overhead is much lower.
 173 @lru_cache(maxsize=64)
 174 def _cached_compile(pattern: str) -> Pattern[str]:
 175     return re.compile(pattern)
 176
 177
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.

    Returns:
        The string literal with its quotes swapped, or `s` when swapping is not
        possible or not beneficial. Note that in the non-raw case `s` may have
        been rebuilt with unnecessary quote escapes removed before it is
        returned.
    """
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        # Already delimited by triple double quotes: nothing to do.
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        # A swap to single quotes is only kept further down when it strictly
        # reduces the number of backslashes.
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    # new_quote preceded by an even number (possibly zero) of backslashes.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    # new_quote / orig_quote preceded by an odd number of backslashes.
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    # Everything between the opening and closing delimiters.
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        # Build the body as it would read inside new_quote delimiters: drop the
        # backslash in front of orig_quote, add one in front of new_quote.
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        # Collect the contents of every single-brace group in the candidate body.
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case:
        # a body ending in " would run into the closing """ delimiter, so the
        # final quote is escaped.
        new_body = new_body[:-1] + '\\"'
    # Compare the number of backslashes before and after the swap.
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
 250
 251
 252 def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
 253     """Replace hex codes in Unicode escape sequences with lowercase representation."""
 254     text = leaf.value
 255     prefix = get_string_prefix(text)
 256     if "r" in prefix.lower():
 257         return
 258
 259     def replace(m: Match[str]) -> str:
 260         groups = m.groupdict()
 261         back_slashes = groups["backslashes"]
 262
 263         if len(back_slashes) % 2 == 0:
 264             return back_slashes + groups["body"]
 265
 266         if groups["u"]:
 267             # \u
 268             return back_slashes + "u" + groups["u"].lower()
 269         elif groups["U"]:
 270             # \U
 271             return back_slashes + "U" + groups["U"].lower()
 272         elif groups["x"]:
 273             # \x
 274             return back_slashes + "x" + groups["x"].lower()
 275         else:
 276             assert groups["N"], f"Unexpected match: {m}"
 277             # \N{}
 278             return back_slashes + "N{" + groups["N"].upper() + "}"
 279
 280     leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)