src/black/strings.py

   1 """
   2 Simple formatting on strings. Further string formatting code is in trans.py.
   3 """
   4
   5 import re
   6 import sys
   7 from functools import lru_cache
   8 from typing import List, Pattern
   9
  10 if sys.version_info < (3, 8):
  11     from typing_extensions import Final
  12 else:
  13     from typing import Final
  14
  15
  16 STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
  17 STRING_PREFIX_RE: Final = re.compile(
  18     r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
  19 )
  20 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
  21
  22
  23 def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
  24     """Replace `regex` with `replacement` twice on `original`.
  25
  26     This is used by string normalization to perform replaces on
  27     overlapping matches.
  28     """
  29     return regex.sub(replacement, regex.sub(replacement, original))
  30
  31
  32 def has_triple_quotes(string: str) -> bool:
  33     """
  34     Returns:
  35         True iff @string starts with three quotation characters.
  36     """
  37     raw_string = string.lstrip(STRING_PREFIX_CHARS)
  38     return raw_string[:3] in {'"""', "'''"}
  39
  40
  41 def lines_with_leading_tabs_expanded(s: str) -> List[str]:
  42     """
  43     Splits string into lines and expands only leading tabs (following the normal
  44     Python rules)
  45     """
  46     lines = []
  47     for line in s.splitlines():
  48         # Find the index of the first non-whitespace character after a string of
  49         # whitespace that includes at least one tab
  50         match = FIRST_NON_WHITESPACE_RE.match(line)
  51         if match:
  52             first_non_whitespace_idx = match.start(1)
  53
  54             lines.append(
  55                 line[:first_non_whitespace_idx].expandtabs()
  56                 + line[first_non_whitespace_idx:]
  57             )
  58         else:
  59             lines.append(line)
  60     return lines
  61
  62
  63 def fix_docstring(docstring: str, prefix: str) -> str:
  64     # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
  65     if not docstring:
  66         return ""
  67     lines = lines_with_leading_tabs_expanded(docstring)
  68     # Determine minimum indentation (first line doesn't count):
  69     indent = sys.maxsize
  70     for line in lines[1:]:
  71         stripped = line.lstrip()
  72         if stripped:
  73             indent = min(indent, len(line) - len(stripped))
  74     # Remove indentation (first line is special):
  75     trimmed = [lines[0].strip()]
  76     if indent < sys.maxsize:
  77         last_line_idx = len(lines) - 2
  78         for i, line in enumerate(lines[1:]):
  79             stripped_line = line[indent:].rstrip()
  80             if stripped_line or i == last_line_idx:
  81                 trimmed.append(prefix + stripped_line)
  82             else:
  83                 trimmed.append("")
  84     return "\n".join(trimmed)
  85
  86
  87 def get_string_prefix(string: str) -> str:
  88     """
  89     Pre-conditions:
  90         * assert_is_leaf_string(@string)
  91
  92     Returns:
  93         @string's prefix (e.g. '', 'r', 'f', or 'rf').
  94     """
  95     assert_is_leaf_string(string)
  96
  97     prefix = ""
  98     prefix_idx = 0
  99     while string[prefix_idx] in STRING_PREFIX_CHARS:
 100         prefix += string[prefix_idx]
 101         prefix_idx += 1
 102
 103     return prefix
 104
 105
 106 def assert_is_leaf_string(string: str) -> None:
 107     """
 108     Checks the pre-condition that @string has the format that you would expect
 109     of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
 110     token.STRING`. A more precise description of the pre-conditions that are
 111     checked are listed below.
 112
 113     Pre-conditions:
 114         * @string starts with either ', ", <prefix>', or <prefix>" where
 115         `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
 116         * @string ends with a quote character (' or ").
 117
 118     Raises:
 119         AssertionError(...) if the pre-conditions listed above are not
 120         satisfied.
 121     """
 122     dquote_idx = string.find('"')
 123     squote_idx = string.find("'")
 124     if -1 in [dquote_idx, squote_idx]:
 125         quote_idx = max(dquote_idx, squote_idx)
 126     else:
 127         quote_idx = min(squote_idx, dquote_idx)
 128
 129     assert (
 130         0 <= quote_idx < len(string) - 1
 131     ), f"{string!r} is missing a starting quote character (' or \")."
 132     assert string[-1] in (
 133         "'",
 134         '"',
 135     ), f"{string!r} is missing an ending quote character (' or \")."
 136     assert set(string[:quote_idx]).issubset(
 137         set(STRING_PREFIX_CHARS)
 138     ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
 139
 140
 141 def normalize_string_prefix(s: str) -> str:
 142     """Make all string prefixes lowercase."""
 143     match = STRING_PREFIX_RE.match(s)
 144     assert match is not None, f"failed to match string {s!r}"
 145     orig_prefix = match.group(1)
 146     new_prefix = (
 147         orig_prefix.replace("F", "f")
 148         .replace("B", "b")
 149         .replace("U", "")
 150         .replace("u", "")
 151     )
 152
 153     # Python syntax guarantees max 2 prefixes and that one of them is "r"
 154     if len(new_prefix) == 2 and "r" != new_prefix[0].lower():
 155         new_prefix = new_prefix[::-1]
 156     return f"{new_prefix}{match.group(2)}"
 157
 158
 159 # Re(gex) does actually cache patterns internally but this still improves
 160 # performance on a long list literal of strings by 5-9% since lru_cache's
 161 # caching overhead is much lower.
 162 @lru_cache(maxsize=64)
 163 def _cached_compile(pattern: str) -> Pattern[str]:
 164     return re.compile(pattern)
 165
 166
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.

    @s is a complete string literal, including any prefix and its quotes.

    Returns:
        @s rewritten with the other quote style when that does not increase
        the number of backslashes in the body. Otherwise @s is returned,
        possibly with unnecessary escapes of the other quote character
        removed. Triple-double-quoted strings are returned untouched.
    """
    # Work out which quote style the literal uses now (orig_quote) and which
    # style would replace it (new_quote).
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        # Already double-quoted: the single-quoted form is still computed and
        # is only chosen below if it needs strictly fewer backslashes.
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    # new_quote preceded by an even number (possibly zero) of backslashes.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    # new_quote / orig_quote preceded by an odd number of backslashes.
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    # The text between the opening and the closing quotes.
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        # Build the candidate body: unescape orig_quote, which no longer needs
        # a backslash, then escape every bare new_quote.
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        # Collect the text inside each single-brace replacement field of the
        # candidate body.
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: the body ends with a double quote, which would sit right
        # next to the closing triple double quote, so escape it.
        new_body = new_body[:-1] + '\\"'
    # Compare the number of backslashes before and after the conversion.
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"