src/black/strings.py

   1 """
   2 Simple formatting on strings. Further string formatting code is in trans.py.
   3 """
   4
   5 import regex as re
   6 import sys
   7 from typing import List, Pattern
   8
   9
# Every character that may appear in a string literal's prefix, listed in
# both lower and upper case.
STRING_PREFIX_CHARS = "furbFURB"  # All possible string prefix characters.
  11
  12
  13 def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
  14     """Replace `regex` with `replacement` twice on `original`.
  15
  16     This is used by string normalization to perform replaces on
  17     overlapping matches.
  18     """
  19     return regex.sub(replacement, regex.sub(replacement, original))
  20
  21
  22 def has_triple_quotes(string: str) -> bool:
  23     """
  24     Returns:
  25         True iff @string starts with three quotation characters.
  26     """
  27     raw_string = string.lstrip(STRING_PREFIX_CHARS)
  28     return raw_string[:3] in {'"""', "'''"}
  29
  30
  31 def lines_with_leading_tabs_expanded(s: str) -> List[str]:
  32     """
  33     Splits string into lines and expands only leading tabs (following the normal
  34     Python rules)
  35     """
  36     lines = []
  37     for line in s.splitlines():
  38         # Find the index of the first non-whitespace character after a string of
  39         # whitespace that includes at least one tab
  40         match = re.match(r"\s*\t+\s*(\S)", line)
  41         if match:
  42             first_non_whitespace_idx = match.start(1)
  43
  44             lines.append(
  45                 line[:first_non_whitespace_idx].expandtabs()
  46                 + line[first_non_whitespace_idx:]
  47             )
  48         else:
  49             lines.append(line)
  50     return lines
  51
  52
  53 def fix_docstring(docstring: str, prefix: str) -> str:
  54     # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
  55     if not docstring:
  56         return ""
  57     lines = lines_with_leading_tabs_expanded(docstring)
  58     # Determine minimum indentation (first line doesn't count):
  59     indent = sys.maxsize
  60     for line in lines[1:]:
  61         stripped = line.lstrip()
  62         if stripped:
  63             indent = min(indent, len(line) - len(stripped))
  64     # Remove indentation (first line is special):
  65     trimmed = [lines[0].strip()]
  66     if indent < sys.maxsize:
  67         last_line_idx = len(lines) - 2
  68         for i, line in enumerate(lines[1:]):
  69             stripped_line = line[indent:].rstrip()
  70             if stripped_line or i == last_line_idx:
  71                 trimmed.append(prefix + stripped_line)
  72             else:
  73                 trimmed.append("")
  74     return "\n".join(trimmed)
  75
  76
  77 def get_string_prefix(string: str) -> str:
  78     """
  79     Pre-conditions:
  80         * assert_is_leaf_string(@string)
  81
  82     Returns:
  83         @string's prefix (e.g. '', 'r', 'f', or 'rf').
  84     """
  85     assert_is_leaf_string(string)
  86
  87     prefix = ""
  88     prefix_idx = 0
  89     while string[prefix_idx] in STRING_PREFIX_CHARS:
  90         prefix += string[prefix_idx]
  91         prefix_idx += 1
  92
  93     return prefix
  94
  95
  96 def assert_is_leaf_string(string: str) -> None:
  97     """
  98     Checks the pre-condition that @string has the format that you would expect
  99     of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
 100     token.STRING`. A more precise description of the pre-conditions that are
 101     checked are listed below.
 102
 103     Pre-conditions:
 104         * @string starts with either ', ", <prefix>', or <prefix>" where
 105         `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
 106         * @string ends with a quote character (' or ").
 107
 108     Raises:
 109         AssertionError(...) if the pre-conditions listed above are not
 110         satisfied.
 111     """
 112     dquote_idx = string.find('"')
 113     squote_idx = string.find("'")
 114     if -1 in [dquote_idx, squote_idx]:
 115         quote_idx = max(dquote_idx, squote_idx)
 116     else:
 117         quote_idx = min(squote_idx, dquote_idx)
 118
 119     assert (
 120         0 <= quote_idx < len(string) - 1
 121     ), f"{string!r} is missing a starting quote character (' or \")."
 122     assert string[-1] in (
 123         "'",
 124         '"',
 125     ), f"{string!r} is missing an ending quote character (' or \")."
 126     assert set(string[:quote_idx]).issubset(
 127         set(STRING_PREFIX_CHARS)
 128     ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
 129
 130
 131 def normalize_string_prefix(s: str, remove_u_prefix: bool = False) -> str:
 132     """Make all string prefixes lowercase.
 133
 134     If remove_u_prefix is given, also removes any u prefix from the string.
 135     """
 136     match = re.match(r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", s, re.DOTALL)
 137     assert match is not None, f"failed to match string {s!r}"
 138     orig_prefix = match.group(1)
 139     new_prefix = orig_prefix.replace("F", "f").replace("B", "b").replace("U", "u")
 140     if remove_u_prefix:
 141         new_prefix = new_prefix.replace("u", "")
 142     return f"{new_prefix}{match.group(2)}"
 143
 144
 145 def normalize_string_quotes(s: str) -> str:
 146     """Prefer double quotes but only if it doesn't cause more escaping.
 147
 148     Adds or removes backslashes as appropriate. Doesn't parse and fix
 149     strings nested in f-strings.
 150     """
 151     value = s.lstrip(STRING_PREFIX_CHARS)
 152     if value[:3] == '"""':
 153         return s
 154
 155     elif value[:3] == "'''":
 156         orig_quote = "'''"
 157         new_quote = '"""'
 158     elif value[0] == '"':
 159         orig_quote = '"'
 160         new_quote = "'"
 161     else:
 162         orig_quote = "'"
 163         new_quote = '"'
 164     first_quote_pos = s.find(orig_quote)
 165     if first_quote_pos == -1:
 166         return s  # There's an internal error
 167
 168     prefix = s[:first_quote_pos]
 169     unescaped_new_quote = re.compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
 170     escaped_new_quote = re.compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
 171     escaped_orig_quote = re.compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
 172     body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
 173     if "r" in prefix.casefold():
 174         if unescaped_new_quote.search(body):
 175             # There's at least one unescaped new_quote in this raw string
 176             # so converting is impossible
 177             return s
 178
 179         # Do not introduce or remove backslashes in raw strings
 180         new_body = body
 181     else:
 182         # remove unnecessary escapes
 183         new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
 184         if body != new_body:
 185             # Consider the string without unnecessary escapes as the original
 186             body = new_body
 187             s = f"{prefix}{orig_quote}{body}{orig_quote}"
 188         new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
 189         new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
 190     if "f" in prefix.casefold():
 191         matches = re.findall(
 192             r"""
 193             (?:[^{]|^)\{  # start of the string or a non-{ followed by a single {
 194                 ([^{].*?)  # contents of the brackets except if begins with {{
 195             \}(?:[^}]|$)  # A } followed by end of the string or a non-}
 196             """,
 197             new_body,
 198             re.VERBOSE,
 199         )
 200         for m in matches:
 201             if "\\" in str(m):
 202                 # Do not introduce backslashes in interpolated expressions
 203                 return s
 204
 205     if new_quote == '"""' and new_body[-1:] == '"':
 206         # edge case:
 207         new_body = new_body[:-1] + '\\"'
 208     orig_escape_count = body.count("\\")
 209     new_escape_count = new_body.count("\\")
 210     if new_escape_count > orig_escape_count:
 211         return s  # Do not introduce more escaping
 212
 213     if new_escape_count == orig_escape_count and orig_quote == '"':
 214         return s  # Prefer double quotes
 215
 216     return f"{prefix}{new_quote}{new_body}{new_quote}"