"""
Simple formatting on strings. Further string formatting code is in trans.py.
"""

import re
import sys
from functools import lru_cache
from typing import List, Match, Pattern

from blib2to3.pytree import Leaf

if sys.version_info < (3, 8):
    from typing_extensions import Final
else:
    from typing import Final

from black._width_table import WIDTH_TABLE

STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character with the given name in the Unicode database
    r")",
    re.VERBOSE,
)


def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Replace `regex` with `replacement` twice on `original`.

    This is used by string normalization to perform replaces on
    overlapping matches.
    """
    return regex.sub(replacement, regex.sub(replacement, original))
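

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# below is hypothetical. It shows why a single `re.sub` pass is not enough: consecutive
# escaped quotes overlap, so the second occurrence is only rewritten on a second pass.
def _example_sub_twice() -> None:
    pattern = re.compile(r"([^\\]|^)\\((?:\\\\)*)'")  # same shape as the escaped-quote patterns below
    assert pattern.sub(r"\1\2'", r"\'\'") == r"'\'"  # one pass misses the second escaped quote
    assert sub_twice(pattern, r"\1\2'", r"\'\'") == "''"  # two passes catch both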


def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string starts with three quotation characters.
    """
    raw_string = string.lstrip(STRING_PREFIX_CHARS)
    return raw_string[:3] in {'"""', "'''"}
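

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# is hypothetical. The prefix is stripped first, so prefixed triple-quoted strings are
# recognized as well.
def _example_has_triple_quotes() -> None:
    assert has_triple_quotes('"""docstring"""')
    assert has_triple_quotes("rb'''raw bytes'''")
    assert not has_triple_quotes('"just a plain string"')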


def lines_with_leading_tabs_expanded(s: str) -> List[str]:
    """
    Splits string into lines and expands only leading tabs (following the normal
    Python rules).
    """
    lines = []
    for line in s.splitlines():
        # Find the index of the first non-whitespace character after a string of
        # whitespace that includes at least one tab
        match = FIRST_NON_WHITESPACE_RE.match(line)
        if match:
            first_non_whitespace_idx = match.start(1)

            lines.append(
                line[:first_non_whitespace_idx].expandtabs()
                + line[first_non_whitespace_idx:]
            )
        else:
            lines.append(line)
    return lines
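

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# is hypothetical. Only the leading whitespace run containing a tab is expanded;
# tabs after the first non-whitespace character are left alone.
def _example_lines_with_leading_tabs_expanded() -> None:
    assert lines_with_leading_tabs_expanded("\tfoo\tbar\n  baz") == [
        " " * 8 + "foo\tbar",  # leading tab expanded (default tab size of 8)
        "  baz",  # no leading tab, line kept as-is
    ]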


def fix_docstring(docstring: str, prefix: str) -> str:
    # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    if not docstring:
        return docstring
    lines = lines_with_leading_tabs_expanded(docstring)
    # Determine minimum indentation (first line doesn't count):
    indent = sys.maxsize
    for line in lines[1:]:
        stripped = line.lstrip()
        if stripped:
            indent = min(indent, len(line) - len(stripped))
    # Remove indentation (first line is special):
    trimmed = [lines[0].strip()]
    if indent < sys.maxsize:
        last_line_idx = len(lines) - 2
        for i, line in enumerate(lines[1:]):
            stripped_line = line[indent:].rstrip()
            if stripped_line or i == last_line_idx:
                trimmed.append(prefix + stripped_line)
            else:
                trimmed.append("")
    return "\n".join(trimmed)
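

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# is hypothetical. The common indentation of the continuation lines is removed and
# replaced with `prefix`, following the PEP 257 trimming algorithm linked above.
def _example_fix_docstring() -> None:
    docstring = "Summary line.\n\n        Indented detail.\n    "
    assert fix_docstring(docstring, prefix="    ") == (
        "Summary line.\n\n    Indented detail.\n    "
    )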


def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        @string's prefix (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    prefix = ""
    prefix_idx = 0
    while string[prefix_idx] in STRING_PREFIX_CHARS:
        prefix += string[prefix_idx]
        prefix_idx += 1

    return prefix
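

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# is hypothetical. The prefix is returned with its original casing.
def _example_get_string_prefix() -> None:
    assert get_string_prefix("rb'data'") == "rb"
    assert get_string_prefix('F"{x}"') == "F"
    assert get_string_prefix("'no prefix'") == ""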


def assert_is_leaf_string(string: str) -> None:
    """
    Checks the pre-condition that @string has the format that you would expect
    of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
    token.STRING`. A more precise description of the pre-conditions that are
    checked is given below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
        `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    dquote_idx = string.find('"')
    squote_idx = string.find("'")
    if -1 in [dquote_idx, squote_idx]:
        quote_idx = max(dquote_idx, squote_idx)
    else:
        quote_idx = min(squote_idx, dquote_idx)

    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    assert set(string[:quote_idx]).issubset(
        set(STRING_PREFIX_CHARS)
    ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
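

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# is hypothetical. Well-formed leaf strings pass silently, anything else raises.
def _example_assert_is_leaf_string() -> None:
    assert_is_leaf_string('rf"{x}"')  # valid prefix followed by a quoted body
    try:
        assert_is_leaf_string("not a quoted leaf")
    except AssertionError:
        pass
    else:
        raise RuntimeError("expected an AssertionError")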


def normalize_string_prefix(s: str) -> str:
    """Make all string prefixes lowercase."""
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    orig_prefix = match.group(1)
    new_prefix = (
        orig_prefix.replace("F", "f")
        .replace("B", "b")
        .replace("U", "")
        .replace("u", "")
    )

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and "r" != new_prefix[0].lower():
        new_prefix = new_prefix[::-1]
    return f"{new_prefix}{match.group(2)}"
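

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# is hypothetical. "F" and "B" are lowercased, the legacy "u"/"U" prefix is dropped,
# and a two-character prefix is reordered so the raw marker comes first.
def _example_normalize_string_prefix() -> None:
    assert normalize_string_prefix('F"{x}"') == 'f"{x}"'
    assert normalize_string_prefix("U'text'") == "'text'"
    assert normalize_string_prefix("BR'data'") == "Rb'data'"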


# Re(gex) does actually cache patterns internally but this still improves
# performance on a long list literal of strings by 5-9% since lru_cache's
# caching overhead is much lower.
@lru_cache(maxsize=64)
def _cached_compile(pattern: str) -> Pattern[str]:
    return re.compile(pattern)


def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.
    """
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case:
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
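

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# is hypothetical. Quotes are switched to double quotes only when doing so does not
# add backslash escapes.
def _example_normalize_string_quotes() -> None:
    assert normalize_string_quotes("'abc'") == '"abc"'
    assert normalize_string_quotes('"already double"') == '"already double"'
    mixed = "'say \"hi\"'"  # flipping these quotes would require escaping
    assert normalize_string_quotes(mixed) == mixed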


def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation."""
    text = leaf.value
    prefix = get_string_prefix(text)
    if "r" in prefix.lower():
        return

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        if len(back_slashes) % 2 == 0:
            return back_slashes + groups["body"]

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        elif groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        elif groups["x"]:
            # \x
            return back_slashes + "x" + groups["x"].lower()
        else:
            assert groups["N"], f"Unexpected match: {m}"
            # \N{}
            return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)
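

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# is hypothetical and the blib2to3 token import is an assumption. Hex digits are
# lowercased, N{...} names are uppercased, and raw strings are left untouched.
def _example_normalize_unicode_escape_sequences() -> None:
    from blib2to3.pgen2 import token

    leaf = Leaf(token.STRING, r'"\u00E9 \N{snowman}"')
    normalize_unicode_escape_sequences(leaf)
    assert leaf.value == r'"\u00e9 \N{SNOWMAN}"'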


@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return the width of a single character as it would be displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Full width characters are counted as 2, while half width characters are
    counted as 1. Control characters are counted as 0.
    """
    table = WIDTH_TABLE
    codepoint = ord(char)
    highest = len(table) - 1
    lowest = 0
    idx = highest // 2
    while True:
        start_codepoint, end_codepoint, width = table[idx]
        if codepoint < start_codepoint:
            highest = idx - 1
        elif codepoint > end_codepoint:
            lowest = idx + 1
        else:
            return 0 if width < 0 else width
        if highest < lowest:
            return 1
        idx = (highest + lowest) // 2
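

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# is hypothetical. WIDTH_TABLE only stores the non-default ranges, so the binary
# search falls back to a width of 1 for ordinary characters.
def _example_char_width() -> None:
    assert char_width("a") == 1  # plain ASCII, default width
    assert char_width("宽") == 2  # East Asian Wide character
    assert char_width("\u0301") == 0  # combining accent, zero display width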


def str_width(line_str: str) -> int:
    """Return the width of `line_str` as it would be displayed in a terminal
    or editor (which respects Unicode East Asian Width).

    You could use this function to determine, for example, if a string
    is too wide to display in a terminal or editor.
    """
    if line_str.isascii():
        # Fast path for a line consisting of only ASCII characters
        return len(line_str)
    return sum(map(char_width, line_str))
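

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# is hypothetical. Wide characters make the displayed width larger than len().
def _example_str_width() -> None:
    assert str_width("hello") == 5
    assert str_width("ｈｅｌｌｏ") == 10  # fullwidth forms occupy two columns each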


def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Count the number of characters in `line_str` that would fit in a
    terminal or editor of `max_width` (which respects Unicode East Asian
    Width).
    """
    total_width = 0
    for i, char in enumerate(line_str):
        width = char_width(char)
        if width + total_width > max_width:
            return i
        total_width += width
    return len(line_str)
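

# Editor's note: an illustrative sketch, not part of upstream Black; the helper name
# is hypothetical. The count stops right before the character that would overflow
# `max_width`.
def _example_count_chars_in_width() -> None:
    assert count_chars_in_width("abcdef", 4) == 4
    assert count_chars_in_width("宽宽宽", 5) == 2  # a third wide char would exceed width 5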