All patches and comments are welcome. Please squash your changes into logical
commits before using git-format-patch and git-send-email to send them to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
"""
Simple formatting on strings. Further string formatting code is in trans.py.
"""
import re
import sys
from functools import lru_cache
from typing import Final, List, Match, Pattern

from black._width_table import WIDTH_TABLE
from blib2to3.pytree import Leaf
STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
# Splits a string token into (prefix, remainder). DOTALL lets the remainder
# span newlines, as happens with triple-quoted strings.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Matches leading whitespace that contains at least one tab; group 1 is the
# first non-whitespace character that follows it.
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
# Matches a run of backslashes followed by the body of a \u, \U, \x or \N{...}
# escape; each kind is captured in its own named group.
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Replace `regex` with `replacement` twice on `original`.

    This is used by string normalization to perform replaces on
    overlapping matches: a single pass cannot rewrite a match whose leading
    context was consumed by the previous match, so a second pass picks it up.
    """
    return regex.sub(replacement, regex.sub(replacement, original))
def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string starts with three quotation characters.
    """
    # Drop any prefix characters so the quotes are at the very start.
    raw_string = string.lstrip(STRING_PREFIX_CHARS)
    return raw_string[:3] in {'"""', "'''"}
def lines_with_leading_tabs_expanded(s: str) -> List[str]:
    """
    Splits string into lines and expands only leading tabs (following the normal
    Python rules)
    """
    lines = []
    for line in s.splitlines():
        # Find the index of the first non-whitespace character after a string of
        # whitespace that includes at least one tab
        match = FIRST_NON_WHITESPACE_RE.match(line)
        if match:
            first_non_whitespace_idx = match.start(1)

            # Only the leading whitespace is expanded; tabs after the first
            # non-whitespace character are left untouched.
            lines.append(
                line[:first_non_whitespace_idx].expandtabs()
                + line[first_non_whitespace_idx:]
            )
        else:
            lines.append(line)
    return lines
def fix_docstring(docstring: str, prefix: str) -> str:
    """Re-indent `docstring` so every continuation line starts with `prefix`.

    The common indentation of all lines after the first is removed, trailing
    whitespace is stripped, and blank lines are emitted without the prefix
    (except the last line, which always receives it).
    """
    # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    if not docstring:
        return docstring
    lines = lines_with_leading_tabs_expanded(docstring)
    # Determine minimum indentation (first line doesn't count):
    indent = sys.maxsize
    for line in lines[1:]:
        stripped = line.lstrip()
        if stripped:
            indent = min(indent, len(line) - len(stripped))
    # Remove indentation (first line is special):
    trimmed = [lines[0].strip()]
    if indent < sys.maxsize:
        # Index, within lines[1:], of the final line of the docstring.
        last_line_idx = len(lines) - 2
        for i, line in enumerate(lines[1:]):
            stripped_line = line[indent:].rstrip()
            if stripped_line or i == last_line_idx:
                trimmed.append(prefix + stripped_line)
            else:
                trimmed.append("")
    return "\n".join(trimmed)
def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        @string's prefix (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    prefix = ""
    prefix_idx = 0
    # The pre-condition guarantees a quote character follows the prefix, so
    # this scan stops before running off the end of the string.
    while string[prefix_idx] in STRING_PREFIX_CHARS:
        prefix += string[prefix_idx]
        prefix_idx += 1

    return prefix
def assert_is_leaf_string(string: str) -> None:
    """
    Checks the pre-condition that @string has the format that you would expect
    of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
    token.STRING`. A more precise description of the pre-conditions that are
    checked are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
        `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    dquote_idx = string.find('"')
    squote_idx = string.find("'")
    if -1 in [dquote_idx, squote_idx]:
        # At most one kind of quote is present: take whichever was found
        # (or -1 when neither was).
        quote_idx = max(dquote_idx, squote_idx)
    else:
        # Both kinds are present: the opening quote is the earlier one.
        quote_idx = min(squote_idx, dquote_idx)

    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    assert set(string[:quote_idx]).issubset(
        set(STRING_PREFIX_CHARS)
    ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
def normalize_string_prefix(s: str) -> str:
    """Make all string prefixes lowercase."""
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    orig_prefix = match.group(1)
    # "R" is deliberately left alone; the redundant "u"/"U" prefix is dropped.
    new_prefix = (
        orig_prefix.replace("F", "f")
        .replace("B", "b")
        .replace("U", "")
        .replace("u", "")
    )

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and "r" != new_prefix[0].lower():
        new_prefix = new_prefix[::-1]
    return f"{new_prefix}{match.group(2)}"
165 # Re(gex) does actually cache patterns internally but this still improves
166 # performance on a long list literal of strings by 5-9% since lru_cache's
167 # caching overhead is much lower.
168 @lru_cache(maxsize=64)
169 def _cached_compile(pattern: str) -> Pattern[str]:
170 return re.compile(pattern)
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.

    Returns `s` unchanged whenever switching quotes is impossible or would
    not be an improvement.
    """
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions;
                # keep the string exactly as it was instead.
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a body ending in a double quote would run into the
        # closing triple quote, so escape it.
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation."""
    text = leaf.value
    prefix = get_string_prefix(text)
    if "r" in prefix.lower():
        return  # There's no escaping in raw strings

    # NOTE(review): the branch conditions below are inferred from which named
    # group each return statement reads -- confirm against the upstream file.
    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        # An even number of backslashes means they escape each other, so what
        # follows is plain text rather than an escape sequence.
        if len(back_slashes) % 2 == 0:
            return back_slashes + groups["body"]

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        elif groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        elif groups["x"]:
            # \x
            return back_slashes + "x" + groups["x"].lower()
        else:
            assert groups["N"], f"Unexpected match: {m}"
            # \N{}
            return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)
@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return the width of a single character as it would be displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Full width characters are counted as 2, while half width characters are
    counted as 1.  Also control characters are counted as 0.
    """
    table = WIDTH_TABLE
    codepoint = ord(char)
    highest = len(table) - 1
    lowest = 0
    idx = highest // 2
    # Binary search over (start, end, width) ranges; this presumes the table
    # is sorted by codepoint and its ranges do not overlap.
    while True:
        start_codepoint, end_codepoint, width = table[idx]
        if codepoint < start_codepoint:
            highest = idx - 1
        elif codepoint > end_codepoint:
            lowest = idx + 1
        else:
            # Negative widths in the table are reported as zero.
            return 0 if width < 0 else width
        if highest < lowest:
            break
        idx = (highest + lowest) // 2
    # Codepoints not covered by any range are treated as width 1.
    return 1
def str_width(line_str: str) -> int:
    """Return the width of `line_str` as it would be displayed in a terminal
    or editor (which respects Unicode East Asian Width).

    You could utilize this function to determine, for example, if a string
    is too wide to display in a terminal or editor.
    """
    if line_str.isascii():
        # Fast path for a line consisting of only ASCII characters
        return len(line_str)
    return sum(map(char_width, line_str))
318 def count_chars_in_width(line_str: str, max_width: int) -> int:
319 """Count the number of characters in `line_str` that would fit in a
320 terminal or editor of `max_width` (which respects Unicode East Asian
324 for i, char in enumerate(line_str):
325 width = char_width(char)
326 if width + total_width > max_width: