All patches and comments are welcome. Please squash your changes into logical
commits before using git-format-patch and git-send-email to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
"""Simple formatting on strings. Further string formatting code is in trans.py."""
import re
import sys
from functools import lru_cache
from typing import List, Match, Pattern

from blib2to3.pytree import Leaf

if sys.version_info < (3, 8):
    from typing_extensions import Final
else:
    from typing import Final
STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
# Splits a string token into (prefix, rest). DOTALL lets `rest` span newlines.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Leading whitespace that contains at least one tab; group 1 is the first
# non-whitespace character that follows it.
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
# One or more backslashes followed by the body of a \u, \U, \x or \N{...} escape.
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")"  # closes the `body` group
)
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Replace `regex` with `replacement` twice on `original`.

    This is used by string normalization to perform replaces on
    overlapping matches: a single pass cannot rewrite a match that starts
    inside text consumed by the previous match, so the substitution is
    simply applied a second time to the result of the first.

    Args:
        regex: Compiled pattern to search for.
        replacement: Replacement template, as accepted by `Pattern.sub`.
        original: Text to rewrite.

    Returns:
        `original` after two successive substitution passes.
    """
    return regex.sub(replacement, regex.sub(replacement, original))
def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string starts with three quotation characters.
    """
    # Drop any leading prefix characters so the quotes are at index 0.
    raw_string = string.lstrip(STRING_PREFIX_CHARS)
    return raw_string[:3] in {'"""', "'''"}
def lines_with_leading_tabs_expanded(s: str) -> List[str]:
    """
    Splits string into lines and expands only leading tabs (following the normal
    Python rules)

    Tabs that appear after the first non-whitespace character of a line are
    left untouched; lines whose leading whitespace contains no tab are
    returned unchanged.
    """
    lines: List[str] = []
    for line in s.splitlines():
        # Find the index of the first non-whitespace character after a string of
        # whitespace that includes at least one tab
        match = FIRST_NON_WHITESPACE_RE.match(line)
        if match:
            first_non_whitespace_idx = match.start(1)

            lines.append(
                line[:first_non_whitespace_idx].expandtabs()
                + line[first_non_whitespace_idx:]
            )
        else:
            lines.append(line)
    return lines
def fix_docstring(docstring: str, prefix: str) -> str:
    """Re-indent the body of `docstring` so continuation lines start with `prefix`.

    The common indentation of all lines after the first is removed and
    `prefix` is put in its place. Blank continuation lines become empty
    strings, except the final line, which always receives `prefix`.

    Args:
        docstring: Docstring text (without the surrounding quotes).
        prefix: Indentation to prepend to each non-blank continuation line.

    Returns:
        The re-indented text; an empty `docstring` is returned unchanged.
    """
    # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    if not docstring:
        return docstring
    lines = lines_with_leading_tabs_expanded(docstring)
    # Determine minimum indentation (first line doesn't count):
    indent = sys.maxsize
    for line in lines[1:]:
        stripped = line.lstrip()
        if stripped:
            indent = min(indent, len(line) - len(stripped))
    # Remove indentation (first line is special):
    trimmed = [lines[0].strip()]
    if indent < sys.maxsize:
        # `enumerate` below starts at lines[1], so the last line has index len - 2.
        last_line_idx = len(lines) - 2
        for i, line in enumerate(lines[1:]):
            stripped_line = line[indent:].rstrip()
            if stripped_line or i == last_line_idx:
                trimmed.append(prefix + stripped_line)
            else:
                trimmed.append("")
    return "\n".join(trimmed)
def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        @string's prefix (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    prefix = ""
    prefix_idx = 0
    # The pre-condition guarantees a quote character follows the prefix, so
    # this loop stops before running off the end of the string.
    while string[prefix_idx] in STRING_PREFIX_CHARS:
        prefix += string[prefix_idx]
        prefix_idx += 1

    return prefix
def assert_is_leaf_string(string: str) -> None:
    """
    Checks the pre-condition that @string has the format that you would expect
    of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
    token.STRING`. A more precise description of the pre-conditions that are
    checked are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
        `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    dquote_idx = string.find('"')
    squote_idx = string.find("'")
    if -1 in [dquote_idx, squote_idx]:
        # At most one kind of quote is present: take whichever was found
        # (or -1 when neither was).
        quote_idx = max(dquote_idx, squote_idx)
    else:
        # Both kinds are present: the opening quote is the earlier one.
        quote_idx = min(squote_idx, dquote_idx)

    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    assert set(string[:quote_idx]).issubset(
        set(STRING_PREFIX_CHARS)
    ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
def normalize_string_prefix(s: str) -> str:
    """Make all string prefixes lowercase.

    "F" and "B" are lowercased, "u"/"U" are removed, and "R"/"r" keep their
    case. A two-character prefix is reordered so the "r"/"R" comes first.

    Args:
        s: Source text of a string literal, including prefix and quotes.

    Returns:
        `s` with its prefix normalized; the rest of the literal is unchanged.
    """
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    orig_prefix = match.group(1)
    new_prefix = (
        orig_prefix.replace("F", "f")
        .replace("B", "b")
        .replace("U", "")
        .replace("u", "")
    )

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and "r" != new_prefix[0].lower():
        new_prefix = new_prefix[::-1]
    return f"{new_prefix}{match.group(2)}"
# Re(gex) does actually cache patterns internally but this still improves
# performance on a long list literal of strings by 5-9% since lru_cache's
# caching overhead is much lower.
@lru_cache(maxsize=64)
def _cached_compile(pattern: str) -> Pattern[str]:
    """Return the compiled form of `pattern`, memoized per pattern string."""
    return re.compile(pattern)
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.

    Args:
        s: Source text of a string literal, including prefix and quotes.

    Returns:
        The literal rewritten with the other quote style when that does not
        add escapes; otherwise the literal with its original quotes (with
        unnecessary escapes of the other quote removed, for non-raw strings).
    """
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a trailing double quote would merge with the closing
        # triple quote, so it has to be escaped.
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation.

    Hex digits of `\\x`, `\\u` and `\\U` escapes are lowercased and the name in
    `\\N{...}` is uppercased. Raw strings are left untouched. In bytes
    literals only `\\x` is rewritten, because `\\u`, `\\U` and `\\N{}` are not
    escape sequences there and changing them would change the literal's value.

    Args:
        leaf: String leaf whose `value` is rewritten in place.
    """
    text = leaf.value
    prefix = get_string_prefix(text)
    lowered_prefix = prefix.lower()
    if "r" in lowered_prefix:
        return  # There's no escaping in raw strings

    is_bytes = "b" in lowered_prefix

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        # An even number of backslashes means they escape each other and the
        # following characters are plain text, not an escape sequence.
        if len(back_slashes) % 2 == 0:
            return back_slashes + groups["body"]

        if is_bytes and not groups["x"]:
            # \u, \U and \N{} are literal text in bytes; leave them as written.
            return m.group(0)

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        elif groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        elif groups["x"]:
            # \x
            return back_slashes + "x" + groups["x"].lower()
        else:
            assert groups["N"], f"Unexpected match: {m}"
            # \N{}
            return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)