git.madduck.net Git - etc/vim.git/blob - src/black/strings.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access, can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Add SECURITY.md (#3612)
[etc/vim.git] / src / black / strings.py
1 """
2 Simple formatting on strings. Further string formatting code is in trans.py.
3 """
4
5 import re
6 import sys
7 from functools import lru_cache
8 from typing import List, Match, Pattern
9
10 from blib2to3.pytree import Leaf
11
12 if sys.version_info < (3, 8):
13     from typing_extensions import Final
14 else:
15     from typing import Final
16
17
STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
# Splits a string token into two groups: group 1 is the (possibly empty) leading
# run of prefix characters, group 2 is everything after it. DOTALL lets `.`
# match newlines so that multi-line strings are covered by group 2 as well.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Matches a leading whitespace run that contains at least one tab and captures
# (group 1) the first non-whitespace character that follows it.
# NOTE(review): `\s*`, `\t+` and `\s*` can all consume tabs, so a long
# whitespace run that is NOT followed by a non-whitespace character can make
# this pattern backtrack heavily.
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
# Matches a run of one or more backslashes ("backslashes" group) followed by
# the body of a \uXXXX, \UXXXXXXXX, \xHH or \N{NAME} escape ("body" group).
# The hex digits / character name are exposed through the "u", "U", "x" and
# "N" groups respectively. The pattern is compiled with re.VERBOSE; the literal
# space in the \N{...} character class is still significant because it sits
# inside a character class.
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)
32
33
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply the `regex` -> `replacement` substitution to `original` two times.

    A single substitution pass cannot rewrite matches that overlap one
    another; string normalization relies on the second pass to catch those.
    """
    first_pass = regex.sub(replacement, original)
    second_pass = regex.sub(replacement, first_pass)
    return second_pass
41
42
def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string, once any leading string prefix characters have been
        skipped, starts with three double quotes or three single quotes.
    """
    return string.lstrip(STRING_PREFIX_CHARS).startswith(('"""', "'''"))
50
51
def lines_with_leading_tabs_expanded(s: str) -> List[str]:
    """
    Splits string into lines and expands only leading tabs (following the normal
    Python rules)

    Only the leading whitespace of each line is passed through
    `str.expandtabs()`; tabs that appear after the first non-whitespace
    character are preserved. Lines consisting solely of whitespace (or empty
    lines) are returned unchanged.

    Returns:
        The lines of @s (as produced by `str.splitlines()`, i.e. without line
        terminators) with their leading tabs expanded.
    """
    lines = []
    for line in s.splitlines():
        # The leading whitespace is located with `str.lstrip()` rather than a
        # regular expression: this runs in linear time, whereas a pattern made
        # of adjacent whitespace quantifiers can backtrack heavily on long
        # whitespace-only lines.
        stripped_line = line.lstrip()
        if not stripped_line or stripped_line == line:
            # Whitespace-only line, or no leading whitespace at all: nothing
            # to expand.
            lines.append(line)
        else:
            prefix_length = len(line) - len(stripped_line)
            # expandtabs() is a no-op on a prefix that contains no tab.
            lines.append(line[:prefix_length].expandtabs() + stripped_line)
    return lines
72
73
def fix_docstring(docstring: str, prefix: str) -> str:
    """Re-indent @docstring so that its continuation lines start with @prefix.

    The first line is stripped of surrounding whitespace. For the remaining
    lines, the smallest indentation found among the non-blank ones is removed,
    trailing whitespace is dropped, and @prefix is prepended. Blank lines stay
    empty, except the very last line, which always receives @prefix.

    Returns:
        The re-indented docstring, joined with "\\n"; "" for an empty input.
    """
    # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    if not docstring:
        return ""
    lines = lines_with_leading_tabs_expanded(docstring)
    continuation = lines[1:]

    # Indentation widths of every non-blank line after the first one.
    indent_widths = [
        len(line) - len(line.lstrip()) for line in continuation if line.lstrip()
    ]

    # The first line is special: it is only stripped.
    result = [lines[0].strip()]
    if indent_widths:
        common_indent = min(indent_widths)
        final_idx = len(continuation) - 1
        for idx, line in enumerate(continuation):
            content = line[common_indent:].rstrip()
            if content or idx == final_idx:
                result.append(prefix + content)
            else:
                result.append("")
    return "\n".join(result)
96
97
def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        The leading run of string prefix characters of @string
        (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    # Advance past every leading prefix character; the pre-condition ensures
    # a quote character stops the scan.
    end_of_prefix = 0
    while string[end_of_prefix] in STRING_PREFIX_CHARS:
        end_of_prefix += 1

    return string[:end_of_prefix]
115
116
def assert_is_leaf_string(string: str) -> None:
    """
    Verifies that @string looks like the `leaf.value` of some Leaf such that
    `leaf.type == token.STRING`. The exact pre-conditions that are checked
    are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
        `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    # Position of the earliest quote character of either kind (-1 if none).
    quote_positions = [
        pos for pos in (string.find("'"), string.find('"')) if pos != -1
    ]
    opening_idx = min(quote_positions) if quote_positions else -1

    # The opening quote must exist and must not be the last character.
    has_opening_quote = 0 <= opening_idx < len(string) - 1
    assert (
        has_opening_quote
    ), f"{string!r} is missing a starting quote character (' or \")."

    assert (
        string[-1] == "'" or string[-1] == '"'
    ), f"{string!r} is missing an ending quote character (' or \")."

    # Everything before the opening quote must be a prefix character.
    leading_chars = set(string[:opening_idx])
    allowed_chars = set(STRING_PREFIX_CHARS)
    assert (
        leading_chars <= allowed_chars
    ), f"{leading_chars} is NOT a subset of {allowed_chars}."
150
151
def normalize_string_prefix(s: str) -> str:
    """Normalize the prefix of the string token @s.

    "F" and "B" are lowercased, "u"/"U" are removed, and a two-character
    prefix is reordered so that its "r"/"R" comes first. The rest of the
    token is returned untouched.
    """
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    raw_prefix, remainder = match.group(1), match.group(2)

    # One pass: F -> f, B -> b, and both spellings of the u prefix deleted.
    prefix = raw_prefix.translate(str.maketrans("FB", "fb", "Uu"))

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(prefix) == 2 and prefix[0].lower() != "r":
        prefix = prefix[::-1]
    return prefix + remainder
168
169
170 # Re(gex) does actually cache patterns internally but this still improves
171 # performance on a long list literal of strings by 5-9% since lru_cache's
172 # caching overhead is much lower.
173 @lru_cache(maxsize=64)
174 def _cached_compile(pattern: str) -> Pattern[str]:
175     return re.compile(pattern)
176
177
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.

    @s is a complete string token (optional prefix, opening quote(s), body,
    closing quote(s)).

    Returns:
        @s with its quotes (and the escaping of quotes in its body) rewritten,
        or the original quoting when the string already uses the preferred
        quotes or converting would require more backslashes. Note that
        needless escapes of the other quote kind may be removed even when the
        quote style itself is kept.
    """
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        # Already uses the preferred triple quotes.
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        # Candidate conversion *away* from double quotes; the checks at the
        # end of the function only accept it when it strictly reduces the
        # number of backslashes.
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    # new_quote preceded by an even number (possibly zero) of backslashes,
    # i.e. a new_quote that is not escaped.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    # new_quote / orig_quote preceded by an odd number of backslashes, i.e. an
    # escaped quote. Group 1 is the character before the backslashes, group 2
    # the backslashes that remain once the escaping one is dropped.
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    # The text between the opening and closing quotes.
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        # Unescape the quotes that will no longer delimit the string...
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        # ...and escape the ones that would clash with the new delimiter.
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        # Collect the contents of the `{...}` replacement fields of the
        # f-string (doubled braces `{{` / `}}` are skipped).
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case:
        # a body ending in `"` would run into the closing `"""`, so the last
        # quote of the body is escaped.
        new_body = new_body[:-1] + '\\"'
    # Compare the amount of escaping before and after the conversion.
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
250
251
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation.

    Rewrites `leaf.value` in place: the hex digits of `\\x`, `\\u` and `\\U`
    escapes are lowercased and the character name of `\\N{...}` escapes is
    uppercased. Raw strings are left untouched. In bytes literals only `\\x`
    escapes are normalized, because `\\u`, `\\U` and `\\N{...}` are not escape
    sequences there and rewriting them would change the value of the literal.

    Pre-conditions:
        * assert_is_leaf_string(@leaf.value)
    """
    text = leaf.value
    prefix = get_string_prefix(text).lower()
    if "r" in prefix:
        return  # There's no escaping in raw strings

    is_bytes = "b" in prefix

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        if len(back_slashes) % 2 == 0:
            # An even number of backslashes means the backslashes escape each
            # other and the "escape" body is just plain text.
            return back_slashes + groups["body"]

        if is_bytes and not groups["x"]:
            # \u, \U and \N{} are ordinary characters in a bytes literal;
            # changing their case would change the bytes.
            return m.group(0)

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        elif groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        elif groups["x"]:
            # \x
            return back_slashes + "x" + groups["x"].lower()
        else:
            assert groups["N"], f"Unexpected match: {m}"
            # \N{}
            return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = UNICODE_ESCAPE_RE.sub(replace, text)