git.madduck.net Git - etc/vim.git/blob - src/black/strings.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access, can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Github now supports .git-blame-ignore-revs (GH-2948)
[etc/vim.git] / src / black / strings.py
1 """
2 Simple formatting on strings. Further string formatting code is in trans.py.
3 """
4
5 import re
6 import sys
7 from functools import lru_cache
8 from typing import List, Pattern
9
10 if sys.version_info < (3, 8):
11     from typing_extensions import Final
12 else:
13     from typing import Final
14
15
# Every character that may appear in a string literal's prefix, in both cases.
STRING_PREFIX_CHARS: Final = "furbFURB"
# Group 1 captures the (possibly empty) leading run of prefix characters and
# group 2 captures everything after it; DOTALL lets group 2 span newlines.
STRING_PREFIX_RE: Final = re.compile(
    rf"^([{STRING_PREFIX_CHARS}]*)(.*)$", flags=re.DOTALL
)
# Matches leading whitespace that contains at least one tab; group 1 is the
# first non-whitespace character following that whitespace.
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
21
22
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply the `regex` -> `replacement` substitution to `original` two times.

    A single pass cannot rewrite matches that overlap one another, so string
    normalization runs a second pass over the output of the first.
    """
    first_pass = regex.sub(replacement, original)
    second_pass = regex.sub(replacement, first_pass)
    return second_pass
30
31
def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string, once any leading string-prefix characters are
        removed, opens with three double quotes or three single quotes.
    """
    unprefixed = string.lstrip(STRING_PREFIX_CHARS)
    return unprefixed.startswith(('"""', "'''"))
39
40
def lines_with_leading_tabs_expanded(s: str) -> List[str]:
    """
    Break @s into lines, expanding tabs only inside each line's leading
    whitespace (following the normal Python rules); tabs that appear after the
    first non-whitespace character are left alone.
    """
    expanded_lines: List[str] = []
    for line in s.splitlines():
        # The pattern matches only when the leading whitespace holds at least
        # one tab and is followed by a non-whitespace character.
        found = FIRST_NON_WHITESPACE_RE.match(line)
        if found is None:
            expanded_lines.append(line)
            continue

        split_at = found.start(1)
        leading, rest = line[:split_at], line[split_at:]
        expanded_lines.append(leading.expandtabs() + rest)
    return expanded_lines
61
62
def fix_docstring(docstring: str, prefix: str) -> str:
    """Re-indent @docstring so that its continuation lines start with @prefix.

    The first line is stripped on both sides. Every later line loses the
    smallest indentation found among the non-blank later lines, has its
    trailing whitespace removed, and is prepended with @prefix. Later lines
    that end up empty stay empty, except for the very last line, which always
    receives @prefix. An empty @docstring yields an empty string.
    """
    # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    if not docstring:
        return ""

    lines = lines_with_leading_tabs_expanded(docstring)
    continuation = lines[1:]

    # Smallest indentation among non-blank lines (the first line is excluded);
    # stays at sys.maxsize when there is no such line.
    indent = min(
        (len(line) - len(line.lstrip()) for line in continuation if line.lstrip()),
        default=sys.maxsize,
    )

    # The first line is special: it is simply stripped.
    result = [lines[0].strip()]
    if indent < sys.maxsize:
        final_idx = len(continuation) - 1
        for idx, line in enumerate(continuation):
            content = line[indent:].rstrip()
            keep_prefix = bool(content) or idx == final_idx
            result.append((prefix + content) if keep_prefix else "")
    return "\n".join(result)
85
86
def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        The leading run of string-prefix characters of @string
        (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    # Advance past every leading prefix character; the slice up to that point
    # is the prefix.
    end = 0
    while string[end] in STRING_PREFIX_CHARS:
        end += 1

    return string[:end]
104
105
def assert_is_leaf_string(string: str) -> None:
    """
    Verifies that @string looks like the `leaf.value` of some Leaf for which
    `leaf.type == token.STRING`. The exact conditions that are checked are
    listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
        `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    quote_positions = [
        pos for pos in (string.find('"'), string.find("'")) if pos != -1
    ]
    # Index of the earliest quote of either kind, or -1 when @string has none.
    quote_idx = min(quote_positions, default=-1)

    # The opening quote must exist and must not be the final character.
    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    # Everything in front of the opening quote must be a prefix character.
    assert set(string[:quote_idx]) <= set(
        STRING_PREFIX_CHARS
    ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
139
140
def normalize_string_prefix(s: str) -> str:
    """Normalize the string prefix of @s.

    `F` and `B` are lowercased, `u`/`U` are dropped, and `r`/`R` keep their
    case. When two prefix characters remain, they are ordered so that the
    `r`/`R` comes first.
    """
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    orig_prefix, remainder = match.groups()

    # One pass: F -> f, B -> b, and delete both spellings of the "u" prefix.
    new_prefix = orig_prefix.translate(str.maketrans("FB", "fb", "Uu"))

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and new_prefix[0] not in ("r", "R"):
        new_prefix = new_prefix[::-1]
    return f"{new_prefix}{remainder}"
157
158
# The `re` module keeps its own internal cache of compiled patterns, yet going
# through lru_cache still improves performance on a long list literal of
# strings by 5-9%, because lru_cache's caching overhead is much lower.
@lru_cache(maxsize=64)
def _cached_compile(pattern: str) -> Pattern[str]:
    """Compile `pattern`, memoizing the result for up to 64 distinct patterns."""
    compiled = re.compile(pattern)
    return compiled
165
166
def normalize_string_quotes(s: str) -> str:
    """Pick the quote style for @s, favoring double quotes unless that would
    cause more escaping.

    Backslashes in front of quotes are added or removed as appropriate.
    Strings nested in f-strings are not parsed or fixed, and a string that
    already uses triple double quotes is returned unchanged.
    """
    unprefixed = s.lstrip(STRING_PREFIX_CHARS)
    opening = unprefixed[:3]
    if opening == '"""':
        return s

    if opening == "'''":
        orig_quote, new_quote = "'''", '"""'
    elif unprefixed[0] == '"':
        orig_quote, new_quote = '"', "'"
    else:
        orig_quote, new_quote = "'", '"'

    quote_start = s.find(orig_quote)
    if quote_start == -1:
        return s  # There's an internal error

    prefix = s[:quote_start]
    lowered_prefix = prefix.casefold()
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    quote_len = len(orig_quote)
    body = s[quote_start + quote_len : -quote_len]

    if "r" in lowered_prefix:
        if unescaped_new_quote.search(body):
            # The raw string holds at least one unescaped new_quote, so it
            # cannot be converted.
            return s

        # Raw strings must neither gain nor lose backslashes.
        new_body = body
    else:
        # Drop escapes that are unnecessary under the current quote style.
        without_needless_escapes = sub_twice(
            escaped_new_quote, rf"\1\2{new_quote}", body
        )
        if without_needless_escapes != body:
            # From here on, the string without unnecessary escapes is treated
            # as the original.
            body = without_needless_escapes
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        new_body = sub_twice(
            escaped_orig_quote, rf"\1\2{orig_quote}", without_needless_escapes
        )
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)

    if "f" in lowered_prefix:
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        # Do not introduce backslashes in interpolated expressions
        if any("\\" in str(m) for m in matches):
            return s

    if new_quote == '"""' and new_body.endswith('"'):
        # edge case: a trailing double quote would merge with the closing
        # triple quote, so it gets escaped.
        new_body = new_body[:-1] + '\\"'

    escapes_before = body.count("\\")
    escapes_after = new_body.count("\\")
    more_escaping = escapes_after > escapes_before
    # With an equal escape count, double quotes are preferred.
    tie_keeps_double = escapes_after == escapes_before and orig_quote == '"'
    if more_escaping or tie_keeps_double:
        return s

    return f"{prefix}{new_quote}{new_body}{new_quote}"