git.madduck.net Git - etc/vim.git/blob - src/black/strings.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access, can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Assignment to env var in Jupyter Notebook doesn't round-trip (#2642)
[etc/vim.git] / src / black / strings.py
1 """
2 Simple formatting on strings. Further string formatting code is in trans.py.
3 """
4
5 import regex as re
6 import sys
7 from functools import lru_cache
8 from typing import List, Pattern
9
10 if sys.version_info < (3, 8):
11     from typing_extensions import Final
12 else:
13     from typing import Final
14
15
# Every character that may appear in a string literal's prefix, in both cases.
STRING_PREFIX_CHARS: Final = "furbFURB"
# Splits a string into (run of prefix characters, everything after it); DOTALL
# lets the second group span newlines.
STRING_PREFIX_RE: Final = re.compile(
    rf"^([{STRING_PREFIX_CHARS}]*)(.*)$", flags=re.DOTALL
)
# Matches leading whitespace that contains at least one tab; group 1 is the
# first non-whitespace character following it.
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
21
22
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply the `regex` -> `replacement` substitution to `original` two times.

    A single pass cannot rewrite matches that overlap one another, so string
    normalization runs the same substitution again on the first pass's output.
    """
    first_pass = regex.sub(replacement, original)
    second_pass = regex.sub(replacement, first_pass)
    return second_pass
30
31
def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string, once its leading prefix characters are stripped,
        opens with three double quotes or three single quotes.
    """
    unprefixed = string.lstrip(STRING_PREFIX_CHARS)
    return unprefixed.startswith(('"""', "'''"))
39
40
def lines_with_leading_tabs_expanded(s: str) -> List[str]:
    """
    Break @s into lines, expanding tabs only inside each line's leading
    whitespace (following the normal Python rules). Tabs that occur after the
    first non-whitespace character are kept as they are.
    """
    expanded_lines = []
    for raw_line in s.splitlines():
        tab_indent = FIRST_NON_WHITESPACE_RE.match(raw_line)
        if tab_indent is None:
            # No tab in the leading whitespace (or nothing follows it): keep
            # the line verbatim.
            expanded_lines.append(raw_line)
            continue

        # Offset of the first non-whitespace character; everything before it
        # is indentation that includes at least one tab.
        content_start = tab_indent.start(1)
        indentation = raw_line[:content_start]
        content = raw_line[content_start:]
        expanded_lines.append(indentation.expandtabs() + content)
    return expanded_lines
61
62
def fix_docstring(docstring: str, prefix: str) -> str:
    """Re-indent @docstring so that its continuation lines start with @prefix.

    The first line is stripped of surrounding whitespace. Every later line has
    the indentation common to all non-blank later lines removed and its
    trailing whitespace stripped, and is then given @prefix. Blank lines
    become empty, except the very last line, which always receives @prefix.
    When every line after the first is blank, only the first line is returned.
    """
    # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    if not docstring:
        return ""
    lines = lines_with_leading_tabs_expanded(docstring)
    first_line, rest = lines[0], lines[1:]

    # Indentation widths of the non-blank lines (first line doesn't count).
    indents = [len(line) - len(line.lstrip()) for line in rest if line.lstrip()]

    # The first line is special: it is only stripped, never prefixed.
    result = [first_line.strip()]
    if indents:
        indent = min(indents)
        final_idx = len(rest) - 1
        for idx, line in enumerate(rest):
            dedented = line[indent:].rstrip()
            keep_prefix = bool(dedented) or idx == final_idx
            result.append((prefix + dedented) if keep_prefix else "")
    return "\n".join(result)
85
86
def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        The leading run of prefix characters of @string (e.g. '', 'r', 'f',
        or 'rf').
    """
    assert_is_leaf_string(string)

    # Walk forward until the first character that is not a prefix character;
    # the slice before that position is the prefix.
    end_of_prefix = 0
    while string[end_of_prefix] in STRING_PREFIX_CHARS:
        end_of_prefix += 1

    return string[:end_of_prefix]
104
105
def assert_is_leaf_string(string: str) -> None:
    """
    Verifies that @string has the shape expected of `leaf.value` for a Leaf
    with `leaf.type == token.STRING`. The exact pre-conditions that are
    checked are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
        `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    found_quotes = [
        pos for pos in (string.find('"'), string.find("'")) if pos != -1
    ]
    # Position of the earliest quote of either kind, or -1 when there is none.
    quote_idx = min(found_quotes, default=-1)

    # The opening quote must exist and must not be the final character.
    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string.endswith(
        ("'", '"')
    ), f"{string!r} is missing an ending quote character (' or \")."
    # Everything before the opening quote must consist of prefix characters.
    assert set(string[:quote_idx]) <= set(
        STRING_PREFIX_CHARS
    ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
139
140
def normalize_string_prefix(s: str, remove_u_prefix: bool = False) -> str:
    """Lowercase the F, B and U characters in the string prefix of @s.

    Any other prefix character (including an uppercase R) is left as written.
    If remove_u_prefix is given, also removes any u prefix from the string.
    """
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    orig_prefix, remainder = match.groups()
    new_prefix = orig_prefix.translate(str.maketrans("FBU", "fbu"))
    if remove_u_prefix:
        new_prefix = new_prefix.replace("u", "")
    return new_prefix + remainder
153
154
# Re(gex) keeps its own internal cache of compiled patterns, yet going through
# lru_cache still improves performance on a long list literal of strings by
# 5-9% because lru_cache's caching overhead is much lower.
@lru_cache(maxsize=64)
def _cached_compile(pattern: str) -> re.Pattern:
    """Compile @pattern, reusing the compiled object for repeated patterns."""
    compiled = re.compile(pattern)
    return compiled
161
162
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.

    Args:
        s: the source text of a string literal, including any prefix
            characters and its surrounding quotes.

    Returns:
        The literal rewritten with the other quote style, or @s (possibly with
        unnecessary escapes of the other quote removed) when switching is not
        possible or would not be an improvement.
    """
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        # Already uses the preferred triple quote: nothing to do.
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    # A new_quote preceded by an even number (possibly zero) of backslashes.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    # A new_quote / orig_quote preceded by an odd number of backslashes.
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    # Text between the opening and the closing quote.
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        # Build the candidate body for the new quote style: drop the escapes
        # in front of orig_quote, then escape every bare new_quote.
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        # Collect the contents of each single-brace {...} replacement field
        # of the candidate body.
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a body ending in a double quote would sit directly
        # against the closing triple quote, so that final quote is escaped.
        new_body = new_body[:-1] + '\\"'
    # Compare the number of backslashes before and after the conversion.
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"