git.madduck.net Git - etc/vim.git/blobdiff - src/black/trans.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes into logical commits before using git-format-patch and git-send-email to send them to patches@git.madduck.net. If you read over the Git project's submission guidelines and adhere to them, I'd be especially grateful.

SSH access, as well as push access, can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Fix merging implicit multiline strings that have inline comments (#3956)
[etc/vim.git] / src / black / trans.py
index 2360c13f06a249e17c340a684d9f8a3e9eba02ce..a3f6467cc9e3920758ce3b5ddf00a5e2c41129b6 100644 (file)
@@ -1,8 +1,8 @@
 """
 String transformers that can split and merge strings.
 """
+
 import re
-import sys
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from dataclasses import dataclass
@@ -12,9 +12,11 @@ from typing import (
     ClassVar,
     Collection,
     Dict,
+    Final,
     Iterable,
     Iterator,
     List,
+    Literal,
     Optional,
     Sequence,
     Set,
@@ -23,16 +25,11 @@ from typing import (
     Union,
 )
 
-if sys.version_info < (3, 8):
-    from typing_extensions import Final, Literal
-else:
-    from typing import Literal, Final
-
 from mypy_extensions import trait
 
 from black.comments import contains_pragma_comment
 from black.lines import Line, append_leaves
-from black.mode import Feature
+from black.mode import Feature, Mode
 from black.nodes import (
     CLOSING_BRACKETS,
     OPENING_BRACKETS,
@@ -48,9 +45,11 @@ from black.nodes import (
 from black.rusty import Err, Ok, Result
 from black.strings import (
     assert_is_leaf_string,
+    count_chars_in_width,
     get_string_prefix,
     has_triple_quotes,
     normalize_string_quotes,
+    str_width,
 )
 from blib2to3.pgen2 import token
 from blib2to3.pytree import Leaf, Node
@@ -63,7 +62,7 @@ class CannotTransform(Exception):
 # types
 T = TypeVar("T")
 LN = Union[Leaf, Node]
-Transformer = Callable[[Line, Collection[Feature]], Iterator[Line]]
+Transformer = Callable[[Line, Collection[Feature], Mode], Iterator[Line]]
 Index = int
 NodeType = int
 ParserState = int
@@ -71,6 +70,8 @@ StringID = int
 TResult = Result[T, CannotTransform]  # (T)ransform Result
 TMatchResult = TResult[List[Index]]
 
+SPLIT_SAFE_CHARS = frozenset(["\u3001", "\u3002", "\uff0c"])  # East Asian stops
+
 
 def TErr(err_msg: str) -> Err[CannotTransform]:
     """(T)ransform Err
@@ -81,7 +82,9 @@ def TErr(err_msg: str) -> Err[CannotTransform]:
     return Err(cant_transform)
 
 
-def hug_power_op(line: Line, features: Collection[Feature]) -> Iterator[Line]:
+def hug_power_op(
+    line: Line, features: Collection[Feature], mode: Mode
+) -> Iterator[Line]:
     """A transformer which normalizes spacing around power operators."""
 
     # Performance optimization to avoid unnecessary Leaf clones and other ops.
@@ -199,11 +202,11 @@ class StringTransformer(ABC):
         """
         Returns:
             * Ok(string_indices) such that for each index, `line.leaves[index]`
-            is our target string if a match was able to be made. For
-            transformers that don't result in more lines (e.g. StringMerger,
-            StringParenStripper), multiple matches and transforms are done at
-            once to reduce the complexity.
-                OR
+              is our target string if a match was able to be made. For
+              transformers that don't result in more lines (e.g. StringMerger,
+              StringParenStripper), multiple matches and transforms are done at
+              once to reduce the complexity.
+              OR
             * Err(CannotTransform), if no match could be made.
         """
 
@@ -214,12 +217,12 @@ class StringTransformer(ABC):
         """
         Yields:
             * Ok(new_line) where new_line is the new transformed line.
-                OR
+              OR
             * Err(CannotTransform) if the transformation failed for some reason. The
-            `do_match(...)` template method should usually be used to reject
-            the form of the given Line, but in some cases it is difficult to
-            know whether or not a Line meets the StringTransformer's
-            requirements until the transformation is already midway.
+              `do_match(...)` template method should usually be used to reject
+              the form of the given Line, but in some cases it is difficult to
+              know whether or not a Line meets the StringTransformer's
+              requirements until the transformation is already midway.
 
         Side Effects:
             This method should NOT mutate @line directly, but it MAY mutate the
@@ -228,7 +231,9 @@ class StringTransformer(ABC):
             yield an CannotTransform after that point.)
         """
 
-    def __call__(self, line: Line, _features: Collection[Feature]) -> Iterator[Line]:
+    def __call__(
+        self, line: Line, _features: Collection[Feature], _mode: Mode
+    ) -> Iterator[Line]:
         """
         StringTransformer instances have a call signature that mirrors that of
         the Transformer type.
@@ -327,8 +332,8 @@ class CustomSplitMapMixin:
 
         Returns:
             * A list of the custom splits that are mapped to @string, if any
-            exist.
-                OR
+              exist.
+              OR
             * [], otherwise.
 
         Side Effects:
@@ -357,14 +362,14 @@ class StringMerger(StringTransformer, CustomSplitMapMixin):
     Requirements:
         (A) The line contains adjacent strings such that ALL of the validation checks
         listed in StringMerger._validate_msg(...)'s docstring pass.
-            OR
+        OR
         (B) The line contains a string which uses line continuation backslashes.
 
     Transformations:
         Depending on which of the two requirements above where met, either:
 
         (A) The string group associated with the target string is merged.
-            OR
+        OR
         (B) All line-continuation backslashes are removed from the target string.
 
     Collaborations:
@@ -385,7 +390,19 @@ class StringMerger(StringTransformer, CustomSplitMapMixin):
                 and is_valid_index(idx + 1)
                 and LL[idx + 1].type == token.STRING
             ):
-                if not is_part_of_annotation(leaf):
+                # Let's check if the string group contains an inline comment
+                # If we have a comment inline, we don't merge the strings
+                contains_comment = False
+                i = idx
+                while is_valid_index(i):
+                    if LL[i].type != token.STRING:
+                        break
+                    if line.comments_after(LL[i]):
+                        contains_comment = True
+                        break
+                    i += 1
+
+                if not is_part_of_annotation(leaf) and not contains_comment:
                     string_indices.append(idx)
 
                 # Advance to the next non-STRING leaf.
@@ -938,6 +955,9 @@ class StringParenStripper(StringTransformer):
                 LL[lpar_or_rpar_idx].remove()  # Remove lpar.
                 replace_child(LL[idx], string_leaf)
                 new_line.append(string_leaf)
+                # replace comments
+                old_comments = new_line.comments.pop(id(LL[idx]), [])
+                new_line.comments.setdefault(id(string_leaf), []).extend(old_comments)
             else:
                 LL[lpar_or_rpar_idx].remove()  # This is a rpar.
 
@@ -957,17 +977,20 @@ class BaseStringSplitter(StringTransformer):
 
     Requirements:
         * The target string value is responsible for the line going over the
-        line length limit. It follows that after all of black's other line
-        split methods have been exhausted, this line (or one of the resulting
-        lines after all line splits are performed) would still be over the
-        line_length limit unless we split this string.
-            AND
+          line length limit. It follows that after all of black's other line
+          split methods have been exhausted, this line (or one of the resulting
+          lines after all line splits are performed) would still be over the
+          line_length limit unless we split this string.
+          AND
+
         * The target string is NOT a "pointless" string (i.e. a string that has
-        no parent or siblings).
-            AND
+          no parent or siblings).
+          AND
+
         * The target string is not followed by an inline comment that appears
-        to be a pragma.
-            AND
+          to be a pragma.
+          AND
+
         * The target string is not a multiline (i.e. triple-quote) string.
     """
 
@@ -1019,7 +1042,7 @@ class BaseStringSplitter(StringTransformer):
 
         Returns:
             * Ok(None), if ALL of the requirements are met.
-                OR
+              OR
             * Err(CannotTransform), if ANY of the requirements are NOT met.
         """
         LL = line.leaves
@@ -1160,7 +1183,7 @@ class BaseStringSplitter(StringTransformer):
             # WMA4 the length of the inline comment.
             offset += len(comment_leaf.value)
 
-        max_string_length = self.line_length - offset
+        max_string_length = count_chars_in_width(str(line), self.line_length - offset)
         return max_string_length
 
     @staticmethod
@@ -1178,19 +1201,33 @@ class BaseStringSplitter(StringTransformer):
         if LL[0].type != token.STRING:
             return None
 
-        # If the string is surrounded by commas (or is the first/last child)...
-        prev_sibling = LL[0].prev_sibling
-        next_sibling = LL[0].next_sibling
-        if not prev_sibling and not next_sibling and parent_type(LL[0]) == syms.atom:
-            # If it's an atom string, we need to check the parent atom's siblings.
-            parent = LL[0].parent
-            assert parent is not None  # For type checkers.
-            prev_sibling = parent.prev_sibling
-            next_sibling = parent.next_sibling
-        if (not prev_sibling or prev_sibling.type == token.COMMA) and (
-            not next_sibling or next_sibling.type == token.COMMA
+        matching_nodes = [
+            syms.listmaker,
+            syms.dictsetmaker,
+            syms.testlist_gexp,
+        ]
+        # If the string is an immediate child of a list/set/tuple literal...
+        if (
+            parent_type(LL[0]) in matching_nodes
+            or parent_type(LL[0].parent) in matching_nodes
         ):
-            return 0
+            # And the string is surrounded by commas (or is the first/last child)...
+            prev_sibling = LL[0].prev_sibling
+            next_sibling = LL[0].next_sibling
+            if (
+                not prev_sibling
+                and not next_sibling
+                and parent_type(LL[0]) == syms.atom
+            ):
+                # If it's an atom string, we need to check the parent atom's siblings.
+                parent = LL[0].parent
+                assert parent is not None  # For type checkers.
+                prev_sibling = parent.prev_sibling
+                next_sibling = parent.next_sibling
+            if (not prev_sibling or prev_sibling.type == token.COMMA) and (
+                not next_sibling or next_sibling.type == token.COMMA
+            ):
+                return 0
 
         return None
 
@@ -1277,9 +1314,9 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
 
     Requirements:
         * The line consists ONLY of a single string (possibly prefixed by a
-        string operator [e.g. '+' or '==']), MAYBE a string trailer, and MAYBE
-        a trailing comma.
-            AND
+          string operator [e.g. '+' or '==']), MAYBE a string trailer, and MAYBE
+          a trailing comma.
+          AND
         * All of the requirements listed in BaseStringSplitter's docstring.
 
     Transformations:
@@ -1415,11 +1452,13 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
             is_valid_index(string_idx + 1) and LL[string_idx + 1].type == token.COMMA
         )
 
-        def max_last_string() -> int:
+        def max_last_string_column() -> int:
             """
             Returns:
-                The max allowed length of the string value used for the last
-                line we will construct.
+                The max allowed width of the string value used for the last
+                line we will construct.  Note that this value means the width
+                rather than the number of characters (e.g., many East Asian
+                characters expand to two columns).
             """
             result = self.line_length
             result -= line.depth * 4
@@ -1427,14 +1466,14 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
             result -= string_op_leaves_length
             return result
 
-        # --- Calculate Max Break Index (for string value)
+        # --- Calculate Max Break Width (for string value)
         # We start with the line length limit
-        max_break_idx = self.line_length
+        max_break_width = self.line_length
         # The last index of a string of length N is N-1.
-        max_break_idx -= 1
+        max_break_width -= 1
         # Leading whitespace is not present in the string value (e.g. Leaf.value).
-        max_break_idx -= line.depth * 4
-        if max_break_idx < 0:
+        max_break_width -= line.depth * 4
+        if max_break_width < 0:
             yield TErr(
                 f"Unable to split {LL[string_idx].value} at such high of a line depth:"
                 f" {line.depth}"
@@ -1447,7 +1486,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
         # line limit.
         use_custom_breakpoints = bool(
             custom_splits
-            and all(csplit.break_idx <= max_break_idx for csplit in custom_splits)
+            and all(csplit.break_idx <= max_break_width for csplit in custom_splits)
         )
 
         # Temporary storage for the remaining chunk of the string line that
@@ -1463,7 +1502,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
             if use_custom_breakpoints:
                 return len(custom_splits) > 1
             else:
-                return len(rest_value) > max_last_string()
+                return str_width(rest_value) > max_last_string_column()
 
         string_line_results: List[Ok[Line]] = []
         while more_splits_should_be_made():
@@ -1473,7 +1512,10 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
                 break_idx = csplit.break_idx
             else:
                 # Algorithmic Split (automatic)
-                max_bidx = max_break_idx - string_op_leaves_length
+                max_bidx = (
+                    count_chars_in_width(rest_value, max_break_width)
+                    - string_op_leaves_length
+                )
                 maybe_break_idx = self._get_break_idx(rest_value, max_bidx)
                 if maybe_break_idx is None:
                     # If we are unable to algorithmically determine a good split
@@ -1570,7 +1612,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
 
             # Try to fit them all on the same line with the last substring...
             if (
-                len(temp_value) <= max_last_string()
+                str_width(temp_value) <= max_last_string_column()
                 or LL[string_idx + 1].type == token.COMMA
             ):
                 last_line.append(rest_leaf)
@@ -1690,6 +1732,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
                 section of this classes' docstring would be be met by returning @i.
             """
             is_space = string[i] == " "
+            is_split_safe = is_valid_index(i - 1) and string[i - 1] in SPLIT_SAFE_CHARS
 
             is_not_escaped = True
             j = i - 1
@@ -1702,7 +1745,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
                 and len(string[:i]) >= self.MIN_SUBSTR_SIZE
             )
             return (
-                is_space
+                (is_space or is_split_safe)
                 and is_not_escaped
                 and is_big_enough
                 and not breaks_unsplittable_expression(i)
@@ -1780,25 +1823,26 @@ class StringParenWrapper(BaseStringSplitter, CustomSplitMapMixin):
         addition to the requirements listed below:
 
         * The line is a return/yield statement, which returns/yields a string.
-            OR
+          OR
         * The line is part of a ternary expression (e.g. `x = y if cond else
-        z`) such that the line starts with `else <string>`, where <string> is
-        some string.
-            OR
+          z`) such that the line starts with `else <string>`, where <string> is
+          some string.
+          OR
         * The line is an assert statement, which ends with a string.
-            OR
+          OR
         * The line is an assignment statement (e.g. `x = <string>` or `x +=
-        <string>`) such that the variable is being assigned the value of some
-        string.
-            OR
+          <string>`) such that the variable is being assigned the value of some
+          string.
+          OR
         * The line is a dictionary key assignment where some valid key is being
-        assigned the value of some string.
-            OR
+          assigned the value of some string.
+          OR
         * The line is an lambda expression and the value is a string.
-            OR
+          OR
         * The line starts with an "atom" string that prefers to be wrapped in
-        parens. It's preferred to be wrapped when the string is surrounded by
-        commas (or is the first/last child).
+          parens. It's preferred to be wrapped when it's is an immediate child of
+          a list/set/tuple literal, AND the string is surrounded by commas (or is
+          the first/last child).
 
     Transformations:
         The chosen string is wrapped in parentheses and then split at the LPAR.
@@ -1847,11 +1891,13 @@ class StringParenWrapper(BaseStringSplitter, CustomSplitMapMixin):
 
         if string_idx is not None:
             string_value = line.leaves[string_idx].value
-            # If the string has no spaces...
-            if " " not in string_value:
+            # If the string has neither spaces nor East Asian stops...
+            if not any(
+                char == " " or char in SPLIT_SAFE_CHARS for char in string_value
+            ):
                 # And will still violate the line length limit when split...
-                max_string_length = self.line_length - ((line.depth + 1) * 4)
-                if len(string_value) > max_string_length:
+                max_string_width = self.line_length - ((line.depth + 1) * 4)
+                if str_width(string_value) > max_string_width:
                     # And has no associated custom splits...
                     if not self.has_custom_splits(string_value):
                         # Then we should NOT put this string on its own line.
@@ -2242,7 +2288,7 @@ class StringParser:
         Returns:
             The index directly after the last leaf which is apart of the string
             trailer, if a "trailer" exists.
-                OR
+            OR
             @string_idx + 1, if no string "trailer" exists.
         """
         assert leaves[string_idx].type == token.STRING
@@ -2256,11 +2302,11 @@ class StringParser:
         """
         Pre-conditions:
             * On the first call to this function, @leaf MUST be the leaf that
-            was directly after the string leaf in question (e.g. if our target
-            string is `line.leaves[i]` then the first call to this method must
-            be `line.leaves[i + 1]`).
+              was directly after the string leaf in question (e.g. if our target
+              string is `line.leaves[i]` then the first call to this method must
+              be `line.leaves[i + 1]`).
             * On the next call to this function, the leaf parameter passed in
-            MUST be the leaf directly following @leaf.
+              MUST be the leaf directly following @leaf.
 
         Returns:
             True iff @leaf is apart of the string's trailer.