Fix merging implicit multiline strings that have inline comments (#3956)

[etc/vim.git] / src / black / trans.py
diff --git a/src/black/trans.py b/src/black/trans.py

index 2360c13f06a249e17c340a684d9f8a3e9eba02ce..a3f6467cc9e3920758ce3b5ddf00a5e2c41129b6 100644 (file)
--- a/src/black/trans.py
+++ b/src/black/trans.py
@@ -1,8 +1,8 @@
  """
  String transformers that can split and merge strings.
  """
+
  import re
-import sys
  from abc import ABC, abstractmethod
  from collections import defaultdict
  from dataclasses import dataclass
@@ -12,9 +12,11 @@ from typing import (
      ClassVar,
      Collection,
      Dict,
+    Final,
      Iterable,
      Iterator,
      List,
+    Literal,
      Optional,
      Sequence,
      Set,
@@ -23,16 +25,11 @@ from typing import (
      Union,
  )
  
-if sys.version_info < (3, 8):
-    from typing_extensions import Final, Literal
-else:
-    from typing import Literal, Final
-
  from mypy_extensions import trait
  
  from black.comments import contains_pragma_comment
  from black.lines import Line, append_leaves
-from black.mode import Feature
+from black.mode import Feature, Mode
  from black.nodes import (
      CLOSING_BRACKETS,
      OPENING_BRACKETS,
@@ -48,9 +45,11 @@ from black.nodes import (
  from black.rusty import Err, Ok, Result
  from black.strings import (
      assert_is_leaf_string,
+    count_chars_in_width,
      get_string_prefix,
      has_triple_quotes,
      normalize_string_quotes,
+    str_width,
  )
  from blib2to3.pgen2 import token
  from blib2to3.pytree import Leaf, Node
@@ -63,7 +62,7 @@ class CannotTransform(Exception):
  # types
  T = TypeVar("T")
  LN = Union[Leaf, Node]
-Transformer = Callable[[Line, Collection[Feature]], Iterator[Line]]
+Transformer = Callable[[Line, Collection[Feature], Mode], Iterator[Line]]
  Index = int
  NodeType = int
  ParserState = int
@@ -71,6 +70,8 @@ StringID = int
  TResult = Result[T, CannotTransform]  # (T)ransform Result
  TMatchResult = TResult[List[Index]]
  
+SPLIT_SAFE_CHARS = frozenset(["\u3001", "\u3002", "\uff0c"])  # East Asian stops
+
  
  def TErr(err_msg: str) -> Err[CannotTransform]:
      """(T)ransform Err
@@ -81,7 +82,9 @@ def TErr(err_msg: str) -> Err[CannotTransform]:
      return Err(cant_transform)
  
  
-def hug_power_op(line: Line, features: Collection[Feature]) -> Iterator[Line]:
+def hug_power_op(
+    line: Line, features: Collection[Feature], mode: Mode
+) -> Iterator[Line]:
      """A transformer which normalizes spacing around power operators."""
  
      # Performance optimization to avoid unnecessary Leaf clones and other ops.
@@ -199,11 +202,11 @@ class StringTransformer(ABC):
          """
          Returns:
              * Ok(string_indices) such that for each index, `line.leaves[index]`
-            is our target string if a match was able to be made. For
-            transformers that don't result in more lines (e.g. StringMerger,
-            StringParenStripper), multiple matches and transforms are done at
-            once to reduce the complexity.
-                OR
+              is our target string if a match was able to be made. For
+              transformers that don't result in more lines (e.g. StringMerger,
+              StringParenStripper), multiple matches and transforms are done at
+              once to reduce the complexity.
+              OR
              * Err(CannotTransform), if no match could be made.
          """
  
@@ -214,12 +217,12 @@ class StringTransformer(ABC):
          """
          Yields:
              * Ok(new_line) where new_line is the new transformed line.
-                OR
+              OR
              * Err(CannotTransform) if the transformation failed for some reason. The
-            `do_match(...)` template method should usually be used to reject
-            the form of the given Line, but in some cases it is difficult to
-            know whether or not a Line meets the StringTransformer's
-            requirements until the transformation is already midway.
+              `do_match(...)` template method should usually be used to reject
+              the form of the given Line, but in some cases it is difficult to
+              know whether or not a Line meets the StringTransformer's
+              requirements until the transformation is already midway.
  
          Side Effects:
              This method should NOT mutate @line directly, but it MAY mutate the
@@ -228,7 +231,9 @@ class StringTransformer(ABC):
              yield an CannotTransform after that point.)
          """
  
-    def __call__(self, line: Line, _features: Collection[Feature]) -> Iterator[Line]:
+    def __call__(
+        self, line: Line, _features: Collection[Feature], _mode: Mode
+    ) -> Iterator[Line]:
          """
          StringTransformer instances have a call signature that mirrors that of
          the Transformer type.
@@ -327,8 +332,8 @@ class CustomSplitMapMixin:
  
          Returns:
              * A list of the custom splits that are mapped to @string, if any
-            exist.
-                OR
+              exist.
+              OR
              * [], otherwise.
  
          Side Effects:
@@ -357,14 +362,14 @@ class StringMerger(StringTransformer, CustomSplitMapMixin):
      Requirements:
          (A) The line contains adjacent strings such that ALL of the validation checks
          listed in StringMerger._validate_msg(...)'s docstring pass.
-            OR
+        OR
          (B) The line contains a string which uses line continuation backslashes.
  
      Transformations:
          Depending on which of the two requirements above where met, either:
  
          (A) The string group associated with the target string is merged.
-            OR
+        OR
          (B) All line-continuation backslashes are removed from the target string.
  
      Collaborations:
@@ -385,7 +390,19 @@ class StringMerger(StringTransformer, CustomSplitMapMixin):
                  and is_valid_index(idx + 1)
                  and LL[idx + 1].type == token.STRING
              ):
-                if not is_part_of_annotation(leaf):
+                # Check whether the string group contains an inline comment;
+                # if any string in it has one, the group must not be merged.
+                contains_comment = False
+                i = idx
+                while is_valid_index(i):
+                    if LL[i].type != token.STRING:
+                        break
+                    if line.comments_after(LL[i]):
+                        contains_comment = True
+                        break
+                    i += 1
+
+                if not is_part_of_annotation(leaf) and not contains_comment:
                      string_indices.append(idx)
  
                  # Advance to the next non-STRING leaf.
@@ -938,6 +955,9 @@ class StringParenStripper(StringTransformer):
                  LL[lpar_or_rpar_idx].remove()  # Remove lpar.
                  replace_child(LL[idx], string_leaf)
                  new_line.append(string_leaf)
+                # Re-attach the old leaf's comments to the new string leaf.
+                old_comments = new_line.comments.pop(id(LL[idx]), [])
+                new_line.comments.setdefault(id(string_leaf), []).extend(old_comments)
              else:
                  LL[lpar_or_rpar_idx].remove()  # This is a rpar.
  
@@ -957,17 +977,20 @@ class BaseStringSplitter(StringTransformer):
  
      Requirements:
          * The target string value is responsible for the line going over the
-        line length limit. It follows that after all of black's other line
-        split methods have been exhausted, this line (or one of the resulting
-        lines after all line splits are performed) would still be over the
-        line_length limit unless we split this string.
-            AND
+          line length limit. It follows that after all of black's other line
+          split methods have been exhausted, this line (or one of the resulting
+          lines after all line splits are performed) would still be over the
+          line_length limit unless we split this string.
+          AND
+
          * The target string is NOT a "pointless" string (i.e. a string that has
-        no parent or siblings).
-            AND
+          no parent or siblings).
+          AND
+
          * The target string is not followed by an inline comment that appears
-        to be a pragma.
-            AND
+          to be a pragma.
+          AND
+
          * The target string is not a multiline (i.e. triple-quote) string.
      """
  
@@ -1019,7 +1042,7 @@ class BaseStringSplitter(StringTransformer):
  
          Returns:
              * Ok(None), if ALL of the requirements are met.
-                OR
+              OR
              * Err(CannotTransform), if ANY of the requirements are NOT met.
          """
          LL = line.leaves
@@ -1160,7 +1183,7 @@ class BaseStringSplitter(StringTransformer):
              # WMA4 the length of the inline comment.
              offset += len(comment_leaf.value)
  
-        max_string_length = self.line_length - offset
+        max_string_length = count_chars_in_width(str(line), self.line_length - offset)
          return max_string_length
  
      @staticmethod
@@ -1178,19 +1201,33 @@ class BaseStringSplitter(StringTransformer):
          if LL[0].type != token.STRING:
              return None
  
-        # If the string is surrounded by commas (or is the first/last child)...
-        prev_sibling = LL[0].prev_sibling
-        next_sibling = LL[0].next_sibling
-        if not prev_sibling and not next_sibling and parent_type(LL[0]) == syms.atom:
-            # If it's an atom string, we need to check the parent atom's siblings.
-            parent = LL[0].parent
-            assert parent is not None  # For type checkers.
-            prev_sibling = parent.prev_sibling
-            next_sibling = parent.next_sibling
-        if (not prev_sibling or prev_sibling.type == token.COMMA) and (
-            not next_sibling or next_sibling.type == token.COMMA
+        matching_nodes = [
+            syms.listmaker,
+            syms.dictsetmaker,
+            syms.testlist_gexp,
+        ]
+        # If the string is an immediate child of a list/set/tuple literal...
+        if (
+            parent_type(LL[0]) in matching_nodes
+            or parent_type(LL[0].parent) in matching_nodes
          ):
-            return 0
+            # And the string is surrounded by commas (or is the first/last child)...
+            prev_sibling = LL[0].prev_sibling
+            next_sibling = LL[0].next_sibling
+            if (
+                not prev_sibling
+                and not next_sibling
+                and parent_type(LL[0]) == syms.atom
+            ):
+                # If it's an atom string, we need to check the parent atom's siblings.
+                parent = LL[0].parent
+                assert parent is not None  # For type checkers.
+                prev_sibling = parent.prev_sibling
+                next_sibling = parent.next_sibling
+            if (not prev_sibling or prev_sibling.type == token.COMMA) and (
+                not next_sibling or next_sibling.type == token.COMMA
+            ):
+                return 0
  
          return None
  
@@ -1277,9 +1314,9 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
  
      Requirements:
          * The line consists ONLY of a single string (possibly prefixed by a
-        string operator [e.g. '+' or '==']), MAYBE a string trailer, and MAYBE
-        a trailing comma.
-            AND
+          string operator [e.g. '+' or '==']), MAYBE a string trailer, and MAYBE
+          a trailing comma.
+          AND
          * All of the requirements listed in BaseStringSplitter's docstring.
  
      Transformations:
@@ -1415,11 +1452,13 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
              is_valid_index(string_idx + 1) and LL[string_idx + 1].type == token.COMMA
          )
  
-        def max_last_string() -> int:
+        def max_last_string_column() -> int:
              """
              Returns:
-                The max allowed length of the string value used for the last
-                line we will construct.
+                The max allowed width of the string value used for the last
+                line we will construct.  Note that this value means the width
+                rather than the number of characters (e.g., many East Asian
+                characters expand to two columns).
              """
              result = self.line_length
              result -= line.depth * 4
@@ -1427,14 +1466,14 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
              result -= string_op_leaves_length
              return result
  
-        # --- Calculate Max Break Index (for string value)
+        # --- Calculate Max Break Width (for string value)
          # We start with the line length limit
-        max_break_idx = self.line_length
+        max_break_width = self.line_length
          # The last index of a string of length N is N-1.
-        max_break_idx -= 1
+        max_break_width -= 1
          # Leading whitespace is not present in the string value (e.g. Leaf.value).
-        max_break_idx -= line.depth * 4
-        if max_break_idx < 0:
+        max_break_width -= line.depth * 4
+        if max_break_width < 0:
              yield TErr(
                  f"Unable to split {LL[string_idx].value} at such high of a line depth:"
                  f" {line.depth}"
@@ -1447,7 +1486,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
          # line limit.
          use_custom_breakpoints = bool(
              custom_splits
-            and all(csplit.break_idx <= max_break_idx for csplit in custom_splits)
+            and all(csplit.break_idx <= max_break_width for csplit in custom_splits)
          )
  
          # Temporary storage for the remaining chunk of the string line that
@@ -1463,7 +1502,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
              if use_custom_breakpoints:
                  return len(custom_splits) > 1
              else:
-                return len(rest_value) > max_last_string()
+                return str_width(rest_value) > max_last_string_column()
  
          string_line_results: List[Ok[Line]] = []
          while more_splits_should_be_made():
@@ -1473,7 +1512,10 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
                  break_idx = csplit.break_idx
              else:
                  # Algorithmic Split (automatic)
-                max_bidx = max_break_idx - string_op_leaves_length
+                max_bidx = (
+                    count_chars_in_width(rest_value, max_break_width)
+                    - string_op_leaves_length
+                )
                  maybe_break_idx = self._get_break_idx(rest_value, max_bidx)
                  if maybe_break_idx is None:
                      # If we are unable to algorithmically determine a good split
@@ -1570,7 +1612,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
  
              # Try to fit them all on the same line with the last substring...
              if (
-                len(temp_value) <= max_last_string()
+                str_width(temp_value) <= max_last_string_column()
                  or LL[string_idx + 1].type == token.COMMA
              ):
                  last_line.append(rest_leaf)
@@ -1690,6 +1732,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
                  section of this classes' docstring would be be met by returning @i.
              """
              is_space = string[i] == " "
+            is_split_safe = is_valid_index(i - 1) and string[i - 1] in SPLIT_SAFE_CHARS
  
              is_not_escaped = True
              j = i - 1
@@ -1702,7 +1745,7 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
                  and len(string[:i]) >= self.MIN_SUBSTR_SIZE
              )
              return (
-                is_space
+                (is_space or is_split_safe)
                  and is_not_escaped
                  and is_big_enough
                  and not breaks_unsplittable_expression(i)
@@ -1780,25 +1823,26 @@ class StringParenWrapper(BaseStringSplitter, CustomSplitMapMixin):
          addition to the requirements listed below:
  
          * The line is a return/yield statement, which returns/yields a string.
-            OR
+          OR
          * The line is part of a ternary expression (e.g. `x = y if cond else
-        z`) such that the line starts with `else <string>`, where <string> is
-        some string.
-            OR
+          z`) such that the line starts with `else <string>`, where <string> is
+          some string.
+          OR
          * The line is an assert statement, which ends with a string.
-            OR
+          OR
          * The line is an assignment statement (e.g. `x = <string>` or `x +=
-        <string>`) such that the variable is being assigned the value of some
-        string.
-            OR
+          <string>`) such that the variable is being assigned the value of some
+          string.
+          OR
          * The line is a dictionary key assignment where some valid key is being
-        assigned the value of some string.
-            OR
+          assigned the value of some string.
+          OR
          * The line is an lambda expression and the value is a string.
-            OR
+          OR
          * The line starts with an "atom" string that prefers to be wrapped in
-        parens. It's preferred to be wrapped when the string is surrounded by
-        commas (or is the first/last child).
+          parens. It's preferred to be wrapped when it is an immediate child of
+          a list/set/tuple literal, AND the string is surrounded by commas (or is
+          the first/last child).
  
      Transformations:
          The chosen string is wrapped in parentheses and then split at the LPAR.
@@ -1847,11 +1891,13 @@ class StringParenWrapper(BaseStringSplitter, CustomSplitMapMixin):
  
          if string_idx is not None:
              string_value = line.leaves[string_idx].value
-            # If the string has no spaces...
-            if " " not in string_value:
+            # If the string has neither spaces nor East Asian stops...
+            if not any(
+                char == " " or char in SPLIT_SAFE_CHARS for char in string_value
+            ):
                  # And will still violate the line length limit when split...
-                max_string_length = self.line_length - ((line.depth + 1) * 4)
-                if len(string_value) > max_string_length:
+                max_string_width = self.line_length - ((line.depth + 1) * 4)
+                if str_width(string_value) > max_string_width:
                      # And has no associated custom splits...
                      if not self.has_custom_splits(string_value):
                          # Then we should NOT put this string on its own line.
@@ -2242,7 +2288,7 @@ class StringParser:
          Returns:
              The index directly after the last leaf which is apart of the string
              trailer, if a "trailer" exists.
-                OR
+            OR
              @string_idx + 1, if no string "trailer" exists.
          """
          assert leaves[string_idx].type == token.STRING
@@ -2256,11 +2302,11 @@ class StringParser:
          """
          Pre-conditions:
              * On the first call to this function, @leaf MUST be the leaf that
-            was directly after the string leaf in question (e.g. if our target
-            string is `line.leaves[i]` then the first call to this method must
-            be `line.leaves[i + 1]`).
+              was directly after the string leaf in question (e.g. if our target
+              string is `line.leaves[i]` then the first call to this method must
+              be `line.leaves[i + 1]`).
              * On the next call to this function, the leaf parameter passed in
-            MUST be the leaf directly following @leaf.
+              MUST be the leaf directly following @leaf.
  
          Returns:
              True iff @leaf is apart of the string's trailer.