Improve multiline string handling (#1879)

author Aneesh Agrawal <aneeshusa@gmail.com>

Tue, 7 Mar 2023 19:52:19 +0000 (14:52 -0500)

committer GitHub <noreply@github.com>

Tue, 7 Mar 2023 19:52:19 +0000 (11:52 -0800)
author Aneesh Agrawal <aneeshusa@gmail.com>
Tue, 7 Mar 2023 19:52:19 +0000 (14:52 -0500)
committer GitHub <noreply@github.com>
Tue, 7 Mar 2023 19:52:19 +0000 (11:52 -0800)
diff --git a/CHANGES.md b/CHANGES.md

index a8b556cb7e8692c241f4566e7e5c4bf08d5cafd5..53682df2b398f5db1cc5f32ec027be4ff8e9734c 100644 (file)
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -133,6 +133,7 @@ versions separately.
    code. Implicitly concatenated f-strings with different quotes can now be merged or
    quote-normalized by changing the quotes used in expressions. (#3509)
  - Fix crash on `await (yield)` when Black is compiled with mypyc (#3533)
+- Improve handling of multiline strings by changing line split behavior (#1879)
  
  ### Configuration
  
diff --git a/docs/the_black_code_style/future_style.md b/docs/the_black_code_style/future_style.md

index b2cdca2590dab86ca015c2ad01a5324f29a8033c..96abc99ef41335cefd7d88fab8245156f0d2ad64 100644 (file)
--- a/docs/the_black_code_style/future_style.md
+++ b/docs/the_black_code_style/future_style.md
@@ -111,3 +111,51 @@ my_dict = {
      "another key": short_value,
  }
  ```
+
+### Improved multiline string handling
+
+_Black_ is smarter when formatting multiline strings, especially in function arguments,
+to avoid introducing extra line breaks. Previously, it would always consider multiline
+strings as not fitting on a single line. With this new feature, _Black_ looks at the
+context around the multiline string to decide if it should be inlined or split to a
+separate line. For example, when a multiline string is passed to a function, _Black_
+will only split the multiline string if a line is too long or if multiple arguments are
+being passed.
+
+Concretely, _Black_ will reformat
+
+```python
+textwrap.dedent(
+    """\
+    This is a
+    multiline string
+"""
+)
+```
+
+to:
+
+```python
+textwrap.dedent("""\
+    This is a
+    multiline string
+""")
+```
+
+And:
+
+```python
+MULTILINE = """
+foobar
+""".replace(
+    "\n", ""
+)
+```
+
+to:
+
+```python
+MULTILINE = """
+foobar
+""".replace("\n", "")
+```
diff --git a/src/black/linegen.py b/src/black/linegen.py

index f7d3655e962bbb5cbdf2d27f894216aafd351dac..95d5583c5f53fd1ab97287c78a41cc90d3d8c33d 100644 (file)
--- a/src/black/linegen.py
+++ b/src/black/linegen.py
@@ -2,7 +2,7 @@
  Generating lines of code.
  """
  import sys
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
  from enum import Enum, auto
  from functools import partial, wraps
  from typing import Collection, Iterator, List, Optional, Set, Union, cast
@@ -505,7 +505,7 @@ def transform_line(
          and not line.should_split_rhs
          and not line.magic_trailing_comma
          and (
-            is_line_short_enough(line, line_length=mode.line_length, line_str=line_str)
+            is_line_short_enough(line, mode=mode, line_str=line_str)
              or line.contains_unsplittable_type_ignore()
          )
          and not (line.inside_brackets and line.contains_standalone_comments())
@@ -529,14 +529,12 @@ def transform_line(
              bracket pair instead.
              """
              for omit in generate_trailers_to_omit(line, mode.line_length):
-                lines = list(
-                    right_hand_split(line, mode.line_length, features, omit=omit)
-                )
+                lines = list(right_hand_split(line, mode, features, omit=omit))
                  # Note: this check is only able to figure out if the first line of the
                  # *current* transformation fits in the line length.  This is true only
                  # for simple cases.  All others require running more transforms via
                  # `transform_line()`.  This check doesn't know if those would succeed.
-                if is_line_short_enough(lines[0], line_length=mode.line_length):
+                if is_line_short_enough(lines[0], mode=mode):
                      yield from lines
                      return
  
@@ -544,9 +542,7 @@ def transform_line(
              # This mostly happens to multiline strings that are by definition
              # reported as not fitting a single line, as well as lines that contain
              # trailing commas (those have to be exploded).
-            yield from right_hand_split(
-                line, line_length=mode.line_length, features=features
-            )
+            yield from right_hand_split(line, mode, features=features)
  
          # HACK: nested functions (like _rhs) compiled by mypyc don't retain their
          # __name__ attribute which is needed in `run_transformer` further down.
@@ -664,7 +660,7 @@ class _RHSResult:
  
  def right_hand_split(
      line: Line,
-    line_length: int,
+    mode: Mode,
      features: Collection[Feature] = (),
      omit: Collection[LeafID] = (),
  ) -> Iterator[Line]:
@@ -678,7 +674,7 @@ def right_hand_split(
      """
      rhs_result = _first_right_hand_split(line, omit=omit)
      yield from _maybe_split_omitting_optional_parens(
-        rhs_result, line, line_length, features=features, omit=omit
+        rhs_result, line, mode, features=features, omit=omit
      )
  
  
@@ -733,7 +729,7 @@ def _first_right_hand_split(
  def _maybe_split_omitting_optional_parens(
      rhs: _RHSResult,
      line: Line,
-    line_length: int,
+    mode: Mode,
      features: Collection[Feature] = (),
      omit: Collection[LeafID] = (),
  ) -> Iterator[Line]:
@@ -751,7 +747,7 @@ def _maybe_split_omitting_optional_parens(
          # there are no standalone comments in the body
          and not rhs.body.contains_standalone_comments(0)
          # and we can actually remove the parens
-        and can_omit_invisible_parens(rhs.body, line_length)
+        and can_omit_invisible_parens(rhs.body, mode.line_length)
      ):
          omit = {id(rhs.closing_bracket), *omit}
          try:
@@ -766,23 +762,24 @@ def _maybe_split_omitting_optional_parens(
                  and any(leaf.type in BRACKETS for leaf in rhs.head.leaves[:-1])
                  # the left side of assignment is short enough (the -1 is for the ending
                  # optional paren)
-                and is_line_short_enough(rhs.head, line_length=line_length - 1)
+                and is_line_short_enough(
+                    rhs.head, mode=replace(mode, line_length=mode.line_length - 1)
+                )
                  # the left side of assignment won't explode further because of magic
                  # trailing comma
                  and rhs.head.magic_trailing_comma is None
                  # the split by omitting optional parens isn't preferred by some other
                  # reason
-                and not _prefer_split_rhs_oop(rhs_oop, line_length=line_length)
+                and not _prefer_split_rhs_oop(rhs_oop, mode)
              ):
                  yield from _maybe_split_omitting_optional_parens(
-                    rhs_oop, line, line_length, features=features, omit=omit
+                    rhs_oop, line, mode, features=features, omit=omit
                  )
                  return
  
          except CannotSplit as e:
              if not (
-                can_be_split(rhs.body)
-                or is_line_short_enough(rhs.body, line_length=line_length)
+                can_be_split(rhs.body) or is_line_short_enough(rhs.body, mode=mode)
              ):
                  raise CannotSplit(
                      "Splitting failed, body is still too long and can't be split."
@@ -806,7 +803,7 @@ def _maybe_split_omitting_optional_parens(
              yield result
  
  
-def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int) -> bool:
+def _prefer_split_rhs_oop(rhs_oop: _RHSResult, mode: Mode) -> bool:
      """
      Returns whether we should prefer the result from a split omitting optional parens.
      """
@@ -826,7 +823,7 @@ def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int) -> bool:
              # the first line still contains the `=`)
              any(leaf.type == token.EQUAL for leaf in rhs_oop.head.leaves)
              # the first line is short enough
-            and is_line_short_enough(rhs_oop.head, line_length=line_length)
+            and is_line_short_enough(rhs_oop.head, mode=mode)
          )
          # contains unsplittable type ignore
          or rhs_oop.head.contains_unsplittable_type_ignore()
@@ -1525,7 +1522,7 @@ def run_transformer(
          or line.contains_multiline_strings()
          or result[0].contains_uncollapsable_type_comments()
          or result[0].contains_unsplittable_type_ignore()
-        or is_line_short_enough(result[0], line_length=mode.line_length)
+        or is_line_short_enough(result[0], mode=mode)
          # If any leaves have no parents (which _can_ occur since
          # `transform(line)` potentially destroys the line's underlying node
          # structure), then we can't proceed. Doing so would cause the below
@@ -1540,8 +1537,6 @@ def run_transformer(
      second_opinion = run_transformer(
          line_copy, transform, mode, features_fop, line_str=line_str
      )
-    if all(
-        is_line_short_enough(ln, line_length=mode.line_length) for ln in second_opinion
-    ):
+    if all(is_line_short_enough(ln, mode=mode) for ln in second_opinion):
          result = second_opinion
      return result
diff --git a/src/black/lines.py b/src/black/lines.py

index ec6ef5d952290ad153d4db27b6a99ff0c709ebe8..b65604864a4b73fabb69f893e13b99c39806ea93 100644 (file)
--- a/src/black/lines.py
+++ b/src/black/lines.py
@@ -1,4 +1,5 @@
  import itertools
+import math
  import sys
  from dataclasses import dataclass, field
  from typing import (
@@ -10,11 +11,12 @@ from typing import (
      Sequence,
      Tuple,
      TypeVar,
+    Union,
      cast,
  )
  
  from black.brackets import DOT_PRIORITY, BracketTracker
-from black.mode import Mode
+from black.mode import Mode, Preview
  from black.nodes import (
      BRACKETS,
      CLOSING_BRACKETS,
@@ -37,6 +39,7 @@ from blib2to3.pytree import Leaf, Node
  T = TypeVar("T")
  Index = int
  LeafID = int
+LN = Union[Leaf, Node]
  
  
  @dataclass
@@ -701,18 +704,93 @@ def append_leaves(
              new_line.append(comment_leaf, preformatted=True)
  
  
-def is_line_short_enough(line: Line, *, line_length: int, line_str: str = "") -> bool:
-    """Return True if `line` is no longer than `line_length`.
-
+def is_line_short_enough(  # noqa: C901
+    line: Line, *, mode: Mode, line_str: str = ""
+) -> bool:
+    """For non-multiline strings, return True if `line` is within `mode.line_length`.
+    For multiline strings, looks at the context around `line` to determine
+    if it should be inlined or split up.
      Uses the provided `line_str` rendering, if any, otherwise computes a new one.
      """
      if not line_str:
          line_str = line_to_string(line)
-    return (
-        len(line_str) <= line_length
-        and "\n" not in line_str  # multiline strings
-        and not line.contains_standalone_comments()
-    )
+
+    if Preview.multiline_string_handling not in mode:
+        return (
+            len(line_str) <= mode.line_length
+            and "\n" not in line_str  # multiline strings
+            and not line.contains_standalone_comments()
+        )
+
+    if line.contains_standalone_comments():
+        return False
+    if "\n" not in line_str:
+        # No multiline strings (MLS) present
+        return len(line_str) <= mode.line_length
+
+    first, *_, last = line_str.split("\n")
+    if len(first) > mode.line_length or len(last) > mode.line_length:
+        return False
+
+    # Traverse the AST to examine the context of the multiline string (MLS),
+    # tracking aspects such as depth and comma existence,
+    # to determine whether to split the MLS or keep it together.
+    # Depth (which is based on the existing bracket_depth concept)
+    # is needed to determine nesting level of the MLS.
+    # Includes special case for trailing commas.
+    commas: List[int] = []  # tracks number of commas per depth level
+    multiline_string: Optional[Leaf] = None
+    # store the MLS leaf and its ancestor nodes whose text appears in line_str
+    multiline_string_contexts: List[LN] = []
+
+    max_level_to_update = math.inf  # track the depth of the MLS
+    for i, leaf in enumerate(line.leaves):
+        if max_level_to_update == math.inf:
+            had_comma: Optional[int] = None
+            if leaf.bracket_depth + 1 > len(commas):
+                commas.append(0)
+            elif leaf.bracket_depth + 1 < len(commas):
+                had_comma = commas.pop()
+            if (
+                had_comma is not None
+                and multiline_string is not None
+                and multiline_string.bracket_depth == leaf.bracket_depth + 1
+            ):
+                # Have left the level with the MLS, stop tracking commas
+                max_level_to_update = leaf.bracket_depth
+                if had_comma > 0:
+                    # MLS was in parens with at least one comma - force split
+                    return False
+
+        if leaf.bracket_depth <= max_level_to_update and leaf.type == token.COMMA:
+            # Ignore non-nested trailing comma
+            # directly after MLS/MLS-containing expression
+            ignore_ctxs: List[Optional[LN]] = [None]
+            ignore_ctxs += multiline_string_contexts
+            if not (leaf.prev_sibling in ignore_ctxs and i == len(line.leaves) - 1):
+                commas[leaf.bracket_depth] += 1
+        if max_level_to_update != math.inf:
+            max_level_to_update = min(max_level_to_update, leaf.bracket_depth)
+
+        if is_multiline_string(leaf):
+            if len(multiline_string_contexts) > 0:
+                # >1 multiline string cannot fit on a single line - force split
+                return False
+            multiline_string = leaf
+            ctx: LN = leaf
+            # collect the MLS leaf and each ancestor whose text is in line_str
+            while str(ctx) in line_str:
+                multiline_string_contexts.append(ctx)
+                if ctx.parent is None:
+                    break
+                ctx = ctx.parent
+
+    # May not have a triple-quoted multiline string at all,
+    # in case of a regular string with embedded newlines and line continuations
+    if len(multiline_string_contexts) == 0:
+        return True
+
+    return all(val == 0 for val in commas)
  
  
  def can_be_split(line: Line) -> bool:
diff --git a/src/black/mode.py b/src/black/mode.py

index c9a4c2b080c7c8d2b7159ecbde403fc188e1abdc..d70388916dabf1053d62f20921cb83f64abd9922 100644 (file)
--- a/src/black/mode.py
+++ b/src/black/mode.py
@@ -155,6 +155,7 @@ class Preview(Enum):
  
      add_trailing_comma_consistently = auto()
      hex_codes_in_unicode_sequences = auto()
+    multiline_string_handling = auto()
      prefer_splitting_right_hand_side_of_assignments = auto()
      # NOTE: string_processing requires wrap_long_dict_values_in_parens
      # for https://github.com/psf/black/issues/3117 to be fixed.
diff --git a/tests/data/preview/multiline_strings.py b/tests/data/preview/multiline_strings.py

new file mode 100644 (file)

index 0000000..bb517d1
--- /dev/null
+++ b/tests/data/preview/multiline_strings.py
@@ -0,0 +1,358 @@
+"""cow
+say""",
+call(3, "dogsay", textwrap.dedent("""dove
+    coo""" % "cowabunga"))
+call(3, "dogsay", textwrap.dedent("""dove
+coo""" % "cowabunga"))
+call(3, textwrap.dedent("""cow
+    moo""" % "cowabunga"), "dogsay")
+call(3, "dogsay", textwrap.dedent("""crow
+    caw""" % "cowabunga"),)
+call(3, textwrap.dedent("""cat
+    meow""" % "cowabunga"), {"dog", "say"})
+call(3, {"dog", "say"}, textwrap.dedent("""horse
+    neigh""" % "cowabunga"))
+call(3, {"dog", "say"}, textwrap.dedent("""pig
+    oink""" % "cowabunga"),)
+textwrap.dedent("""A one-line triple-quoted string.""")
+textwrap.dedent("""A two-line triple-quoted string
+since it goes to the next line.""")
+textwrap.dedent("""A three-line triple-quoted string
+that not only goes to the next line
+but also goes one line beyond.""")
+textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. file contents.
+""")
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. file contents.
+"""))
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. {config_filename} file contents.
+""".format("config_filename", config_filename)))
+# Another use case
+data = yaml.load("""\
+a: 1
+b: 2
+""")
+data = yaml.load("""\
+a: 1
+b: 2
+""",)
+data = yaml.load(
+    """\
+    a: 1
+    b: 2
+"""
+)
+
+MULTILINE = """
+foo
+""".replace("\n", "")
+generated_readme = lambda project_name: """
+{}
+
+<Add content here!>
+""".strip().format(project_name)
+parser.usage += """
+Custom extra help summary.
+
+Extra test:
+- with
+- bullets
+"""
+
+
+def get_stuff(cr, value):
+    # original
+    cr.execute("""
+        SELECT whatever
+          FROM some_table t
+         WHERE id = %s
+    """, [value])
+    return cr.fetchone()
+
+
+def get_stuff(cr, value):
+    # preferred
+    cr.execute(
+        """
+        SELECT whatever
+          FROM some_table t
+         WHERE id = %s
+        """,
+        [value],
+    )
+    return cr.fetchone()
+
+
+call(arg1, arg2, """
+short
+""", arg3=True)
+test_vectors = [
+    "one-liner\n",
+    "two\nliner\n",
+    """expressed
+as a three line
+mulitline string""",
+]
+
+_wat = re.compile(
+    r"""
+    regex
+    """,
+    re.MULTILINE | re.VERBOSE,
+)
+dis_c_instance_method = """\
+%3d           0 LOAD_FAST                1 (x)
+              2 LOAD_CONST               1 (1)
+              4 COMPARE_OP               2 (==)
+              6 LOAD_FAST                0 (self)
+              8 STORE_ATTR               0 (x)
+             10 LOAD_CONST               0 (None)
+             12 RETURN_VALUE
+""" % (_C.__init__.__code__.co_firstlineno + 1,)
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually {verb} the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. {file_type} file contents.
+""".format(verb="using", file_type="test")))
+{"""cow
+moos"""}
+["""cow
+moos"""]
+["""cow
+moos""", """dog
+woofs
+and
+barks"""]
+def dastardly_default_value(
+    cow: String = json.loads("""this
+is
+quite
+the
+dastadardly
+value!"""),
+    **kwargs,
+):
+    pass
+
+print(f"""
+    This {animal}
+    moos and barks
+{animal} say
+""")
+msg = f"""The arguments {bad_arguments} were passed in.
+Please use `--build-option` instead,
+`--global-option` is reserved to flags like `--verbose` or `--quiet`.
+"""
+
+# output
+"""cow
+say""",
+call(
+    3,
+    "dogsay",
+    textwrap.dedent("""dove
+    coo""" % "cowabunga"),
+)
+call(
+    3,
+    "dogsay",
+    textwrap.dedent("""dove
+coo""" % "cowabunga"),
+)
+call(
+    3,
+    textwrap.dedent("""cow
+    moo""" % "cowabunga"),
+    "dogsay",
+)
+call(
+    3,
+    "dogsay",
+    textwrap.dedent("""crow
+    caw""" % "cowabunga"),
+)
+call(
+    3,
+    textwrap.dedent("""cat
+    meow""" % "cowabunga"),
+    {"dog", "say"},
+)
+call(
+    3,
+    {"dog", "say"},
+    textwrap.dedent("""horse
+    neigh""" % "cowabunga"),
+)
+call(
+    3,
+    {"dog", "say"},
+    textwrap.dedent("""pig
+    oink""" % "cowabunga"),
+)
+textwrap.dedent("""A one-line triple-quoted string.""")
+textwrap.dedent("""A two-line triple-quoted string
+since it goes to the next line.""")
+textwrap.dedent("""A three-line triple-quoted string
+that not only goes to the next line
+but also goes one line beyond.""")
+textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. file contents.
+""")
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. file contents.
+"""))
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually leveraging the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. {config_filename} file contents.
+""".format("config_filename", config_filename)))
+# Another use case
+data = yaml.load("""\
+a: 1
+b: 2
+""")
+data = yaml.load(
+    """\
+a: 1
+b: 2
+""",
+)
+data = yaml.load("""\
+    a: 1
+    b: 2
+""")
+
+MULTILINE = """
+foo
+""".replace("\n", "")
+generated_readme = lambda project_name: """
+{}
+
+<Add content here!>
+""".strip().format(project_name)
+parser.usage += """
+Custom extra help summary.
+
+Extra test:
+- with
+- bullets
+"""
+
+
+def get_stuff(cr, value):
+    # original
+    cr.execute(
+        """
+        SELECT whatever
+          FROM some_table t
+         WHERE id = %s
+    """,
+        [value],
+    )
+    return cr.fetchone()
+
+
+def get_stuff(cr, value):
+    # preferred
+    cr.execute(
+        """
+        SELECT whatever
+          FROM some_table t
+         WHERE id = %s
+        """,
+        [value],
+    )
+    return cr.fetchone()
+
+
+call(
+    arg1,
+    arg2,
+    """
+short
+""",
+    arg3=True,
+)
+test_vectors = [
+    "one-liner\n",
+    "two\nliner\n",
+    """expressed
+as a three line
+mulitline string""",
+]
+
+_wat = re.compile(
+    r"""
+    regex
+    """,
+    re.MULTILINE | re.VERBOSE,
+)
+dis_c_instance_method = """\
+%3d           0 LOAD_FAST                1 (x)
+              2 LOAD_CONST               1 (1)
+              4 COMPARE_OP               2 (==)
+              6 LOAD_FAST                0 (self)
+              8 STORE_ATTR               0 (x)
+             10 LOAD_CONST               0 (None)
+             12 RETURN_VALUE
+""" % (_C.__init__.__code__.co_firstlineno + 1,)
+path.write_text(textwrap.dedent("""\
+    A triple-quoted string
+    actually {verb} the textwrap.dedent functionality
+    that ends in a trailing newline,
+    representing e.g. {file_type} file contents.
+""".format(verb="using", file_type="test")))
+{"""cow
+moos"""}
+["""cow
+moos"""]
+[
+    """cow
+moos""",
+    """dog
+woofs
+and
+barks""",
+]
+
+
+def dastardly_default_value(
+    cow: String = json.loads("""this
+is
+quite
+the
+dastadardly
+value!"""),
+    **kwargs,
+):
+    pass
+
+
+print(f"""
+    This {animal}
+    moos and barks
+{animal} say
+""")
+msg = f"""The arguments {bad_arguments} were passed in.
+Please use `--build-option` instead,
+`--global-option` is reserved to flags like `--verbose` or `--quiet`.
+"""
author	Aneesh Agrawal <aneeshusa@gmail.com>
	Tue, 7 Mar 2023 19:52:19 +0000 (14:52 -0500)
committer	GitHub <noreply@github.com>
	Tue, 7 Mar 2023 19:52:19 +0000 (11:52 -0800)
CHANGES.md		patch \| blob \| history
docs/the_black_code_style/future_style.md		patch \| blob \| history
src/black/linegen.py		patch \| blob \| history
src/black/lines.py		patch \| blob \| history
src/black/mode.py		patch \| blob \| history
tests/data/preview/multiline_strings.py	[new file with mode: 0644]	patch \| blob