Support named escapes (`\N{...}`) in string processing (#2319)

author jack1142 <6032823+jack1142@users.noreply.github.com>

Wed, 9 Jun 2021 19:29:32 +0000 (21:29 +0200)

committer GitHub <noreply@github.com>

Wed, 9 Jun 2021 19:29:32 +0000 (12:29 -0700)
author jack1142 <6032823+jack1142@users.noreply.github.com>
Wed, 9 Jun 2021 19:29:32 +0000 (21:29 +0200)
committer GitHub <noreply@github.com>
Wed, 9 Jun 2021 19:29:32 +0000 (12:29 -0700)
diff --git a/CHANGES.md b/CHANGES.md

index 9c2939e1b1bd6e3aaa5edbaa86726aa44a71c605..01c02fe2b70a30cc8b59466ef60b064bbe7fbb19 100644 (file)
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -13,6 +13,8 @@
  - Fix incorrect custom breakpoint indices when string group contains fake f-strings
    (#2311)
  - Fix regression where `R` prefixes would be lowercased for docstrings (#2285)
+- Fix handling of named escapes (`\N{...}`) when `--experimental-string-processing` is
+  used (#2319)
  
  ## 21.5b2
  
diff --git a/src/black/trans.py b/src/black/trans.py

index ca620f6b2a5dd4236d5cc45e380e03604380b8e1..023dcd3618a5c0edb6b6f5e7e91e09bb6d3add30 100644 (file)
--- a/src/black/trans.py
+++ b/src/black/trans.py
@@ -15,6 +15,7 @@ from typing import (
      List,
      Optional,
      Sequence,
+    Set,
      Tuple,
      TypeVar,
      Union,
@@ -1243,6 +1244,61 @@ class StringSplitter(CustomSplitMapMixin, BaseStringSplitter):
              last_line.comments = line.comments.copy()
              yield Ok(last_line)
  
+    def _iter_nameescape_slices(self, string: str) -> Iterator[Tuple[Index, Index]]:
+        """
+        Yields:
+            One (begin, end) pair per \\N{...} expression found in @string:
+            begin is the index of its backslash, end the index of the first `}`
+            after it (both inclusive). Splitting inside a pair is NOT allowed.
+        """
+        # True - the char just before the current one is an unescaped backslash
+        # False - it is an escaped backslash *or* not a backslash at all
+        previous_was_unescaped_backslash = False
+        it = iter(enumerate(string))
+        for idx, c in it:
+            if c == "\\":
+                previous_was_unescaped_backslash = not previous_was_unescaped_backslash
+                continue
+            if not previous_was_unescaped_backslash or c != "N":
+                previous_was_unescaped_backslash = False
+                continue
+            previous_was_unescaped_backslash = False

+            begin = idx - 1  # the position of backslash before \N{...}
+            for idx, c in it:  # same iterator: outer loop resumes after the `}`
+                if c == "}":
+                    end = idx
+                    break
+            else:
+                # string ended without a `}`: malformed nameescape expression?
+                # should have been detected by AST parsing earlier...
+                raise RuntimeError(f"{self.__class__.__name__} LOGIC ERROR!")
+            yield begin, end
+
+    def _iter_fexpr_slices(self, string: str) -> Iterator[Tuple[Index, Index]]:
+        """
+        Yields:
+            The span of every self.RE_FEXPR match in @string, i.e. each range
+            whose splitting would split an f-expression (which is NOT allowed).
+            A span's end index is exclusive. Nothing is yielded without an "f" prefix.
+        """
+        if "f" not in get_string_prefix(string).lower():
+            return  # not an f-string, so there are no f-expressions to protect

+        for match in re.finditer(self.RE_FEXPR, string, re.VERBOSE):
+            yield match.span()
+
+    def _get_illegal_split_indices(self, string: str) -> Set[Index]:
+        illegal_indices: Set[Index] = set()  # indices where a split is forbidden
+        iterators = [
+            self._iter_fexpr_slices(string),  # re spans: end is exclusive
+            self._iter_nameescape_slices(string),  # end is the index of `}`
+        ]
+        for it in iterators:
+            for begin, end in it:  # NOTE(review): end is treated as inclusive
+                illegal_indices.update(range(begin, end + 1))
+        return illegal_indices
+
      def _get_break_idx(self, string: str, max_break_idx: int) -> Optional[int]:
          """
          This method contains the algorithm that StringSplitter uses to
@@ -1272,40 +1328,15 @@ class StringSplitter(CustomSplitMapMixin, BaseStringSplitter):
          assert is_valid_index(max_break_idx)
          assert_is_leaf_string(string)
  
-        _fexpr_slices: Optional[List[Tuple[Index, Index]]] = None
-
-        def fexpr_slices() -> Iterator[Tuple[Index, Index]]:
-            """
-            Yields:
-                All ranges of @string which, if @string were to be split there,
-                would result in the splitting of an f-expression (which is NOT
-                allowed).
-            """
-            nonlocal _fexpr_slices
-
-            if _fexpr_slices is None:
-                _fexpr_slices = []
-                for match in re.finditer(self.RE_FEXPR, string, re.VERBOSE):
-                    _fexpr_slices.append(match.span())
-
-            yield from _fexpr_slices
-
-        is_fstring = "f" in get_string_prefix(string).lower()
+        _illegal_split_indices = self._get_illegal_split_indices(string)
  
-        def breaks_fstring_expression(i: Index) -> bool:
+        def breaks_unsplittable_expression(i: Index) -> bool:
              """
              Returns:
                  True iff returning @i would result in the splitting of an
-                f-expression (which is NOT allowed).
+                unsplittable expression (which is NOT allowed).
              """
-            if not is_fstring:
-                return False
-
-            for (start, end) in fexpr_slices():
-                if start <= i < end:
-                    return True
-
-            return False
+            return i in _illegal_split_indices
  
          def passes_all_checks(i: Index) -> bool:
              """
@@ -1329,7 +1360,7 @@ class StringSplitter(CustomSplitMapMixin, BaseStringSplitter):
                  is_space
                  and is_not_escaped
                  and is_big_enough
-                and not breaks_fstring_expression(i)
+                and not breaks_unsplittable_expression(i)
              )
  
          # First, we check all indices BELOW @max_break_idx.
diff --git a/tests/data/long_strings.py b/tests/data/long_strings.py

index 151396b5239ee2a99921f7d5460c6bf2dfeeaf7a..430f760cf0b9dc9a8490974198904b4ff01195ba 100644 (file)
--- a/tests/data/long_strings.py
+++ b/tests/data/long_strings.py
@@ -207,6 +207,38 @@ long_unmergable_string_with_pragma = (
      " of it."
  )
  
+string_with_nameescape = (
+    "........................................................................ \N{LAO KO LA}"
+)
+
+string_with_nameescape = (
+    "........................................................................... \N{LAO KO LA}"
+)
+
+string_with_nameescape = (
+    "............................................................................ \N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    "...................................................................... \\\N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    "......................................................................... \\\N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    ".......................................................................... \\\N{LAO KO LA}"
+)
+
+string_with_escaped_nameescape = (
+    "........................................................................ \\N{LAO KO LA}"
+)
+
+string_with_escaped_nameescape = (
+    "........................................................................... \\N{LAO KO LA}"
+)
+
  
  # output
  
@@ -587,3 +619,43 @@ long_unmergable_string_with_pragma = (
      "This is a really long string that can't be merged because it has a likely pragma at the end"  # pylint: disable=some-pylint-check
      " of it."
  )
+
+string_with_nameescape = (
+    "........................................................................"
+    " \N{LAO KO LA}"
+)
+
+string_with_nameescape = (
+    "..........................................................................."
+    " \N{LAO KO LA}"
+)
+
+string_with_nameescape = (
+    "............................................................................"
+    " \N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    "......................................................................"
+    " \\\N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    "........................................................................."
+    " \\\N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    ".........................................................................."
+    " \\\N{LAO KO LA}"
+)
+
+string_with_escaped_nameescape = (
+    "........................................................................ \\N{LAO"
+    " KO LA}"
+)
+
+string_with_escaped_nameescape = (
+    "..........................................................................."
+    " \\N{LAO KO LA}"
+)
diff --git a/tests/data/long_strings__regression.py b/tests/data/long_strings__regression.py

index e4234b2f97c160d98e11e8e2127d94c37d9e2d75..61c28d376ef8519fba80d286f711d735fb827aff 100644 (file)
--- a/tests/data/long_strings__regression.py
+++ b/tests/data/long_strings__regression.py
@@ -514,6 +514,10 @@ fstring = F"f-strings definitely make things more {difficult} than they need to
  
  x = F"This is a long string which contains an f-expr that should not split {{{[i for i in range(5)]}}}."
  
+x = (
+    "\N{BLACK RIGHT-POINTING TRIANGLE WITH DOUBLE VERTICAL BAR}\N{VARIATION SELECTOR-16}"
+)
+
  
  # output
  
@@ -1142,3 +1146,7 @@ x = (
      "This is a long string which contains an f-expr that should not split"
      f" {{{[i for i in range(5)]}}}."
  )
+
+x = (
+    "\N{BLACK RIGHT-POINTING TRIANGLE WITH DOUBLE VERTICAL BAR}\N{VARIATION SELECTOR-16}"
+)
author	jack1142 <6032823+jack1142@users.noreply.github.com>
	Wed, 9 Jun 2021 19:29:32 +0000 (21:29 +0200)
committer	GitHub <noreply@github.com>
	Wed, 9 Jun 2021 19:29:32 +0000 (12:29 -0700)
CHANGES.md		patch \| blob \| history
src/black/trans.py		patch \| blob \| history
tests/data/long_strings.py		patch \| blob \| history
tests/data/long_strings__regression.py		patch \| blob \| history