From: jack1142 <6032823+jack1142@users.noreply.github.com>
Date: Wed, 9 Jun 2021 19:29:32 +0000 (+0200)
Subject: Support named escapes (`\N{...}`) in string processing (#2319)
X-Git-Url: https://git.madduck.net/etc/vim.git/commitdiff_plain/62402a32618bc62ae90cfcdc3d47c7ad20e60e10?ds=inline

Support named escapes (`\N{...}`) in string processing (#2319)

Co-authored-by: Felix Hildén <felix.hilden@gmail.com>
Co-authored-by: Jelle Zijlstra <jelle.zijlstra@gmail.com>
---

diff --git a/CHANGES.md b/CHANGES.md
index 9c2939e1..01c02fe2 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -13,6 +13,8 @@
 - Fix incorrect custom breakpoint indices when string group contains fake f-strings
   (#2311)
 - Fix regression where `R` prefixes would be lowercased for docstrings (#2285)
+- Fix handling of named escapes (`\N{...}`) when `--experimental-string-processing` is
+  used (#2319)
 
 ## 21.5b2
 
diff --git a/src/black/trans.py b/src/black/trans.py
index ca620f6b..023dcd36 100644
--- a/src/black/trans.py
+++ b/src/black/trans.py
@@ -15,6 +15,7 @@ from typing import (
     List,
     Optional,
     Sequence,
+    Set,
     Tuple,
     TypeVar,
     Union,
@@ -1243,6 +1244,61 @@ class StringSplitter(CustomSplitMapMixin, BaseStringSplitter):
             last_line.comments = line.comments.copy()
             yield Ok(last_line)
 
+    def _iter_nameescape_slices(self, string: str) -> Iterator[Tuple[Index, Index]]:
+        """
+        Yields:
+            All ranges of @string which, if @string were to be split there,
+            would result in the splitting of an \\N{...} expression (which is NOT
+            allowed). Nothing is yielded for raw or bytes literals.
+        """
+        if set(get_string_prefix(string).lower()) & {"r", "b"}:
+            return  # a backslash-N is not a named escape in these literals
+        # True iff the previous character was an unescaped backslash
+        previous_was_unescaped_backslash = False
+        it = iter(enumerate(string))
+        for idx, c in it:
+            if c == "\\":
+                previous_was_unescaped_backslash = not previous_was_unescaped_backslash
+                continue
+            if not previous_was_unescaped_backslash or c != "N":
+                previous_was_unescaped_backslash = False
+                continue
+            previous_was_unescaped_backslash = False
+
+            begin = idx - 1  # the position of backslash before \N{...}
+            for idx, c in it:  # same iterator: the outer loop resumes after "}"
+                if c == "}":
+                    end = idx
+                    break
+            else:
+                # malformed nameescape; AST parsing should have rejected it
+                raise RuntimeError(f"{self.__class__.__name__} LOGIC ERROR!")
+            yield begin, end
+
+    def _iter_fexpr_slices(self, string: str) -> Iterator[Tuple[Index, Index]]:
+        """
+        Yields:
+            The closed (begin, end) range of every f-expression in @string;
+            splitting @string inside such a range is NOT allowed.
+        """
+        if "f" not in get_string_prefix(string).lower():
+            return
+
+        for match in re.finditer(self.RE_FEXPR, string, re.VERBOSE):
+            begin, end = match.span()  # half-open: end is one past the match
+            yield begin, end - 1  # closed range; index after the match stays legal
+
+    def _get_illegal_split_indices(self, string: str) -> Set[Index]:
+        """Return every index of @string that lies in an unsplittable range."""
+        forbidden: Set[Index] = set()
+        for slices in (
+            self._iter_fexpr_slices(string),
+            self._iter_nameescape_slices(string),
+        ):
+            for first, last in slices:
+                forbidden.update(range(first, last + 1))  # last is included
+        return forbidden
+
     def _get_break_idx(self, string: str, max_break_idx: int) -> Optional[int]:
         """
         This method contains the algorithm that StringSplitter uses to
@@ -1272,40 +1328,15 @@ class StringSplitter(CustomSplitMapMixin, BaseStringSplitter):
         assert is_valid_index(max_break_idx)
         assert_is_leaf_string(string)
 
-        _fexpr_slices: Optional[List[Tuple[Index, Index]]] = None
-
-        def fexpr_slices() -> Iterator[Tuple[Index, Index]]:
-            """
-            Yields:
-                All ranges of @string which, if @string were to be split there,
-                would result in the splitting of an f-expression (which is NOT
-                allowed).
-            """
-            nonlocal _fexpr_slices
-
-            if _fexpr_slices is None:
-                _fexpr_slices = []
-                for match in re.finditer(self.RE_FEXPR, string, re.VERBOSE):
-                    _fexpr_slices.append(match.span())
-
-            yield from _fexpr_slices
-
-        is_fstring = "f" in get_string_prefix(string).lower()
+        _illegal_split_indices = self._get_illegal_split_indices(string)
 
-        def breaks_fstring_expression(i: Index) -> bool:
+        def breaks_unsplittable_expression(i: Index) -> bool:
             """
             Returns:
                 True iff returning @i would result in the splitting of an
-                f-expression (which is NOT allowed).
+                unsplittable expression (which is NOT allowed).
             """
-            if not is_fstring:
-                return False
-
-            for (start, end) in fexpr_slices():
-                if start <= i < end:
-                    return True
-
-            return False
+            return i in _illegal_split_indices
 
         def passes_all_checks(i: Index) -> bool:
             """
@@ -1329,7 +1360,7 @@ class StringSplitter(CustomSplitMapMixin, BaseStringSplitter):
                 is_space
                 and is_not_escaped
                 and is_big_enough
-                and not breaks_fstring_expression(i)
+                and not breaks_unsplittable_expression(i)
             )
 
         # First, we check all indices BELOW @max_break_idx.
diff --git a/tests/data/long_strings.py b/tests/data/long_strings.py
index 151396b5..430f760c 100644
--- a/tests/data/long_strings.py
+++ b/tests/data/long_strings.py
@@ -207,6 +207,38 @@ long_unmergable_string_with_pragma = (
     " of it."
 )
 
+string_with_nameescape = (
+    "........................................................................ \N{LAO KO LA}"
+)
+
+string_with_nameescape = (
+    "........................................................................... \N{LAO KO LA}"
+)
+
+string_with_nameescape = (
+    "............................................................................ \N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    "...................................................................... \\\N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    "......................................................................... \\\N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    ".......................................................................... \\\N{LAO KO LA}"
+)
+
+string_with_escaped_nameescape = (
+    "........................................................................ \\N{LAO KO LA}"
+)
+
+string_with_escaped_nameescape = (
+    "........................................................................... \\N{LAO KO LA}"
+)
+
 
 # output
 
@@ -587,3 +619,43 @@ long_unmergable_string_with_pragma = (
     "This is a really long string that can't be merged because it has a likely pragma at the end"  # pylint: disable=some-pylint-check
     " of it."
 )
+
+string_with_nameescape = (
+    "........................................................................"
+    " \N{LAO KO LA}"
+)
+
+string_with_nameescape = (
+    "..........................................................................."
+    " \N{LAO KO LA}"
+)
+
+string_with_nameescape = (
+    "............................................................................"
+    " \N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    "......................................................................"
+    " \\\N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    "........................................................................."
+    " \\\N{LAO KO LA}"
+)
+
+string_with_nameescape_and_escaped_backslash = (
+    ".........................................................................."
+    " \\\N{LAO KO LA}"
+)
+
+string_with_escaped_nameescape = (
+    "........................................................................ \\N{LAO"
+    " KO LA}"
+)
+
+string_with_escaped_nameescape = (
+    "..........................................................................."
+    " \\N{LAO KO LA}"
+)
diff --git a/tests/data/long_strings__regression.py b/tests/data/long_strings__regression.py
index e4234b2f..61c28d37 100644
--- a/tests/data/long_strings__regression.py
+++ b/tests/data/long_strings__regression.py
@@ -514,6 +514,10 @@ fstring = F"f-strings definitely make things more {difficult} than they need to
 
 x = F"This is a long string which contains an f-expr that should not split {{{[i for i in range(5)]}}}."
 
+x = (
+    "\N{BLACK RIGHT-POINTING TRIANGLE WITH DOUBLE VERTICAL BAR}\N{VARIATION SELECTOR-16}"
+)
+
 
 # output
 
@@ -1142,3 +1146,7 @@ x = (
     "This is a long string which contains an f-expr that should not split"
     f" {{{[i for i in range(5)]}}}."
 )
+
+x = (
+    "\N{BLACK RIGHT-POINTING TRIANGLE WITH DOUBLE VERTICAL BAR}\N{VARIATION SELECTOR-16}"
+)