Format hex code in unicode escape sequences in string literals (#2916)

author Shivansh-007 <shivansh-007@outlook.com>

Sun, 22 Jan 2023 13:21:09 +0000 (18:51 +0530)

committer GitHub <noreply@github.com>

Sun, 22 Jan 2023 13:21:09 +0000 (05:21 -0800)
author Shivansh-007 <shivansh-007@outlook.com>
Sun, 22 Jan 2023 13:21:09 +0000 (18:51 +0530)
committer GitHub <noreply@github.com>
Sun, 22 Jan 2023 13:21:09 +0000 (05:21 -0800)
diff --git a/CHANGES.md b/CHANGES.md

index 1450278341b8eac4d6380ac59f1f74d32ee0f413..e2e4b341761eb679b21d8fbb990946bbac304c81 100644 (file)
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -16,6 +16,7 @@
  
  <!-- Changes that affect Black's preview style -->
  
  
  <!-- Changes that affect Black's preview style -->
  
+- Format hex code in unicode escape sequences in string literals (#2916)
  - Add parentheses around `if`-`else` expressions (#2278)
  - Improve the performance on large expressions that contain many strings (#3467)
  - Fix a crash in preview style with assert + parenthesized string (#3415)
  - Add parentheses around `if`-`else` expressions (#2278)
  - Improve the performance on large expressions that contain many strings (#3467)
  - Fix a crash in preview style with assert + parenthesized string (#3415)
diff --git a/src/black/linegen.py b/src/black/linegen.py

index 2f50257a9305b0203ee662e4d5d3187bbe20741d..bfc28ca006ccc44d72185a2cd355c7e958d31de6 100644 (file)
--- a/src/black/linegen.py
+++ b/src/black/linegen.py
@@ -59,6 +59,7 @@ from black.strings import (
      get_string_prefix,
      normalize_string_prefix,
      normalize_string_quotes,
      get_string_prefix,
      normalize_string_prefix,
      normalize_string_quotes,
+    normalize_unicode_escape_sequences,
  )
  from black.trans import (
      CannotTransform,
  )
  from black.trans import (
      CannotTransform,
@@ -368,6 +369,9 @@ class LineGenerator(Visitor[Line]):
          yield from self.visit_default(node)
  
      def visit_STRING(self, leaf: Leaf) -> Iterator[Line]:
          yield from self.visit_default(node)
  
      def visit_STRING(self, leaf: Leaf) -> Iterator[Line]:
+        if Preview.hex_codes_in_unicode_sequences in self.mode:
+            normalize_unicode_escape_sequences(leaf)
+
          if is_docstring(leaf) and "\\\n" not in leaf.value:
              # We're ignoring docstrings with backslash newline escapes because changing
              # indentation of those changes the AST representation of the code.
          if is_docstring(leaf) and "\\\n" not in leaf.value:
              # We're ignoring docstrings with backslash newline escapes because changing
              # indentation of those changes the AST representation of the code.
diff --git a/src/black/mode.py b/src/black/mode.py

index af0706e6a0b2a425b1c312625b2657867eae4e46..4309d4fa635b784d5f0eb5d438fd185957383457 100644 (file)
--- a/src/black/mode.py
+++ b/src/black/mode.py
@@ -153,6 +153,7 @@ def supports_feature(target_versions: Set[TargetVersion], feature: Feature) -> b
  class Preview(Enum):
      """Individual preview style features."""
  
  class Preview(Enum):
      """Individual preview style features."""
  
+    hex_codes_in_unicode_sequences = auto()
      annotation_parens = auto()
      empty_lines_before_class_or_def_with_leading_comments = auto()
      handle_trailing_commas_in_head = auto()
      annotation_parens = auto()
      empty_lines_before_class_or_def_with_leading_comments = auto()
      handle_trailing_commas_in_head = auto()
diff --git a/src/black/strings.py b/src/black/strings.py

index 9d0e2eb8430e538efcba96582dcfc9ee2d4bc468..3e3bc12fe728cd4a23f3b19395506e88b109723c 100644 (file)
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -5,7 +5,9 @@ Simple formatting on strings. Further string formatting code is in trans.py.
  import re
  import sys
  from functools import lru_cache
  import re
  import sys
  from functools import lru_cache
-from typing import List, Pattern
+from typing import List, Match, Pattern
+
+from blib2to3.pytree import Leaf
  
  if sys.version_info < (3, 8):
      from typing_extensions import Final
  
  if sys.version_info < (3, 8):
      from typing_extensions import Final
@@ -18,6 +20,15 @@ STRING_PREFIX_RE: Final = re.compile(
      r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
  )
  FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
      r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
  )
  FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
+UNICODE_ESCAPE_RE: Final = re.compile(
+    r"(?P<backslashes>\\+)(?P<body>"
+    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
+    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
+    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
+    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
+    r")",
+    re.VERBOSE,
+)
  
  
  def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
  
  
  def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
@@ -236,3 +247,34 @@ def normalize_string_quotes(s: str) -> str:
          return s  # Prefer double quotes
  
      return f"{prefix}{new_quote}{new_body}{new_quote}"
          return s  # Prefer double quotes
  
      return f"{prefix}{new_quote}{new_body}{new_quote}"
+
+
+def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
+    """Replace hex codes in Unicode escape sequences with lowercase representation."""
+    text = leaf.value
+    prefix = get_string_prefix(text)
+    if "r" in prefix.lower():
+        return
+
+    def replace(m: Match[str]) -> str:
+        groups = m.groupdict()
+        back_slashes = groups["backslashes"]
+
+        if len(back_slashes) % 2 == 0:
+            return back_slashes + groups["body"]
+
+        if groups["u"]:
+            # \u
+            return back_slashes + "u" + groups["u"].lower()
+        elif groups["U"]:
+            # \U
+            return back_slashes + "U" + groups["U"].lower()
+        elif groups["x"]:
+            # \x
+            return back_slashes + "x" + groups["x"].lower()
+        else:
+            assert groups["N"], f"Unexpected match: {m}"
+            # \N{}
+            return back_slashes + "N{" + groups["N"].upper() + "}"
+
+    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)
diff --git a/tests/data/preview/format_unicode_escape_seq.py b/tests/data/preview/format_unicode_escape_seq.py

new file mode 100644 (file)

index 0000000..3440696
--- /dev/null
+++ b/tests/data/preview/format_unicode_escape_seq.py
@@ -0,0 +1,33 @@
+x = "\x1F"
+x = "\\x1B"
+x = "\\\x1B"
+x = "\U0001F60E"
+x = "\u0001F60E"
+x = r"\u0001F60E"
+x = "don't format me"
+x = "\xA3"
+x = "\u2717"
+x = "\uFaCe"
+x = "\N{ox}\N{OX}"
+x = "\N{lAtIn smaLL letteR x}"
+x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
+x = b"\x1Fdon't byte"
+x = rb"\x1Fdon't format"
+
+# output
+
+x = "\x1f"
+x = "\\x1B"
+x = "\\\x1b"
+x = "\U0001f60e"
+x = "\u0001F60E"
+x = r"\u0001F60E"
+x = "don't format me"
+x = "\xa3"
+x = "\u2717"
+x = "\uface"
+x = "\N{OX}\N{OX}"
+x = "\N{LATIN SMALL LETTER X}"
+x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
+x = b"\x1fdon't byte"
+x = rb"\x1Fdon't format"
author	Shivansh-007 <shivansh-007@outlook.com>
	Sun, 22 Jan 2023 13:21:09 +0000 (18:51 +0530)
committer	GitHub <noreply@github.com>
	Sun, 22 Jan 2023 13:21:09 +0000 (05:21 -0800)
CHANGES.md		patch | blob | history
src/black/linegen.py		patch | blob | history
src/black/mode.py		patch | blob | history
src/black/strings.py		patch | blob | history
tests/data/preview/format_unicode_escape_seq.py	[new file with mode: 0644]	patch | blob