Add SECURITY.md (#3612)

[etc/vim.git] / src / black / strings.py
diff --git a/src/black/strings.py b/src/black/strings.py

index 97debe3b5de08569e24f5ca8abcdbbfe4e258d21..3e3bc12fe728cd4a23f3b19395506e88b109723c 100644 (file)
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -2,10 +2,12 @@
  Simple formatting on strings. Further string formatting code is in trans.py.
  """
  
  Simple formatting on strings. Further string formatting code is in trans.py.
  """
  
-import regex as re
+import re
  import sys
  from functools import lru_cache
  import sys
  from functools import lru_cache
-from typing import List, Pattern
+from typing import List, Match, Pattern
+
+from blib2to3.pytree import Leaf
  
  if sys.version_info < (3, 8):
      from typing_extensions import Final
  
  if sys.version_info < (3, 8):
      from typing_extensions import Final
@@ -18,6 +20,15 @@ STRING_PREFIX_RE: Final = re.compile(
      r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
  )
  FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
      r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
  )
  FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
+UNICODE_ESCAPE_RE: Final = re.compile(  # one or more backslashes followed by an xhh / uxxxx / Uxxxxxxxx / N{name} body
+    r"(?P<backslashes>\\+)(?P<body>"  # captures the whole backslash run so a caller can test its parity
+    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
+    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
+    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
+    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
+    r")",  # closes the "body" group
+    re.VERBOSE,  # NOTE(review): the pattern's only whitespace sits inside a character class, where VERBOSE keeps it; flag looks redundant - confirm
+)
  
  
  def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
  
  
  def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
@@ -138,17 +149,21 @@ def assert_is_leaf_string(string: str) -> None:
      ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
  
  
      ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
  
  
-def normalize_string_prefix(s: str, remove_u_prefix: bool = False) -> str:
-    """Make all string prefixes lowercase.
-
-    If remove_u_prefix is given, also removes any u prefix from the string.
-    """
+def normalize_string_prefix(s: str) -> str:
+    """Make all string prefixes lowercase."""
      match = STRING_PREFIX_RE.match(s)
      assert match is not None, f"failed to match string {s!r}"
      orig_prefix = match.group(1)
      match = STRING_PREFIX_RE.match(s)
      assert match is not None, f"failed to match string {s!r}"
      orig_prefix = match.group(1)
-    new_prefix = orig_prefix.replace("F", "f").replace("B", "b").replace("U", "u")
-    if remove_u_prefix:
-        new_prefix = new_prefix.replace("u", "")
+    new_prefix = (
+        orig_prefix.replace("F", "f")
+        .replace("B", "b")
+        .replace("U", "")
+        .replace("u", "")
+    )
+
+    # Python syntax guarantees max 2 prefixes and that one of them is "r"
+    if len(new_prefix) == 2 and "r" != new_prefix[0].lower():
+        new_prefix = new_prefix[::-1]
      return f"{new_prefix}{match.group(2)}"
  
  
      return f"{new_prefix}{match.group(2)}"
  
  
@@ -156,7 +171,7 @@ def normalize_string_prefix(s: str, remove_u_prefix: bool = False) -> str:
  # performance on a long list literal of strings by 5-9% since lru_cache's
  # caching overhead is much lower.
  @lru_cache(maxsize=64)
  # performance on a long list literal of strings by 5-9% since lru_cache's
  # caching overhead is much lower.
  @lru_cache(maxsize=64)
-def _cached_compile(pattern: str) -> re.Pattern:
+def _cached_compile(pattern: str) -> Pattern[str]:
      return re.compile(pattern)
  
  
      return re.compile(pattern)
  
  
@@ -232,3 +247,34 @@ def normalize_string_quotes(s: str) -> str:
          return s  # Prefer double quotes
  
      return f"{prefix}{new_quote}{new_body}{new_quote}"
          return s  # Prefer double quotes
  
      return f"{prefix}{new_quote}{new_body}{new_quote}"
+
+
+def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
+    r"""Normalize the letter case used inside escape sequences of a string leaf.
+
+    Hex digits of ``\xhh``, ``\uxxxx`` and ``\Uxxxxxxxx`` are lowercased and the
+    name inside ``\N{name}`` is uppercased. ``leaf.value`` is rewritten in
+    place; nothing is returned.
+
+    Literals are handled according to their prefix:
+
+    * raw literals (an ``r``/``R`` in the prefix) are left untouched, because a
+      backslash does not start an escape sequence there;
+    * bytes literals (a ``b``/``B`` in the prefix) only get ``\x`` normalized,
+      because ``\u``, ``\U`` and ``\N{...}`` are not escape sequences in bytes
+      and changing their case would change the value of the literal.
+    """
+    text = leaf.value
+    prefix = get_string_prefix(text).lower()
+    if "r" in prefix:
+        return
+    is_bytes = "b" in prefix
+
+    def replace(m: Match[str]) -> str:
+        groups = m.groupdict()
+        back_slashes = groups["backslashes"]
+
+        # An even run of backslashes is made of escaped backslashes only, so
+        # the body that follows is plain text and must not be modified.
+        if len(back_slashes) % 2 == 0:
+            return m.group(0)
+
+        if groups["x"]:
+            # \x is an escape sequence in both str and bytes literals.
+            return back_slashes + "x" + groups["x"].lower()
+
+        if is_bytes:
+            # \u, \U and \N{} are literal characters inside a bytes literal.
+            return m.group(0)
+
+        if groups["u"]:
+            # \u
+            return back_slashes + "u" + groups["u"].lower()
+        elif groups["U"]:
+            # \U
+            return back_slashes + "U" + groups["U"].lower()
+        else:
+            assert groups["N"], f"Unexpected match: {m}"
+            # \N{}
+            return back_slashes + "N{" + groups["N"].upper() + "}"
+
+    leaf.value = UNICODE_ESCAPE_RE.sub(replace, text)