]> git.madduck.net Git - etc/vim.git/blobdiff - src/black/strings.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Add SECURITY.md (#3612)
[etc/vim.git] / src / black / strings.py
index 97debe3b5de08569e24f5ca8abcdbbfe4e258d21..3e3bc12fe728cd4a23f3b19395506e88b109723c 100644 (file)
@@ -2,10 +2,12 @@
 Simple formatting on strings. Further string formatting code is in trans.py.
 """
 
 Simple formatting on strings. Further string formatting code is in trans.py.
 """
 
-import regex as re
+import re
 import sys
 from functools import lru_cache
 import sys
 from functools import lru_cache
-from typing import List, Pattern
+from typing import List, Match, Pattern
+
+from blib2to3.pytree import Leaf
 
 if sys.version_info < (3, 8):
     from typing_extensions import Final
 
 if sys.version_info < (3, 8):
     from typing_extensions import Final
@@ -18,6 +20,15 @@ STRING_PREFIX_RE: Final = re.compile(
     r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
 )
 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
     r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
 )
 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
+UNICODE_ESCAPE_RE: Final = re.compile(
+    r"(?P<backslashes>\\+)(?P<body>"
+    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
+    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
+    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
+    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
+    r")",
+    re.VERBOSE,
+)
 
 
 def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
 
 
 def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
@@ -138,17 +149,21 @@ def assert_is_leaf_string(string: str) -> None:
     ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
 
 
     ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
 
 
-def normalize_string_prefix(s: str, remove_u_prefix: bool = False) -> str:
-    """Make all string prefixes lowercase.
-
-    If remove_u_prefix is given, also removes any u prefix from the string.
-    """
+def normalize_string_prefix(s: str) -> str:
+    """Make all string prefixes lowercase."""
     match = STRING_PREFIX_RE.match(s)
     assert match is not None, f"failed to match string {s!r}"
     orig_prefix = match.group(1)
     match = STRING_PREFIX_RE.match(s)
     assert match is not None, f"failed to match string {s!r}"
     orig_prefix = match.group(1)
-    new_prefix = orig_prefix.replace("F", "f").replace("B", "b").replace("U", "u")
-    if remove_u_prefix:
-        new_prefix = new_prefix.replace("u", "")
+    new_prefix = (
+        orig_prefix.replace("F", "f")
+        .replace("B", "b")
+        .replace("U", "")
+        .replace("u", "")
+    )
+
+    # Python syntax guarantees max 2 prefixes and that one of them is "r"
+    if len(new_prefix) == 2 and "r" != new_prefix[0].lower():
+        new_prefix = new_prefix[::-1]
     return f"{new_prefix}{match.group(2)}"
 
 
     return f"{new_prefix}{match.group(2)}"
 
 
@@ -156,7 +171,7 @@ def normalize_string_prefix(s: str, remove_u_prefix: bool = False) -> str:
 # performance on a long list literal of strings by 5-9% since lru_cache's
 # caching overhead is much lower.
 @lru_cache(maxsize=64)
 # performance on a long list literal of strings by 5-9% since lru_cache's
 # caching overhead is much lower.
 @lru_cache(maxsize=64)
-def _cached_compile(pattern: str) -> re.Pattern:
+def _cached_compile(pattern: str) -> Pattern[str]:
     return re.compile(pattern)
 
 
     return re.compile(pattern)
 
 
@@ -232,3 +247,34 @@ def normalize_string_quotes(s: str) -> str:
         return s  # Prefer double quotes
 
     return f"{prefix}{new_quote}{new_body}{new_quote}"
         return s  # Prefer double quotes
 
     return f"{prefix}{new_quote}{new_body}{new_quote}"
+
+
+def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
+    """Replace hex codes in Unicode escape sequences with lowercase representation."""
+    text = leaf.value
+    prefix = get_string_prefix(text)
+    if "r" in prefix.lower():
+        return
+
+    def replace(m: Match[str]) -> str:
+        groups = m.groupdict()
+        back_slashes = groups["backslashes"]
+
+        if len(back_slashes) % 2 == 0:
+            return back_slashes + groups["body"]
+
+        if groups["u"]:
+            # \u
+            return back_slashes + "u" + groups["u"].lower()
+        elif groups["U"]:
+            # \U
+            return back_slashes + "U" + groups["U"].lower()
+        elif groups["x"]:
+            # \x
+            return back_slashes + "x" + groups["x"].lower()
+        else:
+            assert groups["N"], f"Unexpected match: {m}"
+            # \N{}
+            return back_slashes + "N{" + groups["N"].upper() + "}"
+
+    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)