src/black/parsing.py

   1 """
   2 Parse Python code and perform AST validation.
   3 """
   4 import ast
   5 import sys
   6 from typing import Final, Iterable, Iterator, List, Set, Tuple
   7
   8 from black.mode import VERSION_TO_FEATURES, Feature, TargetVersion, supports_feature
   9 from black.nodes import syms
  10 from blib2to3 import pygram
  11 from blib2to3.pgen2 import driver
  12 from blib2to3.pgen2.grammar import Grammar
  13 from blib2to3.pgen2.parse import ParseError
  14 from blib2to3.pgen2.tokenize import TokenError
  15 from blib2to3.pytree import Leaf, Node
  16
# Hint appended to the parse error raised by lib2to3_parse() when the source is
# rejected by every selected grammar but is accepted by pygram.python_grammar
# or pygram.python_grammar_no_print_statement.
PY2_HINT: Final = "Python 2 support was removed in version 22.0."
  18
  19
  20 class InvalidInput(ValueError):
  21     """Raised when input source code fails all parse attempts."""
  22
  23
  24 def get_grammars(target_versions: Set[TargetVersion]) -> List[Grammar]:
  25     if not target_versions:
  26         # No target_version specified, so try all grammars.
  27         return [
  28             # Python 3.7-3.9
  29             pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords,
  30             # Python 3.0-3.6
  31             pygram.python_grammar_no_print_statement_no_exec_statement,
  32             # Python 3.10+
  33             pygram.python_grammar_soft_keywords,
  34         ]
  35
  36     grammars = []
  37     # If we have to parse both, try to parse async as a keyword first
  38     if not supports_feature(
  39         target_versions, Feature.ASYNC_IDENTIFIERS
  40     ) and not supports_feature(target_versions, Feature.PATTERN_MATCHING):
  41         # Python 3.7-3.9
  42         grammars.append(
  43             pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords
  44         )
  45     if not supports_feature(target_versions, Feature.ASYNC_KEYWORDS):
  46         # Python 3.0-3.6
  47         grammars.append(pygram.python_grammar_no_print_statement_no_exec_statement)
  48     if any(Feature.PATTERN_MATCHING in VERSION_TO_FEATURES[v] for v in target_versions):
  49         # Python 3.10+
  50         grammars.append(pygram.python_grammar_soft_keywords)
  51
  52     # At least one of the above branches must have been taken, because every Python
  53     # version has exactly one of the two 'ASYNC_*' flags
  54     return grammars
  55
  56
  57 def lib2to3_parse(src_txt: str, target_versions: Iterable[TargetVersion] = ()) -> Node:
  58     """Given a string with source, return the lib2to3 Node."""
  59     if not src_txt.endswith("\n"):
  60         src_txt += "\n"
  61
  62     grammars = get_grammars(set(target_versions))
  63     errors = {}
  64     for grammar in grammars:
  65         drv = driver.Driver(grammar)
  66         try:
  67             result = drv.parse_string(src_txt, True)
  68             break
  69
  70         except ParseError as pe:
  71             lineno, column = pe.context[1]
  72             lines = src_txt.splitlines()
  73             try:
  74                 faulty_line = lines[lineno - 1]
  75             except IndexError:
  76                 faulty_line = "<line number missing in source>"
  77             errors[grammar.version] = InvalidInput(
  78                 f"Cannot parse: {lineno}:{column}: {faulty_line}"
  79             )
  80
  81         except TokenError as te:
  82             # In edge cases these are raised; and typically don't have a "faulty_line".
  83             lineno, column = te.args[1]
  84             errors[grammar.version] = InvalidInput(
  85                 f"Cannot parse: {lineno}:{column}: {te.args[0]}"
  86             )
  87
  88     else:
  89         # Choose the latest version when raising the actual parsing error.
  90         assert len(errors) >= 1
  91         exc = errors[max(errors)]
  92
  93         if matches_grammar(src_txt, pygram.python_grammar) or matches_grammar(
  94             src_txt, pygram.python_grammar_no_print_statement
  95         ):
  96             original_msg = exc.args[0]
  97             msg = f"{original_msg}\n{PY2_HINT}"
  98             raise InvalidInput(msg) from None
  99
 100         raise exc from None
 101
 102     if isinstance(result, Leaf):
 103         result = Node(syms.file_input, [result])
 104     return result
 105
 106
 107 def matches_grammar(src_txt: str, grammar: Grammar) -> bool:
 108     drv = driver.Driver(grammar)
 109     try:
 110         drv.parse_string(src_txt, True)
 111     except (ParseError, TokenError, IndentationError):
 112         return False
 113     else:
 114         return True
 115
 116
 117 def lib2to3_unparse(node: Node) -> str:
 118     """Given a lib2to3 node, return its string representation."""
 119     code = str(node)
 120     return code
 121
 122
 123 def parse_single_version(
 124     src: str, version: Tuple[int, int], *, type_comments: bool
 125 ) -> ast.AST:
 126     filename = "<unknown>"
 127     return ast.parse(
 128         src, filename, feature_version=version, type_comments=type_comments
 129     )
 130
 131
 132 def parse_ast(src: str) -> ast.AST:
 133     # TODO: support Python 4+ ;)
 134     versions = [(3, minor) for minor in range(3, sys.version_info[1] + 1)]
 135
 136     first_error = ""
 137     for version in sorted(versions, reverse=True):
 138         try:
 139             return parse_single_version(src, version, type_comments=True)
 140         except SyntaxError as e:
 141             if not first_error:
 142                 first_error = str(e)
 143
 144     # Try to parse without type comments
 145     for version in sorted(versions, reverse=True):
 146         try:
 147             return parse_single_version(src, version, type_comments=False)
 148         except SyntaxError:
 149             pass
 150
 151     raise SyntaxError(first_error)
 152
 153
 154 def _normalize(lineend: str, value: str) -> str:
 155     # To normalize, we strip any leading and trailing space from
 156     # each line...
 157     stripped: List[str] = [i.strip() for i in value.splitlines()]
 158     normalized = lineend.join(stripped)
 159     # ...and remove any blank lines at the beginning and end of
 160     # the whole string
 161     return normalized.strip()
 162
 163
 164 def stringify_ast(node: ast.AST, depth: int = 0) -> Iterator[str]:
 165     """Simple visitor generating strings to compare ASTs by content."""
 166
 167     if (
 168         isinstance(node, ast.Constant)
 169         and isinstance(node.value, str)
 170         and node.kind == "u"
 171     ):
 172         # It's a quirk of history that we strip the u prefix over here. We used to
 173         # rewrite the AST nodes for Python version compatibility and we never copied
 174         # over the kind
 175         node.kind = None
 176
 177     yield f"{'  ' * depth}{node.__class__.__name__}("
 178
 179     for field in sorted(node._fields):  # noqa: F402
 180         # TypeIgnore has only one field 'lineno' which breaks this comparison
 181         if isinstance(node, ast.TypeIgnore):
 182             break
 183
 184         try:
 185             value: object = getattr(node, field)
 186         except AttributeError:
 187             continue
 188
 189         yield f"{'  ' * (depth+1)}{field}="
 190
 191         if isinstance(value, list):
 192             for item in value:
 193                 # Ignore nested tuples within del statements, because we may insert
 194                 # parentheses and they change the AST.
 195                 if (
 196                     field == "targets"
 197                     and isinstance(node, ast.Delete)
 198                     and isinstance(item, ast.Tuple)
 199                 ):
 200                     for elt in item.elts:
 201                         yield from stringify_ast(elt, depth + 2)
 202
 203                 elif isinstance(item, ast.AST):
 204                     yield from stringify_ast(item, depth + 2)
 205
 206         elif isinstance(value, ast.AST):
 207             yield from stringify_ast(value, depth + 2)
 208
 209         else:
 210             normalized: object
 211             # Constant strings may be indented across newlines, if they are
 212             # docstrings; fold spaces after newlines when comparing. Similarly,
 213             # trailing and leading space may be removed.
 214             if (
 215                 isinstance(node, ast.Constant)
 216                 and field == "value"
 217                 and isinstance(value, str)
 218             ):
 219                 normalized = _normalize("\n", value)
 220             else:
 221                 normalized = value
 222             yield f"{'  ' * (depth+2)}{normalized!r},  # {value.__class__.__name__}"
 223
 224     yield f"{'  ' * depth})  # /{node.__class__.__name__}"