src/black/parsing.py

   1 """
   2 Parse Python code and perform AST validation.
   3 """
   4
   5 import ast
   6 import sys
   7 from typing import Iterable, Iterator, List, Set, Tuple
   8
   9 from black.mode import VERSION_TO_FEATURES, Feature, TargetVersion, supports_feature
  10 from black.nodes import syms
  11 from blib2to3 import pygram
  12 from blib2to3.pgen2 import driver
  13 from blib2to3.pgen2.grammar import Grammar
  14 from blib2to3.pgen2.parse import ParseError
  15 from blib2to3.pgen2.tokenize import TokenError
  16 from blib2to3.pytree import Leaf, Node
  17
  18
  19 class InvalidInput(ValueError):
  20     """Raised when input source code fails all parse attempts."""
  21
  22
  23 def get_grammars(target_versions: Set[TargetVersion]) -> List[Grammar]:
  24     if not target_versions:
  25         # No target_version specified, so try all grammars.
  26         return [
  27             # Python 3.7-3.9
  28             pygram.python_grammar_async_keywords,
  29             # Python 3.0-3.6
  30             pygram.python_grammar,
  31             # Python 3.10+
  32             pygram.python_grammar_soft_keywords,
  33         ]
  34
  35     grammars = []
  36     # If we have to parse both, try to parse async as a keyword first
  37     if not supports_feature(
  38         target_versions, Feature.ASYNC_IDENTIFIERS
  39     ) and not supports_feature(target_versions, Feature.PATTERN_MATCHING):
  40         # Python 3.7-3.9
  41         grammars.append(pygram.python_grammar_async_keywords)
  42     if not supports_feature(target_versions, Feature.ASYNC_KEYWORDS):
  43         # Python 3.0-3.6
  44         grammars.append(pygram.python_grammar)
  45     if any(Feature.PATTERN_MATCHING in VERSION_TO_FEATURES[v] for v in target_versions):
  46         # Python 3.10+
  47         grammars.append(pygram.python_grammar_soft_keywords)
  48
  49     # At least one of the above branches must have been taken, because every Python
  50     # version has exactly one of the two 'ASYNC_*' flags
  51     return grammars
  52
  53
  54 def lib2to3_parse(src_txt: str, target_versions: Iterable[TargetVersion] = ()) -> Node:
  55     """Given a string with source, return the lib2to3 Node."""
  56     if not src_txt.endswith("\n"):
  57         src_txt += "\n"
  58
  59     grammars = get_grammars(set(target_versions))
  60     errors = {}
  61     for grammar in grammars:
  62         drv = driver.Driver(grammar)
  63         try:
  64             result = drv.parse_string(src_txt, True)
  65             break
  66
  67         except ParseError as pe:
  68             lineno, column = pe.context[1]
  69             lines = src_txt.splitlines()
  70             try:
  71                 faulty_line = lines[lineno - 1]
  72             except IndexError:
  73                 faulty_line = "<line number missing in source>"
  74             errors[grammar.version] = InvalidInput(
  75                 f"Cannot parse: {lineno}:{column}: {faulty_line}"
  76             )
  77
  78         except TokenError as te:
  79             # In edge cases these are raised; and typically don't have a "faulty_line".
  80             lineno, column = te.args[1]
  81             errors[grammar.version] = InvalidInput(
  82                 f"Cannot parse: {lineno}:{column}: {te.args[0]}"
  83             )
  84
  85     else:
  86         # Choose the latest version when raising the actual parsing error.
  87         assert len(errors) >= 1
  88         exc = errors[max(errors)]
  89         raise exc from None
  90
  91     if isinstance(result, Leaf):
  92         result = Node(syms.file_input, [result])
  93     return result
  94
  95
  96 def matches_grammar(src_txt: str, grammar: Grammar) -> bool:
  97     drv = driver.Driver(grammar)
  98     try:
  99         drv.parse_string(src_txt, True)
 100     except (ParseError, TokenError, IndentationError):
 101         return False
 102     else:
 103         return True
 104
 105
 106 def lib2to3_unparse(node: Node) -> str:
 107     """Given a lib2to3 node, return its string representation."""
 108     code = str(node)
 109     return code
 110
 111
 112 def parse_single_version(
 113     src: str, version: Tuple[int, int], *, type_comments: bool
 114 ) -> ast.AST:
 115     filename = "<unknown>"
 116     return ast.parse(
 117         src, filename, feature_version=version, type_comments=type_comments
 118     )
 119
 120
 121 def parse_ast(src: str) -> ast.AST:
 122     # TODO: support Python 4+ ;)
 123     versions = [(3, minor) for minor in range(3, sys.version_info[1] + 1)]
 124
 125     first_error = ""
 126     for version in sorted(versions, reverse=True):
 127         try:
 128             return parse_single_version(src, version, type_comments=True)
 129         except SyntaxError as e:
 130             if not first_error:
 131                 first_error = str(e)
 132
 133     # Try to parse without type comments
 134     for version in sorted(versions, reverse=True):
 135         try:
 136             return parse_single_version(src, version, type_comments=False)
 137         except SyntaxError:
 138             pass
 139
 140     raise SyntaxError(first_error)
 141
 142
 143 def _normalize(lineend: str, value: str) -> str:
 144     # To normalize, we strip any leading and trailing space from
 145     # each line...
 146     stripped: List[str] = [i.strip() for i in value.splitlines()]
 147     normalized = lineend.join(stripped)
 148     # ...and remove any blank lines at the beginning and end of
 149     # the whole string
 150     return normalized.strip()
 151
 152
 153 def stringify_ast(node: ast.AST, depth: int = 0) -> Iterator[str]:
 154     """Simple visitor generating strings to compare ASTs by content."""
 155
 156     if (
 157         isinstance(node, ast.Constant)
 158         and isinstance(node.value, str)
 159         and node.kind == "u"
 160     ):
 161         # It's a quirk of history that we strip the u prefix over here. We used to
 162         # rewrite the AST nodes for Python version compatibility and we never copied
 163         # over the kind
 164         node.kind = None
 165
 166     yield f"{'  ' * depth}{node.__class__.__name__}("
 167
 168     for field in sorted(node._fields):  # noqa: F402
 169         # TypeIgnore has only one field 'lineno' which breaks this comparison
 170         if isinstance(node, ast.TypeIgnore):
 171             break
 172
 173         try:
 174             value: object = getattr(node, field)
 175         except AttributeError:
 176             continue
 177
 178         yield f"{'  ' * (depth+1)}{field}="
 179
 180         if isinstance(value, list):
 181             for item in value:
 182                 # Ignore nested tuples within del statements, because we may insert
 183                 # parentheses and they change the AST.
 184                 if (
 185                     field == "targets"
 186                     and isinstance(node, ast.Delete)
 187                     and isinstance(item, ast.Tuple)
 188                 ):
 189                     for elt in item.elts:
 190                         yield from stringify_ast(elt, depth + 2)
 191
 192                 elif isinstance(item, ast.AST):
 193                     yield from stringify_ast(item, depth + 2)
 194
 195         elif isinstance(value, ast.AST):
 196             yield from stringify_ast(value, depth + 2)
 197
 198         else:
 199             normalized: object
 200             if (
 201                 isinstance(node, ast.Constant)
 202                 and field == "value"
 203                 and isinstance(value, str)
 204             ):
 205                 # Constant strings may be indented across newlines, if they are
 206                 # docstrings; fold spaces after newlines when comparing. Similarly,
 207                 # trailing and leading space may be removed.
 208                 normalized = _normalize("\n", value)
 209             elif field == "type_comment" and isinstance(value, str):
 210                 # Trailing whitespace in type comments is removed.
 211                 normalized = value.rstrip()
 212             else:
 213                 normalized = value
 214             yield f"{'  ' * (depth+2)}{normalized!r},  # {value.__class__.__name__}"
 215
 216     yield f"{'  ' * depth})  # /{node.__class__.__name__}"