src/black/parsing.py

   1 """
   2 Parse Python code and perform AST validation.
   3 """
   4 import ast
   5 import sys
   6 from typing import Iterable, Iterator, List, Set, Tuple
   7
   8 from black.mode import VERSION_TO_FEATURES, Feature, TargetVersion, supports_feature
   9 from black.nodes import syms
  10 from blib2to3 import pygram
  11 from blib2to3.pgen2 import driver
  12 from blib2to3.pgen2.grammar import Grammar
  13 from blib2to3.pgen2.parse import ParseError
  14 from blib2to3.pgen2.tokenize import TokenError
  15 from blib2to3.pytree import Leaf, Node
  16
  17
  18 class InvalidInput(ValueError):
  19     """Raised when input source code fails all parse attempts."""
  20
  21
  22 def get_grammars(target_versions: Set[TargetVersion]) -> List[Grammar]:
  23     if not target_versions:
  24         # No target_version specified, so try all grammars.
  25         return [
  26             # Python 3.7-3.9
  27             pygram.python_grammar_async_keywords,
  28             # Python 3.0-3.6
  29             pygram.python_grammar,
  30             # Python 3.10+
  31             pygram.python_grammar_soft_keywords,
  32         ]
  33
  34     grammars = []
  35     # If we have to parse both, try to parse async as a keyword first
  36     if not supports_feature(
  37         target_versions, Feature.ASYNC_IDENTIFIERS
  38     ) and not supports_feature(target_versions, Feature.PATTERN_MATCHING):
  39         # Python 3.7-3.9
  40         grammars.append(pygram.python_grammar_async_keywords)
  41     if not supports_feature(target_versions, Feature.ASYNC_KEYWORDS):
  42         # Python 3.0-3.6
  43         grammars.append(pygram.python_grammar)
  44     if any(Feature.PATTERN_MATCHING in VERSION_TO_FEATURES[v] for v in target_versions):
  45         # Python 3.10+
  46         grammars.append(pygram.python_grammar_soft_keywords)
  47
  48     # At least one of the above branches must have been taken, because every Python
  49     # version has exactly one of the two 'ASYNC_*' flags
  50     return grammars
  51
  52
  53 def lib2to3_parse(src_txt: str, target_versions: Iterable[TargetVersion] = ()) -> Node:
  54     """Given a string with source, return the lib2to3 Node."""
  55     if not src_txt.endswith("\n"):
  56         src_txt += "\n"
  57
  58     grammars = get_grammars(set(target_versions))
  59     errors = {}
  60     for grammar in grammars:
  61         drv = driver.Driver(grammar)
  62         try:
  63             result = drv.parse_string(src_txt, True)
  64             break
  65
  66         except ParseError as pe:
  67             lineno, column = pe.context[1]
  68             lines = src_txt.splitlines()
  69             try:
  70                 faulty_line = lines[lineno - 1]
  71             except IndexError:
  72                 faulty_line = "<line number missing in source>"
  73             errors[grammar.version] = InvalidInput(
  74                 f"Cannot parse: {lineno}:{column}: {faulty_line}"
  75             )
  76
  77         except TokenError as te:
  78             # In edge cases these are raised; and typically don't have a "faulty_line".
  79             lineno, column = te.args[1]
  80             errors[grammar.version] = InvalidInput(
  81                 f"Cannot parse: {lineno}:{column}: {te.args[0]}"
  82             )
  83
  84     else:
  85         # Choose the latest version when raising the actual parsing error.
  86         assert len(errors) >= 1
  87         exc = errors[max(errors)]
  88         raise exc from None
  89
  90     if isinstance(result, Leaf):
  91         result = Node(syms.file_input, [result])
  92     return result
  93
  94
  95 def matches_grammar(src_txt: str, grammar: Grammar) -> bool:
  96     drv = driver.Driver(grammar)
  97     try:
  98         drv.parse_string(src_txt, True)
  99     except (ParseError, TokenError, IndentationError):
 100         return False
 101     else:
 102         return True
 103
 104
 105 def lib2to3_unparse(node: Node) -> str:
 106     """Given a lib2to3 node, return its string representation."""
 107     code = str(node)
 108     return code
 109
 110
 111 def parse_single_version(
 112     src: str, version: Tuple[int, int], *, type_comments: bool
 113 ) -> ast.AST:
 114     filename = "<unknown>"
 115     return ast.parse(
 116         src, filename, feature_version=version, type_comments=type_comments
 117     )
 118
 119
 120 def parse_ast(src: str) -> ast.AST:
 121     # TODO: support Python 4+ ;)
 122     versions = [(3, minor) for minor in range(3, sys.version_info[1] + 1)]
 123
 124     first_error = ""
 125     for version in sorted(versions, reverse=True):
 126         try:
 127             return parse_single_version(src, version, type_comments=True)
 128         except SyntaxError as e:
 129             if not first_error:
 130                 first_error = str(e)
 131
 132     # Try to parse without type comments
 133     for version in sorted(versions, reverse=True):
 134         try:
 135             return parse_single_version(src, version, type_comments=False)
 136         except SyntaxError:
 137             pass
 138
 139     raise SyntaxError(first_error)
 140
 141
 142 def _normalize(lineend: str, value: str) -> str:
 143     # To normalize, we strip any leading and trailing space from
 144     # each line...
 145     stripped: List[str] = [i.strip() for i in value.splitlines()]
 146     normalized = lineend.join(stripped)
 147     # ...and remove any blank lines at the beginning and end of
 148     # the whole string
 149     return normalized.strip()
 150
 151
 152 def stringify_ast(node: ast.AST, depth: int = 0) -> Iterator[str]:
 153     """Simple visitor generating strings to compare ASTs by content."""
 154
 155     if (
 156         isinstance(node, ast.Constant)
 157         and isinstance(node.value, str)
 158         and node.kind == "u"
 159     ):
 160         # It's a quirk of history that we strip the u prefix over here. We used to
 161         # rewrite the AST nodes for Python version compatibility and we never copied
 162         # over the kind
 163         node.kind = None
 164
 165     yield f"{'  ' * depth}{node.__class__.__name__}("
 166
 167     for field in sorted(node._fields):  # noqa: F402
 168         # TypeIgnore has only one field 'lineno' which breaks this comparison
 169         if isinstance(node, ast.TypeIgnore):
 170             break
 171
 172         try:
 173             value: object = getattr(node, field)
 174         except AttributeError:
 175             continue
 176
 177         yield f"{'  ' * (depth+1)}{field}="
 178
 179         if isinstance(value, list):
 180             for item in value:
 181                 # Ignore nested tuples within del statements, because we may insert
 182                 # parentheses and they change the AST.
 183                 if (
 184                     field == "targets"
 185                     and isinstance(node, ast.Delete)
 186                     and isinstance(item, ast.Tuple)
 187                 ):
 188                     for elt in item.elts:
 189                         yield from stringify_ast(elt, depth + 2)
 190
 191                 elif isinstance(item, ast.AST):
 192                     yield from stringify_ast(item, depth + 2)
 193
 194         elif isinstance(value, ast.AST):
 195             yield from stringify_ast(value, depth + 2)
 196
 197         else:
 198             normalized: object
 199             if (
 200                 isinstance(node, ast.Constant)
 201                 and field == "value"
 202                 and isinstance(value, str)
 203             ):
 204                 # Constant strings may be indented across newlines, if they are
 205                 # docstrings; fold spaces after newlines when comparing. Similarly,
 206                 # trailing and leading space may be removed.
 207                 normalized = _normalize("\n", value)
 208             elif field == "type_comment" and isinstance(value, str):
 209                 # Trailing whitespace in type comments is removed.
 210                 normalized = value.rstrip()
 211             else:
 212                 normalized = value
 213             yield f"{'  ' * (depth+2)}{normalized!r},  # {value.__class__.__name__}"
 214
 215     yield f"{'  ' * depth})  # /{node.__class__.__name__}"