src/blib2to3/pgen2/grammar.py

   1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
   2 # Licensed to PSF under a Contributor Agreement.
   3
   4 """This module defines the data structures used to represent a grammar.
   5
   6 These are a bit arcane because they are derived from the data
   7 structures used by Python's 'pgen' parser generator.
   8
   9 There's also a table here mapping operators to their names in the
  10 token module; the Python tokenize module reports all operators as the
  11 fallback token code OP, but the parser needs the actual token code.
  12
  13 """
  14
  15 # Python imports
  16 import os
  17 import pickle
  18 import tempfile
  19 from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
  20
  21 # Local imports
  22 from . import token
  23
  24 _P = TypeVar("_P", bound="Grammar")
  25 Label = Tuple[int, Optional[str]]
  26 DFA = List[List[Tuple[int, int]]]
  27 DFAS = Tuple[DFA, Dict[int, int]]
  28 Path = Union[str, "os.PathLike[str]"]
  29
  30
  31 class Grammar:
  32     """Pgen parsing tables conversion class.
  33
  34     Once initialized, this class supplies the grammar tables for the
  35     parsing engine implemented by parse.py.  The parsing engine
  36     accesses the instance variables directly.  The class here does not
  37     provide initialization of the tables; several subclasses exist to
  38     do this (see the conv and pgen modules).
  39
  40     The load() method reads the tables from a pickle file, which is
  41     much faster than the other ways offered by subclasses.  The pickle
  42     file is written by calling dump() (after loading the grammar
  43     tables using a subclass).  The report() method prints a readable
  44     representation of the tables to stdout, for debugging.
  45
  46     The instance variables are as follows:
  47
  48     symbol2number -- a dict mapping symbol names to numbers.  Symbol
  49                      numbers are always 256 or higher, to distinguish
  50                      them from token numbers, which are between 0 and
  51                      255 (inclusive).
  52
  53     number2symbol -- a dict mapping numbers to symbol names;
  54                      these two are each other's inverse.
  55
  56     states        -- a list of DFAs, where each DFA is a list of
  57                      states, each state is a list of arcs, and each
  58                      arc is a (i, j) pair where i is a label and j is
  59                      a state number.  The DFA number is the index into
  60                      this list.  (This name is slightly confusing.)
  61                      Final states are represented by a special arc of
  62                      the form (0, j) where j is its own state number.
  63
  64     dfas          -- a dict mapping symbol numbers to (DFA, first)
  65                      pairs, where DFA is an item from the states list
  66                      above, and first is a set of tokens that can
  67                      begin this grammar rule (represented by a dict
  68                      whose values are always 1).
  69
  70     labels        -- a list of (x, y) pairs where x is either a token
  71                      number or a symbol number, and y is either None
  72                      or a string; the strings are keywords.  The label
  73                      number is the index in this list; label numbers
  74                      are used to mark state transitions (arcs) in the
  75                      DFAs.
  76
  77     start         -- the number of the grammar's start symbol.
  78
  79     keywords      -- a dict mapping keyword strings to arc labels.
  80
  81     tokens        -- a dict mapping token numbers to arc labels.
  82
  83     """
  84
  85     def __init__(self) -> None:
  86         self.symbol2number: Dict[str, int] = {}
  87         self.number2symbol: Dict[int, str] = {}
  88         self.states: List[DFA] = []
  89         self.dfas: Dict[int, DFAS] = {}
  90         self.labels: List[Label] = [(0, "EMPTY")]
  91         self.keywords: Dict[str, int] = {}
  92         self.soft_keywords: Dict[str, int] = {}
  93         self.tokens: Dict[int, int] = {}
  94         self.symbol2label: Dict[str, int] = {}
  95         self.version: Tuple[int, int] = (0, 0)
  96         self.start = 256
  97         # Python 3.7+ parses async as a keyword, not an identifier
  98         self.async_keywords = False
  99
 100     def dump(self, filename: Path) -> None:
 101         """Dump the grammar tables to a pickle file."""
 102
 103         # mypyc generates objects that don't have a __dict__, but they
 104         # do have __getstate__ methods that will return an equivalent
 105         # dictionary
 106         if hasattr(self, "__dict__"):
 107             d = self.__dict__
 108         else:
 109             d = self.__getstate__()  # type: ignore
 110
 111         with tempfile.NamedTemporaryFile(
 112             dir=os.path.dirname(filename), delete=False
 113         ) as f:
 114             pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)
 115         os.replace(f.name, filename)
 116
 117     def _update(self, attrs: Dict[str, Any]) -> None:
 118         for k, v in attrs.items():
 119             setattr(self, k, v)
 120
 121     def load(self, filename: Path) -> None:
 122         """Load the grammar tables from a pickle file."""
 123         with open(filename, "rb") as f:
 124             d = pickle.load(f)
 125         self._update(d)
 126
 127     def loads(self, pkl: bytes) -> None:
 128         """Load the grammar tables from a pickle bytes object."""
 129         self._update(pickle.loads(pkl))
 130
 131     def copy(self: _P) -> _P:
 132         """
 133         Copy the grammar.
 134         """
 135         new = self.__class__()
 136         for dict_attr in (
 137             "symbol2number",
 138             "number2symbol",
 139             "dfas",
 140             "keywords",
 141             "soft_keywords",
 142             "tokens",
 143             "symbol2label",
 144         ):
 145             setattr(new, dict_attr, getattr(self, dict_attr).copy())
 146         new.labels = self.labels[:]
 147         new.states = self.states[:]
 148         new.start = self.start
 149         new.version = self.version
 150         new.async_keywords = self.async_keywords
 151         return new
 152
 153     def report(self) -> None:
 154         """Dump the grammar tables to standard output, for debugging."""
 155         from pprint import pprint
 156
 157         print("s2n")
 158         pprint(self.symbol2number)
 159         print("n2s")
 160         pprint(self.number2symbol)
 161         print("states")
 162         pprint(self.states)
 163         print("dfas")
 164         pprint(self.dfas)
 165         print("labels")
 166         pprint(self.labels)
 167         print("start", self.start)
 168
 169
 170 # Map from operator to number (since tokenize doesn't do this)
 171
 172 opmap_raw = """
 173 ( LPAR
 174 ) RPAR
 175 [ LSQB
 176 ] RSQB
 177 : COLON
 178 , COMMA
 179 ; SEMI
 180 + PLUS
 181 - MINUS
 182 * STAR
 183 / SLASH
 184 | VBAR
 185 & AMPER
 186 < LESS
 187 > GREATER
 188 = EQUAL
 189 . DOT
 190 % PERCENT
 191 ` BACKQUOTE
 192 { LBRACE
 193 } RBRACE
 194 @ AT
 195 @= ATEQUAL
 196 == EQEQUAL
 197 != NOTEQUAL
 198 <> NOTEQUAL
 199 <= LESSEQUAL
 200 >= GREATEREQUAL
 201 ~ TILDE
 202 ^ CIRCUMFLEX
 203 << LEFTSHIFT
 204 >> RIGHTSHIFT
 205 ** DOUBLESTAR
 206 += PLUSEQUAL
 207 -= MINEQUAL
 208 *= STAREQUAL
 209 /= SLASHEQUAL
 210 %= PERCENTEQUAL
 211 &= AMPEREQUAL
 212 |= VBAREQUAL
 213 ^= CIRCUMFLEXEQUAL
 214 <<= LEFTSHIFTEQUAL
 215 >>= RIGHTSHIFTEQUAL
 216 **= DOUBLESTAREQUAL
 217 // DOUBLESLASH
 218 //= DOUBLESLASHEQUAL
 219 -> RARROW
 220 := COLONEQUAL
 221 """
 222
 223 opmap = {}
 224 for line in opmap_raw.splitlines():
 225     if line:
 226         op, name = line.split()
 227         opmap[op] = getattr(token, name)