src/blib2to3/pgen2/grammar.py

   1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
   2 # Licensed to PSF under a Contributor Agreement.
   3
   4 """This module defines the data structures used to represent a grammar.
   5
   6 These are a bit arcane because they are derived from the data
   7 structures used by Python's 'pgen' parser generator.
   8
   9 There's also a table here mapping operators to their names in the
  10 token module; the Python tokenize module reports all operators as the
  11 fallback token code OP, but the parser needs the actual token code.
  12
  13 """
  14
  15 # Python imports
  16 import os
  17 import pickle
  18 import tempfile
  19 from typing import Any, Dict, List, Optional, Text, Tuple, TypeVar, Union
  20
  21 # Local imports
  22 from . import token
  23
# Type variable bound to Grammar; Grammar.copy() uses it so that the
# annotated return type is the same class as the instance being copied.
_P = TypeVar("_P", bound="Grammar")
# One entry of Grammar.labels: a (number, string-or-None) pair.
Label = Tuple[int, Optional[Text]]
# A single DFA: a list of states, each state being a list of
# (label, state number) arcs.
DFA = List[List[Tuple[int, int]]]
# The value type of Grammar.dfas: a (DFA, first) pair, where "first" is a
# dict standing in for a set.
DFAS = Tuple[DFA, Dict[int, int]]
# Anything accepted as a filename by Grammar.dump() and Grammar.load().
Path = Union[str, "os.PathLike[str]"]
  29
  30
  31 class Grammar(object):
  32     """Pgen parsing tables conversion class.
  33
  34     Once initialized, this class supplies the grammar tables for the
  35     parsing engine implemented by parse.py.  The parsing engine
  36     accesses the instance variables directly.  The class here does not
  37     provide initialization of the tables; several subclasses exist to
  38     do this (see the conv and pgen modules).
  39
  40     The load() method reads the tables from a pickle file, which is
  41     much faster than the other ways offered by subclasses.  The pickle
  42     file is written by calling dump() (after loading the grammar
  43     tables using a subclass).  The report() method prints a readable
  44     representation of the tables to stdout, for debugging.
  45
  46     The instance variables are as follows:
  47
  48     symbol2number -- a dict mapping symbol names to numbers.  Symbol
  49                      numbers are always 256 or higher, to distinguish
  50                      them from token numbers, which are between 0 and
  51                      255 (inclusive).
  52
  53     number2symbol -- a dict mapping numbers to symbol names;
  54                      these two are each other's inverse.
  55
  56     states        -- a list of DFAs, where each DFA is a list of
  57                      states, each state is a list of arcs, and each
  58                      arc is a (i, j) pair where i is a label and j is
  59                      a state number.  The DFA number is the index into
  60                      this list.  (This name is slightly confusing.)
  61                      Final states are represented by a special arc of
  62                      the form (0, j) where j is its own state number.
  63
  64     dfas          -- a dict mapping symbol numbers to (DFA, first)
  65                      pairs, where DFA is an item from the states list
  66                      above, and first is a set of tokens that can
  67                      begin this grammar rule (represented by a dict
  68                      whose values are always 1).
  69
  70     labels        -- a list of (x, y) pairs where x is either a token
  71                      number or a symbol number, and y is either None
  72                      or a string; the strings are keywords.  The label
  73                      number is the index in this list; label numbers
  74                      are used to mark state transitions (arcs) in the
  75                      DFAs.
  76
  77     start         -- the number of the grammar's start symbol.
  78
  79     keywords      -- a dict mapping keyword strings to arc labels.
  80
  81     tokens        -- a dict mapping token numbers to arc labels.
  82
  83     """
  84
  85     def __init__(self) -> None:
  86         self.symbol2number: Dict[str, int] = {}
  87         self.number2symbol: Dict[int, str] = {}
  88         self.states: List[DFA] = []
  89         self.dfas: Dict[int, DFAS] = {}
  90         self.labels: List[Label] = [(0, "EMPTY")]
  91         self.keywords: Dict[str, int] = {}
  92         self.tokens: Dict[int, int] = {}
  93         self.symbol2label: Dict[str, int] = {}
  94         self.start = 256
  95         # Python 3.7+ parses async as a keyword, not an identifier
  96         self.async_keywords = False
  97
  98     def dump(self, filename: Path) -> None:
  99         """Dump the grammar tables to a pickle file."""
 100
 101         # mypyc generates objects that don't have a __dict__, but they
 102         # do have __getstate__ methods that will return an equivalent
 103         # dictionary
 104         if hasattr(self, "__dict__"):
 105             d = self.__dict__
 106         else:
 107             d = self.__getstate__()  # type: ignore
 108
 109         with tempfile.NamedTemporaryFile(
 110             dir=os.path.dirname(filename), delete=False
 111         ) as f:
 112             pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)
 113         os.replace(f.name, filename)
 114
 115     def _update(self, attrs: Dict[str, Any]) -> None:
 116         for k, v in attrs.items():
 117             setattr(self, k, v)
 118
 119     def load(self, filename: Path) -> None:
 120         """Load the grammar tables from a pickle file."""
 121         with open(filename, "rb") as f:
 122             d = pickle.load(f)
 123         self._update(d)
 124
 125     def loads(self, pkl: bytes) -> None:
 126         """Load the grammar tables from a pickle bytes object."""
 127         self._update(pickle.loads(pkl))
 128
 129     def copy(self: _P) -> _P:
 130         """
 131         Copy the grammar.
 132         """
 133         new = self.__class__()
 134         for dict_attr in (
 135             "symbol2number",
 136             "number2symbol",
 137             "dfas",
 138             "keywords",
 139             "tokens",
 140             "symbol2label",
 141         ):
 142             setattr(new, dict_attr, getattr(self, dict_attr).copy())
 143         new.labels = self.labels[:]
 144         new.states = self.states[:]
 145         new.start = self.start
 146         new.async_keywords = self.async_keywords
 147         return new
 148
 149     def report(self) -> None:
 150         """Dump the grammar tables to standard output, for debugging."""
 151         from pprint import pprint
 152
 153         print("s2n")
 154         pprint(self.symbol2number)
 155         print("n2s")
 156         pprint(self.number2symbol)
 157         print("states")
 158         pprint(self.states)
 159         print("dfas")
 160         pprint(self.dfas)
 161         print("labels")
 162         pprint(self.labels)
 163         print("start", self.start)
 164
 165
 166 # Map from operator to number (since tokenize doesn't do this)
 167
 168 opmap_raw = """
 169 ( LPAR
 170 ) RPAR
 171 [ LSQB
 172 ] RSQB
 173 : COLON
 174 , COMMA
 175 ; SEMI
 176 + PLUS
 177 - MINUS
 178 * STAR
 179 / SLASH
 180 | VBAR
 181 & AMPER
 182 < LESS
 183 > GREATER
 184 = EQUAL
 185 . DOT
 186 % PERCENT
 187 ` BACKQUOTE
 188 { LBRACE
 189 } RBRACE
 190 @ AT
 191 @= ATEQUAL
 192 == EQEQUAL
 193 != NOTEQUAL
 194 <> NOTEQUAL
 195 <= LESSEQUAL
 196 >= GREATEREQUAL
 197 ~ TILDE
 198 ^ CIRCUMFLEX
 199 << LEFTSHIFT
 200 >> RIGHTSHIFT
 201 ** DOUBLESTAR
 202 += PLUSEQUAL
 203 -= MINEQUAL
 204 *= STAREQUAL
 205 /= SLASHEQUAL
 206 %= PERCENTEQUAL
 207 &= AMPEREQUAL
 208 |= VBAREQUAL
 209 ^= CIRCUMFLEXEQUAL
 210 <<= LEFTSHIFTEQUAL
 211 >>= RIGHTSHIFTEQUAL
 212 **= DOUBLESTAREQUAL
 213 // DOUBLESLASH
 214 //= DOUBLESLASHEQUAL
 215 -> RARROW
 216 := COLONEQUAL
 217 """
 218
 219 opmap = {}
 220 for line in opmap_raw.splitlines():
 221     if line:
 222         op, name = line.split()
 223         opmap[op] = getattr(token, name)