src/blib2to3/pgen2/grammar.py

   1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
   2 # Licensed to PSF under a Contributor Agreement.
   3
   4 """This module defines the data structures used to represent a grammar.
   5
   6 These are a bit arcane because they are derived from the data
   7 structures used by Python's 'pgen' parser generator.
   8
   9 There's also a table here mapping operators to their names in the
  10 token module; the Python tokenize module reports all operators as the
  11 fallback token code OP, but the parser needs the actual token code.
  12
  13 """
  14
  15 # Python imports
  16 import os
  17 import pickle
  18 import tempfile
  19 from typing import Any, Dict, List, Optional, Text, Tuple, TypeVar, Union
  20
  21 # Local imports
  22 from . import token
  23
# Type variable bound to Grammar; Grammar.copy() is annotated with it so the
# return type follows the (sub)class the method is called on.
_P = TypeVar("_P", bound="Grammar")
# Element type of Grammar.labels: (token or symbol number, keyword string or
# None).
Label = Tuple[int, Optional[Text]]
# Element type of Grammar.states: one DFA, i.e. a list of states where each
# state is a list of (label, state number) arcs.
DFA = List[List[Tuple[int, int]]]
# Value type of Grammar.dfas: a DFA paired with its "first" dict.
DFAS = Tuple[DFA, Dict[int, int]]
# File name type accepted by Grammar.dump() and Grammar.load().
Path = Union[str, "os.PathLike[str]"]
  29
  30
  31 class Grammar(object):
  32     """Pgen parsing tables conversion class.
  33
  34     Once initialized, this class supplies the grammar tables for the
  35     parsing engine implemented by parse.py.  The parsing engine
  36     accesses the instance variables directly.  The class here does not
  37     provide initialization of the tables; several subclasses exist to
  38     do this (see the conv and pgen modules).
  39
  40     The load() method reads the tables from a pickle file, which is
  41     much faster than the other ways offered by subclasses.  The pickle
  42     file is written by calling dump() (after loading the grammar
  43     tables using a subclass).  The report() method prints a readable
  44     representation of the tables to stdout, for debugging.
  45
  46     The instance variables are as follows:
  47
  48     symbol2number -- a dict mapping symbol names to numbers.  Symbol
  49                      numbers are always 256 or higher, to distinguish
  50                      them from token numbers, which are between 0 and
  51                      255 (inclusive).
  52
  53     number2symbol -- a dict mapping numbers to symbol names;
  54                      these two are each other's inverse.
  55
  56     states        -- a list of DFAs, where each DFA is a list of
  57                      states, each state is a list of arcs, and each
  58                      arc is a (i, j) pair where i is a label and j is
  59                      a state number.  The DFA number is the index into
  60                      this list.  (This name is slightly confusing.)
  61                      Final states are represented by a special arc of
  62                      the form (0, j) where j is its own state number.
  63
  64     dfas          -- a dict mapping symbol numbers to (DFA, first)
  65                      pairs, where DFA is an item from the states list
  66                      above, and first is a set of tokens that can
  67                      begin this grammar rule (represented by a dict
  68                      whose values are always 1).
  69
  70     labels        -- a list of (x, y) pairs where x is either a token
  71                      number or a symbol number, and y is either None
  72                      or a string; the strings are keywords.  The label
  73                      number is the index in this list; label numbers
  74                      are used to mark state transitions (arcs) in the
  75                      DFAs.
  76
  77     start         -- the number of the grammar's start symbol.
  78
  79     keywords      -- a dict mapping keyword strings to arc labels.
  80
  81     tokens        -- a dict mapping token numbers to arc labels.
  82
  83     """
  84
  85     def __init__(self) -> None:
  86         self.symbol2number: Dict[str, int] = {}
  87         self.number2symbol: Dict[int, str] = {}
  88         self.states: List[DFA] = []
  89         self.dfas: Dict[int, DFAS] = {}
  90         self.labels: List[Label] = [(0, "EMPTY")]
  91         self.keywords: Dict[str, int] = {}
  92         self.soft_keywords: Dict[str, int] = {}
  93         self.tokens: Dict[int, int] = {}
  94         self.symbol2label: Dict[str, int] = {}
  95         self.start = 256
  96         # Python 3.7+ parses async as a keyword, not an identifier
  97         self.async_keywords = False
  98
  99     def dump(self, filename: Path) -> None:
 100         """Dump the grammar tables to a pickle file."""
 101
 102         # mypyc generates objects that don't have a __dict__, but they
 103         # do have __getstate__ methods that will return an equivalent
 104         # dictionary
 105         if hasattr(self, "__dict__"):
 106             d = self.__dict__
 107         else:
 108             d = self.__getstate__()  # type: ignore
 109
 110         with tempfile.NamedTemporaryFile(
 111             dir=os.path.dirname(filename), delete=False
 112         ) as f:
 113             pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)
 114         os.replace(f.name, filename)
 115
 116     def _update(self, attrs: Dict[str, Any]) -> None:
 117         for k, v in attrs.items():
 118             setattr(self, k, v)
 119
 120     def load(self, filename: Path) -> None:
 121         """Load the grammar tables from a pickle file."""
 122         with open(filename, "rb") as f:
 123             d = pickle.load(f)
 124         self._update(d)
 125
 126     def loads(self, pkl: bytes) -> None:
 127         """Load the grammar tables from a pickle bytes object."""
 128         self._update(pickle.loads(pkl))
 129
 130     def copy(self: _P) -> _P:
 131         """
 132         Copy the grammar.
 133         """
 134         new = self.__class__()
 135         for dict_attr in (
 136             "symbol2number",
 137             "number2symbol",
 138             "dfas",
 139             "keywords",
 140             "soft_keywords",
 141             "tokens",
 142             "symbol2label",
 143         ):
 144             setattr(new, dict_attr, getattr(self, dict_attr).copy())
 145         new.labels = self.labels[:]
 146         new.states = self.states[:]
 147         new.start = self.start
 148         new.async_keywords = self.async_keywords
 149         return new
 150
 151     def report(self) -> None:
 152         """Dump the grammar tables to standard output, for debugging."""
 153         from pprint import pprint
 154
 155         print("s2n")
 156         pprint(self.symbol2number)
 157         print("n2s")
 158         pprint(self.number2symbol)
 159         print("states")
 160         pprint(self.states)
 161         print("dfas")
 162         pprint(self.dfas)
 163         print("labels")
 164         pprint(self.labels)
 165         print("start", self.start)
 166
 167
 168 # Map from operator to number (since tokenize doesn't do this)
 169
 170 opmap_raw = """
 171 ( LPAR
 172 ) RPAR
 173 [ LSQB
 174 ] RSQB
 175 : COLON
 176 , COMMA
 177 ; SEMI
 178 + PLUS
 179 - MINUS
 180 * STAR
 181 / SLASH
 182 | VBAR
 183 & AMPER
 184 < LESS
 185 > GREATER
 186 = EQUAL
 187 . DOT
 188 % PERCENT
 189 ` BACKQUOTE
 190 { LBRACE
 191 } RBRACE
 192 @ AT
 193 @= ATEQUAL
 194 == EQEQUAL
 195 != NOTEQUAL
 196 <> NOTEQUAL
 197 <= LESSEQUAL
 198 >= GREATEREQUAL
 199 ~ TILDE
 200 ^ CIRCUMFLEX
 201 << LEFTSHIFT
 202 >> RIGHTSHIFT
 203 ** DOUBLESTAR
 204 += PLUSEQUAL
 205 -= MINEQUAL
 206 *= STAREQUAL
 207 /= SLASHEQUAL
 208 %= PERCENTEQUAL
 209 &= AMPEREQUAL
 210 |= VBAREQUAL
 211 ^= CIRCUMFLEXEQUAL
 212 <<= LEFTSHIFTEQUAL
 213 >>= RIGHTSHIFTEQUAL
 214 **= DOUBLESTAREQUAL
 215 // DOUBLESLASH
 216 //= DOUBLESLASHEQUAL
 217 -> RARROW
 218 := COLONEQUAL
 219 """
 220
 221 opmap = {}
 222 for line in opmap_raw.splitlines():
 223     if line:
 224         op, name = line.split()
 225         opmap[op] = getattr(token, name)