blib2to3/pgen2/grammar.py

   1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
   2 # Licensed to PSF under a Contributor Agreement.
   3
   4 """This module defines the data structures used to represent a grammar.
   5
   6 These are a bit arcane because they are derived from the data
   7 structures used by Python's 'pgen' parser generator.
   8
   9 There's also a table here mapping operators to their names in the
  10 token module; the Python tokenize module reports all operators as the
  11 fallback token code OP, but the parser needs the actual token code.
  12
  13 """
  14
  15 # Python imports
  16 import os
  17 import pickle
  18 import tempfile
  19
  20 # Local imports
  21 from . import token
  22
  23
  24 class Grammar(object):
  25     """Pgen parsing tables conversion class.
  26
  27     Once initialized, this class supplies the grammar tables for the
  28     parsing engine implemented by parse.py.  The parsing engine
  29     accesses the instance variables directly.  The class here does not
  30     provide initialization of the tables; several subclasses exist to
  31     do this (see the conv and pgen modules).
  32
  33     The load() method reads the tables from a pickle file, which is
  34     much faster than the other ways offered by subclasses.  The pickle
  35     file is written by calling dump() (after loading the grammar
  36     tables using a subclass).  The report() method prints a readable
  37     representation of the tables to stdout, for debugging.
  38
  39     The instance variables are as follows:
  40
  41     symbol2number -- a dict mapping symbol names to numbers.  Symbol
  42                      numbers are always 256 or higher, to distinguish
  43                      them from token numbers, which are between 0 and
  44                      255 (inclusive).
  45
  46     number2symbol -- a dict mapping numbers to symbol names;
  47                      these two are each other's inverse.
  48
  49     states        -- a list of DFAs, where each DFA is a list of
  50                      states, each state is a list of arcs, and each
  51                      arc is a (i, j) pair where i is a label and j is
  52                      a state number.  The DFA number is the index into
  53                      this list.  (This name is slightly confusing.)
  54                      Final states are represented by a special arc of
  55                      the form (0, j) where j is its own state number.
  56
  57     dfas          -- a dict mapping symbol numbers to (DFA, first)
  58                      pairs, where DFA is an item from the states list
  59                      above, and first is a set of tokens that can
  60                      begin this grammar rule (represented by a dict
  61                      whose values are always 1).
  62
  63     labels        -- a list of (x, y) pairs where x is either a token
  64                      number or a symbol number, and y is either None
  65                      or a string; the strings are keywords.  The label
  66                      number is the index in this list; label numbers
  67                      are used to mark state transitions (arcs) in the
  68                      DFAs.
  69
  70     start         -- the number of the grammar's start symbol.
  71
  72     keywords      -- a dict mapping keyword strings to arc labels.
  73
  74     tokens        -- a dict mapping token numbers to arc labels.
  75
  76     """
  77
  78     def __init__(self):
  79         self.symbol2number = {}
  80         self.number2symbol = {}
  81         self.states = []
  82         self.dfas = {}
  83         self.labels = [(0, "EMPTY")]
  84         self.keywords = {}
  85         self.tokens = {}
  86         self.symbol2label = {}
  87         self.start = 256
  88         # Python 3.7+ parses async as a keyword, not an identifier
  89         self.async_keywords = False
  90
  91     def dump(self, filename):
  92         """Dump the grammar tables to a pickle file."""
  93         with tempfile.NamedTemporaryFile(
  94             dir=os.path.dirname(filename), delete=False
  95         ) as f:
  96             pickle.dump(self.__dict__, f, pickle.HIGHEST_PROTOCOL)
  97         os.replace(f.name, filename)
  98
  99     def load(self, filename):
 100         """Load the grammar tables from a pickle file."""
 101         with open(filename, "rb") as f:
 102             d = pickle.load(f)
 103         self.__dict__.update(d)
 104
 105     def loads(self, pkl):
 106         """Load the grammar tables from a pickle bytes object."""
 107         self.__dict__.update(pickle.loads(pkl))
 108
 109     def copy(self):
 110         """
 111         Copy the grammar.
 112         """
 113         new = self.__class__()
 114         for dict_attr in (
 115             "symbol2number",
 116             "number2symbol",
 117             "dfas",
 118             "keywords",
 119             "tokens",
 120             "symbol2label",
 121         ):
 122             setattr(new, dict_attr, getattr(self, dict_attr).copy())
 123         new.labels = self.labels[:]
 124         new.states = self.states[:]
 125         new.start = self.start
 126         new.async_keywords = self.async_keywords
 127         return new
 128
 129     def report(self):
 130         """Dump the grammar tables to standard output, for debugging."""
 131         from pprint import pprint
 132
 133         print("s2n")
 134         pprint(self.symbol2number)
 135         print("n2s")
 136         pprint(self.number2symbol)
 137         print("states")
 138         pprint(self.states)
 139         print("dfas")
 140         pprint(self.dfas)
 141         print("labels")
 142         pprint(self.labels)
 143         print("start", self.start)
 144
 145
 146 # Map from operator to number (since tokenize doesn't do this)
 147
 148 opmap_raw = """
 149 ( LPAR
 150 ) RPAR
 151 [ LSQB
 152 ] RSQB
 153 : COLON
 154 , COMMA
 155 ; SEMI
 156 + PLUS
 157 - MINUS
 158 * STAR
 159 / SLASH
 160 | VBAR
 161 & AMPER
 162 < LESS
 163 > GREATER
 164 = EQUAL
 165 . DOT
 166 % PERCENT
 167 ` BACKQUOTE
 168 { LBRACE
 169 } RBRACE
 170 @ AT
 171 @= ATEQUAL
 172 == EQEQUAL
 173 != NOTEQUAL
 174 <> NOTEQUAL
 175 <= LESSEQUAL
 176 >= GREATEREQUAL
 177 ~ TILDE
 178 ^ CIRCUMFLEX
 179 << LEFTSHIFT
 180 >> RIGHTSHIFT
 181 ** DOUBLESTAR
 182 += PLUSEQUAL
 183 -= MINEQUAL
 184 *= STAREQUAL
 185 /= SLASHEQUAL
 186 %= PERCENTEQUAL
 187 &= AMPEREQUAL
 188 |= VBAREQUAL
 189 ^= CIRCUMFLEXEQUAL
 190 <<= LEFTSHIFTEQUAL
 191 >>= RIGHTSHIFTEQUAL
 192 **= DOUBLESTAREQUAL
 193 // DOUBLESLASH
 194 //= DOUBLESLASHEQUAL
 195 -> RARROW
 196 := COLONEQUAL
 197 """
 198
 199 opmap = {}
 200 for line in opmap_raw.splitlines():
 201     if line:
 202         op, name = line.split()
 203         opmap[op] = getattr(token, name)