blib2to3/pgen2/grammar.py

   1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
   2 # Licensed to PSF under a Contributor Agreement.
   3
   4 """This module defines the data structures used to represent a grammar.
   5
   6 These are a bit arcane because they are derived from the data
   7 structures used by Python's 'pgen' parser generator.
   8
   9 There's also a table here mapping operators to their names in the
  10 token module; the Python tokenize module reports all operators as the
  11 fallback token code OP, but the parser needs the actual token code.
  12
  13 """
  14
  15 # Python imports
  16 import collections
  17 import pickle
  18
  19 # Local imports
  20 from . import token
  21
  22
  23 class Grammar(object):
  24     """Pgen parsing tables conversion class.
  25
  26     Once initialized, this class supplies the grammar tables for the
  27     parsing engine implemented by parse.py.  The parsing engine
  28     accesses the instance variables directly.  The class here does not
  29     provide initialization of the tables; several subclasses exist to
  30     do this (see the conv and pgen modules).
  31
  32     The load() method reads the tables from a pickle file, which is
  33     much faster than the other ways offered by subclasses.  The pickle
  34     file is written by calling dump() (after loading the grammar
  35     tables using a subclass).  The report() method prints a readable
  36     representation of the tables to stdout, for debugging.
  37
  38     The instance variables are as follows:
  39
  40     symbol2number -- a dict mapping symbol names to numbers.  Symbol
  41                      numbers are always 256 or higher, to distinguish
  42                      them from token numbers, which are between 0 and
  43                      255 (inclusive).
  44
  45     number2symbol -- a dict mapping numbers to symbol names;
  46                      these two are each other's inverse.
  47
  48     states        -- a list of DFAs, where each DFA is a list of
  49                      states, each state is a list of arcs, and each
  50                      arc is a (i, j) pair where i is a label and j is
  51                      a state number.  The DFA number is the index into
  52                      this list.  (This name is slightly confusing.)
  53                      Final states are represented by a special arc of
  54                      the form (0, j) where j is its own state number.
  55
  56     dfas          -- a dict mapping symbol numbers to (DFA, first)
  57                      pairs, where DFA is an item from the states list
  58                      above, and first is a set of tokens that can
  59                      begin this grammar rule (represented by a dict
  60                      whose values are always 1).
  61
  62     labels        -- a list of (x, y) pairs where x is either a token
  63                      number or a symbol number, and y is either None
  64                      or a string; the strings are keywords.  The label
  65                      number is the index in this list; label numbers
  66                      are used to mark state transitions (arcs) in the
  67                      DFAs.
  68
  69     start         -- the number of the grammar's start symbol.
  70
  71     keywords      -- a dict mapping keyword strings to arc labels.
  72
  73     tokens        -- a dict mapping token numbers to arc labels.
  74
  75     """
  76
  77     def __init__(self):
  78         self.symbol2number = {}
  79         self.number2symbol = {}
  80         self.states = []
  81         self.dfas = {}
  82         self.labels = [(0, "EMPTY")]
  83         self.keywords = {}
  84         self.tokens = {}
  85         self.symbol2label = {}
  86         self.start = 256
  87
  88     def dump(self, filename):
  89         """Dump the grammar tables to a pickle file."""
  90         with open(filename, "wb") as f:
  91             pickle.dump(self.__dict__, f, pickle.HIGHEST_PROTOCOL)
  92
  93     def load(self, filename):
  94         """Load the grammar tables from a pickle file."""
  95         with open(filename, "rb") as f:
  96             d = pickle.load(f)
  97         self.__dict__.update(d)
  98
  99     def loads(self, pkl):
 100         """Load the grammar tables from a pickle bytes object."""
 101         self.__dict__.update(pickle.loads(pkl))
 102
 103     def copy(self):
 104         """
 105         Copy the grammar.
 106         """
 107         new = self.__class__()
 108         for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
 109                           "tokens", "symbol2label"):
 110             setattr(new, dict_attr, getattr(self, dict_attr).copy())
 111         new.labels = self.labels[:]
 112         new.states = self.states[:]
 113         new.start = self.start
 114         return new
 115
 116     def report(self):
 117         """Dump the grammar tables to standard output, for debugging."""
 118         from pprint import pprint
 119         print("s2n")
 120         pprint(self.symbol2number)
 121         print("n2s")
 122         pprint(self.number2symbol)
 123         print("states")
 124         pprint(self.states)
 125         print("dfas")
 126         pprint(self.dfas)
 127         print("labels")
 128         pprint(self.labels)
 129         print("start", self.start)
 130
 131
# Map from operator to number (since tokenize doesn't do this).
#
# Each non-blank line of opmap_raw is "<operator> <NAME>", separated by
# whitespace.  NAME is looked up as an attribute of the local token
# module when opmap is built from this table.  Note that both "!=" and
# "<>" are given the same name, NOTEQUAL.

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""
 183
 184 opmap = {}
 185 for line in opmap_raw.splitlines():
 186     if line:
 187         op, name = line.split()
 188         opmap[op] = getattr(token, name)