blib2to3/pgen2/grammar.py

   1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
   2 # Licensed to PSF under a Contributor Agreement.
   3
   4 """This module defines the data structures used to represent a grammar.
   5
   6 These are a bit arcane because they are derived from the data
   7 structures used by Python's 'pgen' parser generator.
   8
   9 There's also a table here mapping operators to their names in the
  10 token module; the Python tokenize module reports all operators as the
  11 fallback token code OP, but the parser needs the actual token code.
  12
  13 """
  14
  15 # Python imports
  16 import collections
  17 import pickle
  18
  19 # Local imports
  20 from . import token
  21
  22
  23 class Grammar(object):
  24     """Pgen parsing tables conversion class.
  25
  26     Once initialized, this class supplies the grammar tables for the
  27     parsing engine implemented by parse.py.  The parsing engine
  28     accesses the instance variables directly.  The class here does not
  29     provide initialization of the tables; several subclasses exist to
  30     do this (see the conv and pgen modules).
  31
  32     The load() method reads the tables from a pickle file, which is
  33     much faster than the other ways offered by subclasses.  The pickle
  34     file is written by calling dump() (after loading the grammar
  35     tables using a subclass).  The report() method prints a readable
  36     representation of the tables to stdout, for debugging.
  37
  38     The instance variables are as follows:
  39
  40     symbol2number -- a dict mapping symbol names to numbers.  Symbol
  41                      numbers are always 256 or higher, to distinguish
  42                      them from token numbers, which are between 0 and
  43                      255 (inclusive).
  44
  45     number2symbol -- a dict mapping numbers to symbol names;
  46                      these two are each other's inverse.
  47
  48     states        -- a list of DFAs, where each DFA is a list of
  49                      states, each state is a list of arcs, and each
  50                      arc is a (i, j) pair where i is a label and j is
  51                      a state number.  The DFA number is the index into
  52                      this list.  (This name is slightly confusing.)
  53                      Final states are represented by a special arc of
  54                      the form (0, j) where j is its own state number.
  55
  56     dfas          -- a dict mapping symbol numbers to (DFA, first)
  57                      pairs, where DFA is an item from the states list
  58                      above, and first is a set of tokens that can
  59                      begin this grammar rule (represented by a dict
  60                      whose values are always 1).
  61
  62     labels        -- a list of (x, y) pairs where x is either a token
  63                      number or a symbol number, and y is either None
  64                      or a string; the strings are keywords.  The label
  65                      number is the index in this list; label numbers
  66                      are used to mark state transitions (arcs) in the
  67                      DFAs.
  68
  69     start         -- the number of the grammar's start symbol.
  70
  71     keywords      -- a dict mapping keyword strings to arc labels.
  72
  73     tokens        -- a dict mapping token numbers to arc labels.
  74
  75     """
  76
  77     def __init__(self):
  78         self.symbol2number = {}
  79         self.number2symbol = {}
  80         self.states = []
  81         self.dfas = {}
  82         self.labels = [(0, "EMPTY")]
  83         self.keywords = {}
  84         self.tokens = {}
  85         self.symbol2label = {}
  86         self.start = 256
  87
  88     def dump(self, filename):
  89         """Dump the grammar tables to a pickle file.
  90
  91         dump() recursively changes all dict to OrderedDict, so the pickled file
  92         is not exactly the same as what was passed in to dump(). load() uses the
  93         pickled file to create the tables, but  only changes OrderedDict to dict
  94         at the top level; it does not recursively change OrderedDict to dict.
  95         So, the loaded tables are different from the original tables that were
  96         passed to load() in that some of the OrderedDict (from the pickled file)
  97         are not changed back to dict. For parsing, this has no effect on
  98         performance because OrderedDict uses dict's __getitem__ with nothing in
  99         between.
 100         """
 101         with open(filename, "wb") as f:
 102             d = _make_deterministic(self.__dict__)
 103             pickle.dump(d, f, 2)
 104
 105     def load(self, filename):
 106         """Load the grammar tables from a pickle file."""
 107         with open(filename, "rb") as f:
 108             d = pickle.load(f)
 109         self.__dict__.update(d)
 110
 111     def loads(self, pkl):
 112         """Load the grammar tables from a pickle bytes object."""
 113         self.__dict__.update(pickle.loads(pkl))
 114
 115     def copy(self):
 116         """
 117         Copy the grammar.
 118         """
 119         new = self.__class__()
 120         for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
 121                           "tokens", "symbol2label"):
 122             setattr(new, dict_attr, getattr(self, dict_attr).copy())
 123         new.labels = self.labels[:]
 124         new.states = self.states[:]
 125         new.start = self.start
 126         return new
 127
 128     def report(self):
 129         """Dump the grammar tables to standard output, for debugging."""
 130         from pprint import pprint
 131         print("s2n")
 132         pprint(self.symbol2number)
 133         print("n2s")
 134         pprint(self.number2symbol)
 135         print("states")
 136         pprint(self.states)
 137         print("dfas")
 138         pprint(self.dfas)
 139         print("labels")
 140         pprint(self.labels)
 141         print("start", self.start)
 142
 143
 144 def _make_deterministic(top):
 145     if isinstance(top, dict):
 146         return collections.OrderedDict(
 147             sorted(((k, _make_deterministic(v)) for k, v in top.items())))
 148     if isinstance(top, list):
 149         return [_make_deterministic(e) for e in top]
 150     if isinstance(top, tuple):
 151         return tuple(_make_deterministic(e) for e in top)
 152     return top
 153
 154
 155 # Map from operator to number (since tokenize doesn't do this)
 156
 157 opmap_raw = """
 158 ( LPAR
 159 ) RPAR
 160 [ LSQB
 161 ] RSQB
 162 : COLON
 163 , COMMA
 164 ; SEMI
 165 + PLUS
 166 - MINUS
 167 * STAR
 168 / SLASH
 169 | VBAR
 170 & AMPER
 171 < LESS
 172 > GREATER
 173 = EQUAL
 174 . DOT
 175 % PERCENT
 176 ` BACKQUOTE
 177 { LBRACE
 178 } RBRACE
 179 @ AT
 180 @= ATEQUAL
 181 == EQEQUAL
 182 != NOTEQUAL
 183 <> NOTEQUAL
 184 <= LESSEQUAL
 185 >= GREATEREQUAL
 186 ~ TILDE
 187 ^ CIRCUMFLEX
 188 << LEFTSHIFT
 189 >> RIGHTSHIFT
 190 ** DOUBLESTAR
 191 += PLUSEQUAL
 192 -= MINEQUAL
 193 *= STAREQUAL
 194 /= SLASHEQUAL
 195 %= PERCENTEQUAL
 196 &= AMPEREQUAL
 197 |= VBAREQUAL
 198 ^= CIRCUMFLEXEQUAL
 199 <<= LEFTSHIFTEQUAL
 200 >>= RIGHTSHIFTEQUAL
 201 **= DOUBLESTAREQUAL
 202 // DOUBLESLASH
 203 //= DOUBLESLASHEQUAL
 204 -> RARROW
 205 """
 206
 207 opmap = {}
 208 for line in opmap_raw.splitlines():
 209     if line:
 210         op, name = line.split()
 211         opmap[op] = getattr(token, name)