blib2to3/pgen2/grammar.py

   1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
   2 # Licensed to PSF under a Contributor Agreement.
   3
   4 """This module defines the data structures used to represent a grammar.
   5
   6 These are a bit arcane because they are derived from the data
   7 structures used by Python's 'pgen' parser generator.
   8
   9 There's also a table here mapping operators to their names in the
  10 token module; the Python tokenize module reports all operators as the
  11 fallback token code OP, but the parser needs the actual token code.
  12
  13 """
  14
  15 # Python imports
  16 import pickle
  17
  18 # Local imports
  19 from . import token
  20
  21
  22 class Grammar(object):
  23     """Pgen parsing tables conversion class.
  24
  25     Once initialized, this class supplies the grammar tables for the
  26     parsing engine implemented by parse.py.  The parsing engine
  27     accesses the instance variables directly.  The class here does not
  28     provide initialization of the tables; several subclasses exist to
  29     do this (see the conv and pgen modules).
  30
  31     The load() method reads the tables from a pickle file, which is
  32     much faster than the other ways offered by subclasses.  The pickle
  33     file is written by calling dump() (after loading the grammar
  34     tables using a subclass).  The report() method prints a readable
  35     representation of the tables to stdout, for debugging.
  36
  37     The instance variables are as follows:
  38
  39     symbol2number -- a dict mapping symbol names to numbers.  Symbol
  40                      numbers are always 256 or higher, to distinguish
  41                      them from token numbers, which are between 0 and
  42                      255 (inclusive).
  43
  44     number2symbol -- a dict mapping numbers to symbol names;
  45                      these two are each other's inverse.
  46
  47     states        -- a list of DFAs, where each DFA is a list of
  48                      states, each state is a list of arcs, and each
  49                      arc is a (i, j) pair where i is a label and j is
  50                      a state number.  The DFA number is the index into
  51                      this list.  (This name is slightly confusing.)
  52                      Final states are represented by a special arc of
  53                      the form (0, j) where j is its own state number.
  54
  55     dfas          -- a dict mapping symbol numbers to (DFA, first)
  56                      pairs, where DFA is an item from the states list
  57                      above, and first is a set of tokens that can
  58                      begin this grammar rule (represented by a dict
  59                      whose values are always 1).
  60
  61     labels        -- a list of (x, y) pairs where x is either a token
  62                      number or a symbol number, and y is either None
  63                      or a string; the strings are keywords.  The label
  64                      number is the index in this list; label numbers
  65                      are used to mark state transitions (arcs) in the
  66                      DFAs.
  67
  68     start         -- the number of the grammar's start symbol.
  69
  70     keywords      -- a dict mapping keyword strings to arc labels.
  71
  72     tokens        -- a dict mapping token numbers to arc labels.
  73
  74     """
  75
  76     def __init__(self):
  77         self.symbol2number = {}
  78         self.number2symbol = {}
  79         self.states = []
  80         self.dfas = {}
  81         self.labels = [(0, "EMPTY")]
  82         self.keywords = {}
  83         self.tokens = {}
  84         self.symbol2label = {}
  85         self.start = 256
  86
  87     def dump(self, filename):
  88         """Dump the grammar tables to a pickle file."""
  89         with open(filename, "wb") as f:
  90             pickle.dump(self.__dict__, f, pickle.HIGHEST_PROTOCOL)
  91
  92     def load(self, filename):
  93         """Load the grammar tables from a pickle file."""
  94         with open(filename, "rb") as f:
  95             d = pickle.load(f)
  96         self.__dict__.update(d)
  97
  98     def loads(self, pkl):
  99         """Load the grammar tables from a pickle bytes object."""
 100         self.__dict__.update(pickle.loads(pkl))
 101
 102     def copy(self):
 103         """
 104         Copy the grammar.
 105         """
 106         new = self.__class__()
 107         for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
 108                           "tokens", "symbol2label"):
 109             setattr(new, dict_attr, getattr(self, dict_attr).copy())
 110         new.labels = self.labels[:]
 111         new.states = self.states[:]
 112         new.start = self.start
 113         return new
 114
 115     def report(self):
 116         """Dump the grammar tables to standard output, for debugging."""
 117         from pprint import pprint
 118         print("s2n")
 119         pprint(self.symbol2number)
 120         print("n2s")
 121         pprint(self.number2symbol)
 122         print("states")
 123         pprint(self.states)
 124         print("dfas")
 125         pprint(self.dfas)
 126         print("labels")
 127         pprint(self.labels)
 128         print("start", self.start)
 129
 130
 131 # Map from operator to number (since tokenize doesn't do this)
 132
 133 opmap_raw = """
 134 ( LPAR
 135 ) RPAR
 136 [ LSQB
 137 ] RSQB
 138 : COLON
 139 , COMMA
 140 ; SEMI
 141 + PLUS
 142 - MINUS
 143 * STAR
 144 / SLASH
 145 | VBAR
 146 & AMPER
 147 < LESS
 148 > GREATER
 149 = EQUAL
 150 . DOT
 151 % PERCENT
 152 ` BACKQUOTE
 153 { LBRACE
 154 } RBRACE
 155 @ AT
 156 @= ATEQUAL
 157 == EQEQUAL
 158 != NOTEQUAL
 159 <> NOTEQUAL
 160 <= LESSEQUAL
 161 >= GREATEREQUAL
 162 ~ TILDE
 163 ^ CIRCUMFLEX
 164 << LEFTSHIFT
 165 >> RIGHTSHIFT
 166 ** DOUBLESTAR
 167 += PLUSEQUAL
 168 -= MINEQUAL
 169 *= STAREQUAL
 170 /= SLASHEQUAL
 171 %= PERCENTEQUAL
 172 &= AMPEREQUAL
 173 |= VBAREQUAL
 174 ^= CIRCUMFLEXEQUAL
 175 <<= LEFTSHIFTEQUAL
 176 >>= RIGHTSHIFTEQUAL
 177 **= DOUBLESTAREQUAL
 178 // DOUBLESLASH
 179 //= DOUBLESLASHEQUAL
 180 -> RARROW
 181 """
 182
 183 opmap = {}
 184 for line in opmap_raw.splitlines():
 185     if line:
 186         op, name = line.split()
 187         opmap[op] = getattr(token, name)