All patches and comments are welcome. Please squash your changes into logical
commits before using git-format-patch and git-send-email to send them to
patches@git.madduck.net.
If you'd read over the Git project's submission guidelines and adhered to them,
I'd be especially grateful.
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
4 """This module defines the data structures used to represent a grammar.
6 These are a bit arcane because they are derived from the data
7 structures used by Python's 'pgen' parser generator.
9 There's also a table here mapping operators to their names in the
10 token module; the Python tokenize module reports all operators as the
11 fallback token code OP, but the parser needs the actual token code.
19 from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
# Type variable bound to Grammar, used as the self/return type of copy() so
# the annotated result is the caller's own class.
24 _P = TypeVar("_P", bound="Grammar")
# A grammar label: (token or symbol number, keyword string or None).
25 Label = Tuple[int, Optional[str]]
# One DFA: a list of states, each state being a list of (label, state number)
# arcs.
26 DFA = List[List[Tuple[int, int]]]
# A (DFA, first) pair, where `first` is a dict standing in for the set of
# tokens that can begin the rule.
27 DFAS = Tuple[DFA, Dict[int, int]]
# File name type accepted by dump() and load(): a str or an os.PathLike[str].
28 Path = Union[str, "os.PathLike[str]"]
32 """Pgen parsing tables conversion class.
34 Once initialized, this class supplies the grammar tables for the
35 parsing engine implemented by parse.py. The parsing engine
36 accesses the instance variables directly. The class here does not
37 provide initialization of the tables; several subclasses exist to
38 do this (see the conv and pgen modules).
40 The load() method reads the tables from a pickle file, which is
41 much faster than the other ways offered by subclasses. The pickle
42 file is written by calling dump() (after loading the grammar
43 tables using a subclass). The report() method prints a readable
44 representation of the tables to stdout, for debugging.
46 The instance variables are as follows:
48 symbol2number -- a dict mapping symbol names to numbers. Symbol
49 numbers are always 256 or higher, to distinguish
50 them from token numbers, which are between 0 and 255.
53 number2symbol -- a dict mapping numbers to symbol names;
54 these two are each other's inverse.
56 states -- a list of DFAs, where each DFA is a list of
57 states, each state is a list of arcs, and each
58 arc is a (i, j) pair where i is a label and j is
59 a state number. The DFA number is the index into
60 this list. (This name is slightly confusing.)
61 Final states are represented by a special arc of
62 the form (0, j) where j is its own state number.
64 dfas -- a dict mapping symbol numbers to (DFA, first)
65 pairs, where DFA is an item from the states list
66 above, and first is a set of tokens that can
67 begin this grammar rule (represented by a dict
68 whose values are always 1).
70 labels -- a list of (x, y) pairs where x is either a token
71 number or a symbol number, and y is either None
72 or a string; the strings are keywords. The label
73 number is the index in this list; label numbers
74 are used to mark state transitions (arcs) in the DFAs.
77 start -- the number of the grammar's start symbol.
79 keywords -- a dict mapping keyword strings to arc labels.
81 tokens -- a dict mapping token numbers to arc labels.
85 def __init__(self) -> None:
86 self.symbol2number: Dict[str, int] = {}
87 self.number2symbol: Dict[int, str] = {}
88 self.states: List[DFA] = []
89 self.dfas: Dict[int, DFAS] = {}
90 self.labels: List[Label] = [(0, "EMPTY")]
91 self.keywords: Dict[str, int] = {}
92 self.soft_keywords: Dict[str, int] = {}
93 self.tokens: Dict[int, int] = {}
94 self.symbol2label: Dict[str, int] = {}
95 self.version: Tuple[int, int] = (0, 0)
97 # Python 3.7+ parses async as a keyword, not an identifier
98 self.async_keywords = False
100 def dump(self, filename: Path) -> None:
101 """Dump the grammar tables to a pickle file."""
103 # mypyc generates objects that don't have a __dict__, but they
104 # do have __getstate__ methods that will return an equivalent
106 if hasattr(self, "__dict__"):
109 d = self.__getstate__() # type: ignore
111 with tempfile.NamedTemporaryFile(
112 dir=os.path.dirname(filename), delete=False
114 pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)
115 os.replace(f.name, filename)
117 def _update(self, attrs: Dict[str, Any]) -> None:
118 for k, v in attrs.items():
121 def load(self, filename: Path) -> None:
122 """Load the grammar tables from a pickle file."""
123 with open(filename, "rb") as f:
127 def loads(self, pkl: bytes) -> None:
128 """Load the grammar tables from a pickle bytes object."""
129 self._update(pickle.loads(pkl))
131 def copy(self: _P) -> _P:
135 new = self.__class__()
145 setattr(new, dict_attr, getattr(self, dict_attr).copy())
146 new.labels = self.labels[:]
147 new.states = self.states[:]
148 new.start = self.start
149 new.version = self.version
150 new.async_keywords = self.async_keywords
153 def report(self) -> None:
154 """Dump the grammar tables to standard output, for debugging."""
155 from pprint import pprint
158 pprint(self.symbol2number)
160 pprint(self.number2symbol)
167 print("start", self.start)
170 # Map from operator to number (since tokenize doesn't do this)
224 for line in opmap_raw.splitlines():
226 op, name = line.split()
227 opmap[op] = getattr(token, name)