]> git.madduck.net Git - etc/vim.git/blob - src/blib2to3/pgen2/driver.py

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes into logical commits before using git-format-patch and git-send-email to send them to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access, can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

Reorganize docs v2 (GH-2174)
[etc/vim.git] / src / blib2to3 / pgen2 / driver.py
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2 # Licensed to PSF under a Contributor Agreement.
3
4 # Modifications:
5 # Copyright 2006 Google, Inc. All Rights Reserved.
6 # Licensed to PSF under a Contributor Agreement.
7
8 """Parser driver.
9
10 This provides a high-level interface to parse a file into a syntax tree.
11
12 """
13
14 __author__ = "Guido van Rossum <guido@python.org>"
15
16 __all__ = ["Driver", "load_grammar"]
17
18 # Python imports
19 import io
20 import os
21 import logging
22 import pkgutil
23 import sys
24 from typing import (
25     Any,
26     IO,
27     Iterable,
28     List,
29     Optional,
30     Text,
31     Tuple,
32     Union,
33 )
34
35 # Pgen imports
36 from . import grammar, parse, token, tokenize, pgen
37 from logging import Logger
38 from blib2to3.pytree import _Convert, NL
39 from blib2to3.pgen2.grammar import Grammar
40
41 Path = Union[str, "os.PathLike[str]"]
42
43
class Driver(object):
    """Drives a pgen-generated parser: feeds it tokens and returns the tree.

    The driver also reconstructs the whitespace/comment "prefix" that
    precedes each significant token, which the tokenizer does not emit as
    tokens of its own.
    """

    def __init__(
        self,
        grammar: Grammar,
        convert: Optional[_Convert] = None,
        logger: Optional[Logger] = None,
    ) -> None:
        """Initialize the driver.

        grammar: the Grammar tables to parse with.
        convert: optional node-conversion callback passed to the parser.
        logger: destination for debug messages; defaults to this module's logger.
        """
        self.grammar = grammar
        if logger is None:
            logger = logging.getLogger(__name__)
        self.logger = logger
        self.convert = convert

    def parse_tokens(self, tokens: Iterable[Any], debug: bool = False) -> NL:
        """Parse a series of tokens and return the syntax tree."""
        # XXX Move the prefix computation into a wrapper around tokenize.
        p = parse.Parser(self.grammar, self.convert)
        p.setup()
        # Current (line, column) position; any gap between this position and
        # the next token's start is accumulated into that token's prefix.
        lineno = 1
        column = 0
        # Stack of indentation widths: pushed on INDENT, popped on DEDENT.
        indent_columns: List[int] = []
        type = value = start = end = line_text = None
        prefix = ""
        for quintuple in tokens:
            type, value, start, end, line_text = quintuple
            if start != (lineno, column):
                # The tokenizer skipped some source text (whitespace); capture
                # it verbatim as part of the prefix.
                assert (lineno, column) <= start, ((lineno, column), start)
                s_lineno, s_column = start
                if lineno < s_lineno:
                    prefix += "\n" * (s_lineno - lineno)
                    lineno = s_lineno
                    column = 0
                if column < s_column:
                    prefix += line_text[column:s_column]
                    column = s_column
            if type in (tokenize.COMMENT, tokenize.NL):
                # Comments and non-logical newlines never reach the parser;
                # they accumulate into the next significant token's prefix.
                prefix += value
                lineno, column = end
                if value.endswith("\n"):
                    lineno += 1
                    column = 0
                continue
            if type == token.OP:
                # Map the generic OP token to its specific grammar symbol.
                type = grammar.opmap[value]
            if debug:
                self.logger.debug(
                    "%s %r (prefix=%r)", token.tok_name[type], value, prefix
                )
            if type == token.INDENT:
                # Hold the indentation text back (in _prefix) so it can be
                # re-attached after the token is fed to the parser; the parser
                # itself sees an empty INDENT value.
                indent_columns.append(len(value))
                _prefix = prefix + value
                prefix = ""
                value = ""
            elif type == token.DEDENT:
                # Split the accumulated prefix at the popped indentation
                # width: the consumed part goes with the DEDENT, the
                # remainder (_prefix) is carried over to the next token.
                _indent_col = indent_columns.pop()
                prefix, _prefix = self._partially_consume_prefix(prefix, _indent_col)
            if p.addtoken(type, value, (prefix, start)):
                # The parser accepted a complete input.
                if debug:
                    self.logger.debug("Stop.")
                break
            prefix = ""
            if type in {token.INDENT, token.DEDENT}:
                # Restore the held-back indentation text (see above).
                prefix = _prefix
            lineno, column = end
            if value.endswith("\n"):
                lineno += 1
                column = 0
        else:
            # We never broke out -- EOF is too soon (how can this happen???)
            assert start is not None
            raise parse.ParseError("incomplete input", type, value, (prefix, start))
        assert p.rootnode is not None
        return p.rootnode

    def parse_stream_raw(self, stream: IO[Text], debug: bool = False) -> NL:
        """Parse a stream and return the syntax tree."""
        tokens = tokenize.generate_tokens(stream.readline, grammar=self.grammar)
        return self.parse_tokens(tokens, debug)

    def parse_stream(self, stream: IO[Text], debug: bool = False) -> NL:
        """Parse a stream and return the syntax tree."""
        return self.parse_stream_raw(stream, debug)

    def parse_file(
        self, filename: Path, encoding: Optional[Text] = None, debug: bool = False
    ) -> NL:
        """Parse a file and return the syntax tree."""
        with io.open(filename, "r", encoding=encoding) as stream:
            return self.parse_stream(stream, debug)

    def parse_string(self, text: Text, debug: bool = False) -> NL:
        """Parse a string and return the syntax tree."""
        tokens = tokenize.generate_tokens(
            io.StringIO(text).readline, grammar=self.grammar
        )
        return self.parse_tokens(tokens, debug)

    def _partially_consume_prefix(self, prefix: Text, column: int) -> Tuple[Text, Text]:
        """Split *prefix* into (consumed, remainder) at indentation *column*.

        Whole lines are consumed until a non-blank line indented to fewer
        than *column* columns is reached; that line and everything after it
        form the remainder. Trailing text with no newline is also remainder.
        """
        lines: List[str] = []
        current_line = ""
        current_column = 0
        wait_for_nl = False
        for char in prefix:
            current_line += char
            if wait_for_nl:
                if char == "\n":
                    # A non-blank line dedented past *column* ends consumption.
                    if current_line.strip() and current_column < column:
                        res = "".join(lines)
                        return res, prefix[len(res) :]

                    lines.append(current_line)
                    current_line = ""
                    current_column = 0
                    wait_for_nl = False
            elif char in " \t":
                current_column += 1
            elif char == "\n":
                # unexpected empty line
                current_column = 0
            else:
                # indent is finished
                wait_for_nl = True
        return "".join(lines), current_line
167
168
169 def _generate_pickle_name(gt: Path, cache_dir: Optional[Path] = None) -> Text:
170     head, tail = os.path.splitext(gt)
171     if tail == ".txt":
172         tail = ""
173     name = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
174     if cache_dir:
175         return os.path.join(cache_dir, os.path.basename(name))
176     else:
177         return name
178
179
def load_grammar(
    gt: Text = "Grammar.txt",
    gp: Optional[Text] = None,
    save: bool = True,
    force: bool = False,
    logger: Optional[Logger] = None,
) -> Grammar:
    """Load the grammar (maybe from a pickle).

    gt: path to the grammar text file.
    gp: path to the grammar pickle; derived from *gt* when None.
    save: write a fresh pickle after regenerating the tables.
    force: regenerate even if the pickle looks up to date.
    logger: destination for progress messages; defaults to this module's logger.
    """
    if logger is None:
        logger = logging.getLogger(__name__)
    if gp is None:
        gp = _generate_pickle_name(gt)
    # Fast path: an up-to-date pickle can simply be loaded.
    if not force and _newer(gp, gt):
        g: grammar.Grammar = grammar.Grammar()
        g.load(gp)
        return g
    logger.info("Generating grammar tables from %s", gt)
    g = pgen.generate_grammar(gt)
    if save:
        logger.info("Writing grammar tables to %s", gp)
        try:
            g.dump(gp)
        except OSError as e:
            # Best effort: a failed cache write is logged, not fatal.
            logger.info("Writing failed: %s", e)
    return g
204
205
206 def _newer(a: Text, b: Text) -> bool:
207     """Inquire whether file a was written since file b."""
208     if not os.path.exists(a):
209         return False
210     if not os.path.exists(b):
211         return True
212     return os.path.getmtime(a) >= os.path.getmtime(b)
213
214
def load_packaged_grammar(
    package: str, grammar_source: Text, cache_dir: Optional[Path] = None
) -> grammar.Grammar:
    """Normally, loads a pickled grammar by doing
        pkgutil.get_data(package, pickled_grammar)
    where *pickled_grammar* is computed from *grammar_source* by adding the
    Python version and using a ``.pickle`` extension.

    However, if *grammar_source* is an extant file, load_grammar(grammar_source)
    is called instead. This facilitates using a packaged grammar file when needed
    but preserves load_grammar's automatic regeneration behavior when possible.

    """
    if os.path.isfile(grammar_source):
        # Grammar source is on disk: defer to load_grammar so its automatic
        # regeneration logic still applies.
        pickle_path: Optional[Text]
        if cache_dir:
            pickle_path = _generate_pickle_name(grammar_source, cache_dir)
        else:
            pickle_path = None
        return load_grammar(grammar_source, gp=pickle_path)
    # Otherwise read the pre-built pickle shipped inside the package.
    pickled_name = _generate_pickle_name(os.path.basename(grammar_source), cache_dir)
    data = pkgutil.get_data(package, pickled_name)
    assert data is not None
    g = grammar.Grammar()
    g.loads(data)
    return g
237
238
def main(*args: Text) -> bool:
    """Main program, when run as a script: produce grammar pickle files.

    Each argument is a path to a grammar text file; load_grammar is called
    with force=True so every pickle is regenerated. Falls back to
    sys.argv[1:] when no arguments are given. Always returns True.
    """
    grammar_paths = args or tuple(sys.argv[1:])
    logging.basicConfig(level=logging.INFO, stream=sys.stdout, format="%(message)s")
    for grammar_path in grammar_paths:
        load_grammar(grammar_path, save=True, force=True)
    return True
250
251
# Script entry point: regenerate pickles for the grammars named on the
# command line; exit status 0 on success (main() returning True).
if __name__ == "__main__":
    sys.exit(int(not main()))