Cache generated comments

[etc/vim.git] / blib2to3 / pgen2 / tokenize.py
diff --git a/blib2to3/pgen2/tokenize.py b/blib2to3/pgen2/tokenize.py

index 4f031306378ab20d13aec985e3c8715f59c8606a..9a7664bbf95bf81d1809922dcaf53433bae61454 100644 (file)
--- a/blib2to3/pgen2/tokenize.py
+++ b/blib2to3/pgen2/tokenize.py
@@ -29,7 +29,7 @@ __author__ = 'Ka-Ping Yee <ping@lfw.org>'
  __credits__ = \
      'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
  
-import string, re, unicodedata
+import re
  from codecs import BOM_UTF8, lookup
  from blib2to3.pgen2.token import *
  
@@ -56,7 +56,7 @@ def _combinations(*l):
  Whitespace = r'[ \f\t]*'
  Comment = r'#[^\r\n]*'
  Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'[^\d\W]\w*'
+Name = r'\w+'  # this is invalid but it's fine because Name comes after Number in all groups
  
  Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
  Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
@@ -334,8 +334,6 @@ def untokenize(iterable):
      ut = Untokenizer()
      return ut.untokenize(iterable)
  
-InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
-
  def generate_tokens(readline):
      """
      The generate_tokens() generator requires one argument, readline, which
@@ -353,7 +351,7 @@ def generate_tokens(readline):
      logical line; continuation lines are included.
      """
      lnum = parenlev = continued = 0
-    namechars, numchars = string.ascii_letters + '_', '0123456789'
+    numchars = '0123456789'
      contstr, needcont = '', 0
      contline = None
      indents = [0]
@@ -412,10 +410,6 @@ def generate_tokens(readline):
                  yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                  continue
  
-            if column > indents[-1]:           # count indents
-                indents.append(column)
-                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-
              if line[pos] == '#':               # skip comments
                  comment_token = line[pos:].rstrip('\r\n')
                  nl_pos = pos + len(comment_token)
@@ -425,6 +419,10 @@ def generate_tokens(readline):
                          (lnum, nl_pos), (lnum, len(line)), line)
                  continue
  
+            if column > indents[-1]:           # count indents
+                indents.append(column)
+                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+
              while column < indents[-1]:        # count dedents
                  if column not in indents:
                      raise IndentationError(
@@ -451,8 +449,6 @@ def generate_tokens(readline):
  
          while pos < max:
              pseudomatch = pseudoprog.match(line, pos)
-            if not pseudomatch:
-                print('no pseudomatch')
              if pseudomatch:                                # scan for tokens
                  start, end = pseudomatch.span(1)
                  spos, epos, pos = (lnum, start), (lnum, end), end
@@ -508,8 +504,7 @@ def generate_tokens(readline):
                              yield stashed
                              stashed = None
                          yield (STRING, token, spos, epos, line)
-                elif (initial in namechars or              # ordinary name
-                      unicodedata.category(initial) in InitialCategories):
+                elif initial.isidentifier():               # ordinary name
                      if token in ('async', 'await'):
                          if async_def:
                              yield (ASYNC if token == 'async' else AWAIT,