X-Git-Url: https://git.madduck.net/etc/vim.git/blobdiff_plain/5fa38d4c3bdae68abfe235709b69b1bc8ae75c3a..e36b8c71bb78f70735b8b4f239b0f574a6e0f277:/blib2to3/pgen2/tokenize.py

diff --git a/blib2to3/pgen2/tokenize.py b/blib2to3/pgen2/tokenize.py
index 6dada47..6b8a5cb 100644
--- a/blib2to3/pgen2/tokenize.py
+++ b/blib2to3/pgen2/tokenize.py
@@ -29,7 +29,7 @@ __author__ = 'Ka-Ping Yee <ping@lfw.org>'
 __credits__ = \
     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
 
-import string, re
+import string, re, unicodedata
 from codecs import BOM_UTF8, lookup
 from blib2to3.pgen2.token import *
 
@@ -52,7 +52,7 @@ def maybe(*choices): return group(*choices) + '?'
 Whitespace = r'[ \f\t]*'
 Comment = r'#[^\r\n]*'
 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'[a-zA-Z_]\w*'
+Name = r'[^\d\W]\w*'
 
 Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
 Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
@@ -103,8 +103,10 @@ ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 
-tokenprog, pseudoprog, single3prog, double3prog = list(map(
-    re.compile, (Token, PseudoToken, Single3, Double3)))
+tokenprog = re.compile(Token, re.UNICODE)
+pseudoprog = re.compile(PseudoToken, re.UNICODE)
+single3prog = re.compile(Single3)
+double3prog = re.compile(Double3)
 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
             "'''": single3prog, '"""': double3prog,
             "r'''": single3prog, 'r"""': double3prog,
@@ -358,6 +360,8 @@ def untokenize(iterable):
     ut = Untokenizer()
     return ut.untokenize(iterable)
 
+InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
+
 def generate_tokens(readline):
     """
     The generate_tokens() generator requires one argument, readline, which
@@ -430,23 +434,24 @@ def generate_tokens(readline):
                 yield stashed
                 stashed = None
 
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    nl_pos = pos + len(comment_token)
-                    yield (COMMENT, comment_token,
-                           (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    yield (NL, line[nl_pos:],
-                           (lnum, nl_pos), (lnum, len(line)), line)
-                else:
-                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
-                           (lnum, pos), (lnum, len(line)), line)
+            if line[pos] in '\r\n':            # skip blank lines
+                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                 continue
 
-            if column > indents[-1]:           # count indents or dedents
+            if column > indents[-1]:           # count indents
                 indents.append(column)
                 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
+
+            if line[pos] == '#':               # skip comments
+                comment_token = line[pos:].rstrip('\r\n')
+                nl_pos = pos + len(comment_token)
+                yield (COMMENT, comment_token,
+                        (lnum, pos), (lnum, pos + len(comment_token)), line)
+                yield (NL, line[nl_pos:],
+                        (lnum, nl_pos), (lnum, len(line)), line)
+                continue
+
+            while column < indents[-1]:        # count dedents
                 if column not in indents:
                     raise IndentationError(
                         "unindent does not match any outer indentation level",
@@ -527,7 +532,8 @@ def generate_tokens(readline):
                             yield stashed
                             stashed = None
                         yield (STRING, token, spos, epos, line)
-                elif initial in namechars:                 # ordinary name
+                elif (initial in namechars or              # ordinary name
+                      unicodedata.category(initial) in InitialCategories):
                     if token in ('async', 'await'):
                         if async_def:
                             yield (ASYNC if token == 'async' else AWAIT,