X-Git-Url: https://git.madduck.net/etc/vim.git/blobdiff_plain/ecdbf085a772e8d737b8a8735d39a7af413cecfb..3bfb66971f03da39ae1f4c98c30d55e60f63d33b:/blib2to3/pgen2/tokenize.py?ds=inline diff --git a/blib2to3/pgen2/tokenize.py b/blib2to3/pgen2/tokenize.py index 4f03130..9775489 100644 --- a/blib2to3/pgen2/tokenize.py +++ b/blib2to3/pgen2/tokenize.py @@ -29,7 +29,7 @@ __author__ = 'Ka-Ping Yee ' __credits__ = \ 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' -import string, re, unicodedata +import regex as re from codecs import BOM_UTF8, lookup from blib2to3.pgen2.token import * @@ -56,7 +56,7 @@ def _combinations(*l): Whitespace = r'[ \f\t]*' Comment = r'#[^\r\n]*' Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) -Name = r'[^\d\W]\w*' +Name = r'\w+' # this is invalid but it's fine because Name comes after Number in all groups Binnumber = r'0[bB]_?[01]+(?:_[01]+)*' Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?' @@ -89,7 +89,7 @@ String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", # recognized as two instances of =). Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", r"//=?", r"->", - r"[+\-*/%&@|^=<>]=?", + r"[+\-*/%&@|^=<>:]=?", r"~") Bracket = '[][(){}]' @@ -334,9 +334,7 @@ def untokenize(iterable): ut = Untokenizer() return ut.untokenize(iterable) -InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'} - -def generate_tokens(readline): +def generate_tokens(readline, grammar=None): """ The generate_tokens() generator requires one argument, readline, which must be a callable object which provides the same interface as the @@ -353,11 +351,14 @@ def generate_tokens(readline): logical line; continuation lines are included. """ lnum = parenlev = continued = 0 - namechars, numchars = string.ascii_letters + '_', '0123456789' + numchars = '0123456789' contstr, needcont = '', 0 contline = None indents = [0] + # If we know we're parsing 3.7+, we can unconditionally parse `async` and + # `await` as keywords. + async_keywords = False if grammar is None else grammar.async_keywords # 'stashed' and 'async_*' are used for async/await parsing stashed = None async_def = False @@ -412,10 +413,6 @@ def generate_tokens(readline): yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line) continue - if column > indents[-1]: # count indents - indents.append(column) - yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) - if line[pos] == '#': # skip comments comment_token = line[pos:].rstrip('\r\n') nl_pos = pos + len(comment_token) @@ -425,6 +422,10 @@ def generate_tokens(readline): (lnum, nl_pos), (lnum, len(line)), line) continue + if column > indents[-1]: # count indents + indents.append(column) + yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) + while column < indents[-1]: # count dedents if column not in indents: raise IndentationError( @@ -451,8 +452,6 @@ def generate_tokens(readline): while pos < max: pseudomatch = pseudoprog.match(line, pos) - if not pseudomatch: - print('no pseudomatch') if pseudomatch: # scan for tokens start, end = pseudomatch.span(1) spos, epos, pos = (lnum, start), (lnum, end), end @@ -508,10 +507,9 @@ def generate_tokens(readline): yield stashed stashed = None yield (STRING, token, spos, epos, line) - elif (initial in namechars or # ordinary name - unicodedata.category(initial) in InitialCategories): + elif initial.isidentifier(): # ordinary name if token in ('async', 'await'): - if async_def: + if async_keywords or async_def: yield (ASYNC if token == 'async' else AWAIT, token, spos, epos, line) continue @@ -521,13 +519,14 @@ def generate_tokens(readline): stashed = tok continue - if token == 'def': + if token in ('def', 'for'): if (stashed and stashed[0] == NAME and stashed[1] == 'async'): - async_def = True - async_def_indent = indents[-1] + if token == 'def': + async_def = True + async_def_indent = indents[-1] yield (ASYNC, stashed[1], stashed[2], stashed[3],