X-Git-Url: https://git.madduck.net/etc/vim.git/blobdiff_plain/e74117f172e29e8a980e2c9de929ad50d3769150..00cadd43eeeaa24d7f01badacacd5dd5f8c21d5f:/blib2to3/pgen2/tokenize.py?ds=sidebyside diff --git a/blib2to3/pgen2/tokenize.py b/blib2to3/pgen2/tokenize.py index 14560e4..6b8a5cb 100644 --- a/blib2to3/pgen2/tokenize.py +++ b/blib2to3/pgen2/tokenize.py @@ -29,9 +29,9 @@ __author__ = 'Ka-Ping Yee ' __credits__ = \ 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' -import string, re +import string, re, unicodedata from codecs import BOM_UTF8, lookup -from lib2to3.pgen2.token import * +from blib2to3.pgen2.token import * from . import token __all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize", @@ -52,7 +52,7 @@ def maybe(*choices): return group(*choices) + '?' Whitespace = r'[ \f\t]*' Comment = r'#[^\r\n]*' Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) -Name = r'[a-zA-Z_]\w*' +Name = r'[^\d\W]\w*' Binnumber = r'0[bB]_?[01]+(?:_[01]+)*' Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?' @@ -103,8 +103,10 @@ ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + PseudoExtras = group(r'\\\r?\n', Comment, Triple) PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) -tokenprog, pseudoprog, single3prog, double3prog = list(map( - re.compile, (Token, PseudoToken, Single3, Double3))) +tokenprog = re.compile(Token, re.UNICODE) +pseudoprog = re.compile(PseudoToken, re.UNICODE) +single3prog = re.compile(Single3) +double3prog = re.compile(Double3) endprogs = {"'": re.compile(Single), '"': re.compile(Double), "'''": single3prog, '"""': double3prog, "r'''": single3prog, 'r"""': double3prog, @@ -234,7 +236,7 @@ class Untokenizer: for tok in iterable: toknum, tokval = tok[:2] - if toknum in (NAME, NUMBER): + if toknum in (NAME, NUMBER, ASYNC, AWAIT): tokval += ' ' if toknum == INDENT: @@ -358,6 +360,8 @@ def untokenize(iterable): ut = Untokenizer() return ut.untokenize(iterable) +InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'} + def generate_tokens(readline): """ The generate_tokens() generator requires one argument, readline, which @@ -380,6 +384,12 @@ def generate_tokens(readline): contline = None indents = [0] + # 'stashed' and 'async_*' are used for async/await parsing + stashed = None + async_def = False + async_def_indent = 0 + async_def_nl = False + while 1: # loop over lines in stream try: line = readline() @@ -420,31 +430,46 @@ def generate_tokens(readline): pos = pos + 1 if pos == max: break - if line[pos] in '#\r\n': # skip comments or blank lines - if line[pos] == '#': - comment_token = line[pos:].rstrip('\r\n') - nl_pos = pos + len(comment_token) - yield (COMMENT, comment_token, - (lnum, pos), (lnum, pos + len(comment_token)), line) - yield (NL, line[nl_pos:], - (lnum, nl_pos), (lnum, len(line)), line) - else: - yield ((NL, COMMENT)[line[pos] == '#'], line[pos:], - (lnum, pos), (lnum, len(line)), line) + if stashed: + yield stashed + stashed = None + + if line[pos] in '\r\n': # skip blank lines + yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line) continue - if column > indents[-1]: # count indents or dedents + if column > indents[-1]: # count indents indents.append(column) yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) - while column < indents[-1]: + + if line[pos] == '#': # skip comments + comment_token = line[pos:].rstrip('\r\n') + nl_pos = pos + len(comment_token) + yield (COMMENT, comment_token, + (lnum, pos), (lnum, pos + len(comment_token)), line) + yield (NL, line[nl_pos:], + (lnum, nl_pos), (lnum, len(line)), line) + continue + + while column < indents[-1]: # count dedents if column not in indents: raise IndentationError( "unindent does not match any outer indentation level", ("", lnum, pos, line)) indents = indents[:-1] + if async_def and async_def_indent >= indents[-1]: + async_def = False + async_def_nl = False + async_def_indent = 0 + yield (DEDENT, '', (lnum, pos), (lnum, pos), line) + if async_def and async_def_nl and async_def_indent >= indents[-1]: + async_def = False + async_def_nl = False + async_def_indent = 0 + else: # continued statement if not line: raise TokenError("EOF in multi-line statement", (lnum, 0)) @@ -452,6 +477,8 @@ def generate_tokens(readline): while pos < max: pseudomatch = pseudoprog.match(line, pos) + if not pseudomatch: + print('no pseudomatch') if pseudomatch: # scan for tokens start, end = pseudomatch.span(1) spos, epos, pos = (lnum, start), (lnum, end), end @@ -464,10 +491,18 @@ def generate_tokens(readline): newline = NEWLINE if parenlev > 0: newline = NL + elif async_def: + async_def_nl = True + if stashed: + yield stashed + stashed = None yield (newline, token, spos, epos, line) elif initial == '#': assert not token.endswith("\n") + if stashed: + yield stashed + stashed = None yield (COMMENT, token, spos, epos, line) elif token in triple_quoted: endprog = endprogs[token] @@ -475,6 +510,9 @@ def generate_tokens(readline): if endmatch: # all on one line pos = endmatch.end(0) token = line[start:pos] + if stashed: + yield stashed + stashed = None yield (STRING, token, spos, (lnum, pos), line) else: strstart = (lnum, start) # multiple lines @@ -492,22 +530,64 @@ def generate_tokens(readline): contline = line break else: # ordinary string + if stashed: + yield stashed + stashed = None yield (STRING, token, spos, epos, line) - elif initial in namechars: # ordinary name - yield (NAME, token, spos, epos, line) + elif (initial in namechars or # ordinary name + unicodedata.category(initial) in InitialCategories): + if token in ('async', 'await'): + if async_def: + yield (ASYNC if token == 'async' else AWAIT, + token, spos, epos, line) + continue + + tok = (NAME, token, spos, epos, line) + if token == 'async' and not stashed: + stashed = tok + continue + + if token == 'def': + if (stashed + and stashed[0] == NAME + and stashed[1] == 'async'): + + async_def = True + async_def_indent = indents[-1] + + yield (ASYNC, stashed[1], + stashed[2], stashed[3], + stashed[4]) + stashed = None + + if stashed: + yield stashed + stashed = None + + yield tok elif initial == '\\': # continued stmt # This yield is new; needed for better idempotency: + if stashed: + yield stashed + stashed = None yield (NL, token, spos, (lnum, pos), line) continued = 1 else: if initial in '([{': parenlev = parenlev + 1 elif initial in ')]}': parenlev = parenlev - 1 + if stashed: + yield stashed + stashed = None yield (OP, token, spos, epos, line) else: yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos+1), line) pos = pos + 1 + if stashed: + yield stashed + stashed = None + for indent in indents[1:]: # pop remaining indent levels yield (DEDENT, '', (lnum, 0), (lnum, 0), '') yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')