X-Git-Url: https://git.madduck.net/etc/vim.git/blobdiff_plain/5fa38d4c3bdae68abfe235709b69b1bc8ae75c3a..be35b1ed2d1f9453090bb29ac24abe997815ff70:/blib2to3/pgen2/tokenize.py?ds=sidebyside diff --git a/blib2to3/pgen2/tokenize.py b/blib2to3/pgen2/tokenize.py index 6dada47..9775489 100644 --- a/blib2to3/pgen2/tokenize.py +++ b/blib2to3/pgen2/tokenize.py @@ -29,7 +29,7 @@ __author__ = 'Ka-Ping Yee ' __credits__ = \ 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' -import string, re +import regex as re from codecs import BOM_UTF8, lookup from blib2to3.pgen2.token import * @@ -48,11 +48,15 @@ except NameError: def group(*choices): return '(' + '|'.join(choices) + ')' def any(*choices): return group(*choices) + '*' def maybe(*choices): return group(*choices) + '?' +def _combinations(*l): + return set( + x + y for x in l for y in l + ("",) if x.casefold() != y.casefold() + ) Whitespace = r'[ \f\t]*' Comment = r'#[^\r\n]*' Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) -Name = r'[a-zA-Z_]\w*' +Name = r'\w+' # this is invalid but it's fine because Name comes after Number in all groups Binnumber = r'0[bB]_?[01]+(?:_[01]+)*' Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?' @@ -74,7 +78,7 @@ Double = r'[^"\\]*(?:\\.[^"\\]*)*"' Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" # Tail end of """ string. Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' -_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?" +_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?" Triple = group(_litprefix + "'''", _litprefix + '"""') # Single-line ' or " string. String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", @@ -85,7 +89,7 @@ String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", # recognized as two instances of =). Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", r"//=?", r"->", - r"[+\-*/%&@|^=<>]=?", + r"[+\-*/%&@|^=<>:]=?", r"~") Bracket = '[][(){}]' @@ -103,61 +107,33 @@ ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + PseudoExtras = group(r'\\\r?\n', Comment, Triple) PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) -tokenprog, pseudoprog, single3prog, double3prog = list(map( - re.compile, (Token, PseudoToken, Single3, Double3))) +tokenprog = re.compile(Token, re.UNICODE) +pseudoprog = re.compile(PseudoToken, re.UNICODE) +single3prog = re.compile(Single3) +double3prog = re.compile(Double3) + +_strprefixes = ( + _combinations('r', 'R', 'f', 'F') | + _combinations('r', 'R', 'b', 'B') | + {'u', 'U', 'ur', 'uR', 'Ur', 'UR'} +) + endprogs = {"'": re.compile(Single), '"': re.compile(Double), "'''": single3prog, '"""': double3prog, - "r'''": single3prog, 'r"""': double3prog, - "u'''": single3prog, 'u"""': double3prog, - "b'''": single3prog, 'b"""': double3prog, - "f'''": single3prog, 'f"""': double3prog, - "ur'''": single3prog, 'ur"""': double3prog, - "br'''": single3prog, 'br"""': double3prog, - "rb'''": single3prog, 'rb"""': double3prog, - "R'''": single3prog, 'R"""': double3prog, - "U'''": single3prog, 'U"""': double3prog, - "B'''": single3prog, 'B"""': double3prog, - "F'''": single3prog, 'F"""': double3prog, - "uR'''": single3prog, 'uR"""': double3prog, - "Ur'''": single3prog, 'Ur"""': double3prog, - "UR'''": single3prog, 'UR"""': double3prog, - "bR'''": single3prog, 'bR"""': double3prog, - "Br'''": single3prog, 'Br"""': double3prog, - "BR'''": single3prog, 'BR"""': double3prog, - "rB'''": single3prog, 'rB"""': double3prog, - "Rb'''": single3prog, 'Rb"""': double3prog, - "RB'''": single3prog, 'RB"""': double3prog, - 'r': None, 'R': None, - 'u': None, 'U': None, - 'f': None, 'F': None, - 'b': None, 'B': None} - -triple_quoted = {} -for t in ("'''", '"""', - "r'''", 'r"""', "R'''", 'R"""', - "u'''", 'u"""', "U'''", 'U"""', - "b'''", 'b"""', "B'''", 'B"""', - "f'''", 'f"""', "F'''", 'F"""', - "ur'''", 'ur"""', "Ur'''", 'Ur"""', - "uR'''", 'uR"""', "UR'''", 'UR"""', - "br'''", 'br"""', "Br'''", 'Br"""', - "bR'''", 'bR"""', "BR'''", 'BR"""', - "rb'''", 'rb"""', "Rb'''", 'Rb"""', - "rB'''", 'rB"""', "RB'''", 'RB"""',): - triple_quoted[t] = t -single_quoted = {} -for t in ("'", '"', - "r'", 'r"', "R'", 'R"', - "u'", 'u"', "U'", 'U"', - "b'", 'b"', "B'", 'B"', - "f'", 'f"', "F'", 'F"', - "ur'", 'ur"', "Ur'", 'Ur"', - "uR'", 'uR"', "UR'", 'UR"', - "br'", 'br"', "Br'", 'Br"', - "bR'", 'bR"', "BR'", 'BR"', - "rb'", 'rb"', "Rb'", 'Rb"', - "rB'", 'rB"', "RB'", 'RB"',): - single_quoted[t] = t + **{f"{prefix}'''": single3prog for prefix in _strprefixes}, + **{f'{prefix}"""': double3prog for prefix in _strprefixes}, + **{prefix: None for prefix in _strprefixes}} + +triple_quoted = ( + {"'''", '"""'} | + {f"{prefix}'''" for prefix in _strprefixes} | + {f'{prefix}"""' for prefix in _strprefixes} +) +single_quoted = ( + {"'", '"'} | + {f"{prefix}'" for prefix in _strprefixes} | + {f'{prefix}"' for prefix in _strprefixes} +) tabsize = 8 @@ -358,7 +334,7 @@ def untokenize(iterable): ut = Untokenizer() return ut.untokenize(iterable) -def generate_tokens(readline): +def generate_tokens(readline, grammar=None): """ The generate_tokens() generator requires one argument, readline, which must be a callable object which provides the same interface as the @@ -375,11 +351,14 @@ def generate_tokens(readline): logical line; continuation lines are included. """ lnum = parenlev = continued = 0 - namechars, numchars = string.ascii_letters + '_', '0123456789' + numchars = '0123456789' contstr, needcont = '', 0 contline = None indents = [0] + # If we know we're parsing 3.7+, we can unconditionally parse `async` and + # `await` as keywords. + async_keywords = False if grammar is None else grammar.async_keywords # 'stashed' and 'async_*' are used for async/await parsing stashed = None async_def = False @@ -430,23 +409,24 @@ def generate_tokens(readline): yield stashed stashed = None - if line[pos] in '#\r\n': # skip comments or blank lines - if line[pos] == '#': - comment_token = line[pos:].rstrip('\r\n') - nl_pos = pos + len(comment_token) - yield (COMMENT, comment_token, - (lnum, pos), (lnum, pos + len(comment_token)), line) - yield (NL, line[nl_pos:], - (lnum, nl_pos), (lnum, len(line)), line) - else: - yield ((NL, COMMENT)[line[pos] == '#'], line[pos:], - (lnum, pos), (lnum, len(line)), line) + if line[pos] in '\r\n': # skip blank lines + yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line) continue - if column > indents[-1]: # count indents or dedents + if line[pos] == '#': # skip comments + comment_token = line[pos:].rstrip('\r\n') + nl_pos = pos + len(comment_token) + yield (COMMENT, comment_token, + (lnum, pos), (lnum, pos + len(comment_token)), line) + yield (NL, line[nl_pos:], + (lnum, nl_pos), (lnum, len(line)), line) + continue + + if column > indents[-1]: # count indents indents.append(column) yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) - while column < indents[-1]: + + while column < indents[-1]: # count dedents if column not in indents: raise IndentationError( "unindent does not match any outer indentation level", @@ -527,9 +507,9 @@ def generate_tokens(readline): yield stashed stashed = None yield (STRING, token, spos, epos, line) - elif initial in namechars: # ordinary name + elif initial.isidentifier(): # ordinary name if token in ('async', 'await'): - if async_def: + if async_keywords or async_def: yield (ASYNC if token == 'async' else AWAIT, token, spos, epos, line) continue @@ -539,13 +519,14 @@ def generate_tokens(readline): stashed = tok continue - if token == 'def': + if token in ('def', 'for'): if (stashed and stashed[0] == NAME and stashed[1] == 'async'): - async_def = True - async_def_indent = indents[-1] + if token == 'def': + async_def = True + async_def_indent = indents[-1] yield (ASYNC, stashed[1], stashed[2], stashed[3],