__credits__ = \
'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
-import string, re
+import regex as re
from codecs import BOM_UTF8, lookup
-from lib2to3.pgen2.token import *
+from blib2to3.pgen2.token import *
from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
+def _combinations(*l):
+ return set(
+ x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
+ )
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'[a-zA-Z_]\w*'
+Name = r'\w+' # this is invalid but it's fine because Name comes after Number in all groups
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
+_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
r"//=?", r"->",
- r"[+\-*/%&@|^=<>]=?",
+ r"[+\-*/%&@|^=<>:]=?",
r"~")
Bracket = '[][(){}]'
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-tokenprog, pseudoprog, single3prog, double3prog = list(map(
- re.compile, (Token, PseudoToken, Single3, Double3)))
+tokenprog = re.compile(Token, re.UNICODE)
+pseudoprog = re.compile(PseudoToken, re.UNICODE)
+single3prog = re.compile(Single3)
+double3prog = re.compile(Double3)
+
+_strprefixes = (
+ _combinations('r', 'R', 'f', 'F') |
+ _combinations('r', 'R', 'b', 'B') |
+ {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
+)
+
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
"'''": single3prog, '"""': double3prog,
- "r'''": single3prog, 'r"""': double3prog,
- "u'''": single3prog, 'u"""': double3prog,
- "b'''": single3prog, 'b"""': double3prog,
- "f'''": single3prog, 'f"""': double3prog,
- "ur'''": single3prog, 'ur"""': double3prog,
- "br'''": single3prog, 'br"""': double3prog,
- "rb'''": single3prog, 'rb"""': double3prog,
- "R'''": single3prog, 'R"""': double3prog,
- "U'''": single3prog, 'U"""': double3prog,
- "B'''": single3prog, 'B"""': double3prog,
- "F'''": single3prog, 'F"""': double3prog,
- "uR'''": single3prog, 'uR"""': double3prog,
- "Ur'''": single3prog, 'Ur"""': double3prog,
- "UR'''": single3prog, 'UR"""': double3prog,
- "bR'''": single3prog, 'bR"""': double3prog,
- "Br'''": single3prog, 'Br"""': double3prog,
- "BR'''": single3prog, 'BR"""': double3prog,
- "rB'''": single3prog, 'rB"""': double3prog,
- "Rb'''": single3prog, 'Rb"""': double3prog,
- "RB'''": single3prog, 'RB"""': double3prog,
- 'r': None, 'R': None,
- 'u': None, 'U': None,
- 'f': None, 'F': None,
- 'b': None, 'B': None}
-
-triple_quoted = {}
-for t in ("'''", '"""',
- "r'''", 'r"""', "R'''", 'R"""',
- "u'''", 'u"""', "U'''", 'U"""',
- "b'''", 'b"""', "B'''", 'B"""',
- "f'''", 'f"""', "F'''", 'F"""',
- "ur'''", 'ur"""', "Ur'''", 'Ur"""',
- "uR'''", 'uR"""', "UR'''", 'UR"""',
- "br'''", 'br"""', "Br'''", 'Br"""',
- "bR'''", 'bR"""', "BR'''", 'BR"""',
- "rb'''", 'rb"""', "Rb'''", 'Rb"""',
- "rB'''", 'rB"""', "RB'''", 'RB"""',):
- triple_quoted[t] = t
-single_quoted = {}
-for t in ("'", '"',
- "r'", 'r"', "R'", 'R"',
- "u'", 'u"', "U'", 'U"',
- "b'", 'b"', "B'", 'B"',
- "f'", 'f"', "F'", 'F"',
- "ur'", 'ur"', "Ur'", 'Ur"',
- "uR'", 'uR"', "UR'", 'UR"',
- "br'", 'br"', "Br'", 'Br"',
- "bR'", 'bR"', "BR'", 'BR"',
- "rb'", 'rb"', "Rb'", 'Rb"',
- "rB'", 'rB"', "RB'", 'RB"',):
- single_quoted[t] = t
+ **{f"{prefix}'''": single3prog for prefix in _strprefixes},
+ **{f'{prefix}"""': double3prog for prefix in _strprefixes},
+ **{prefix: None for prefix in _strprefixes}}
+
+triple_quoted = (
+ {"'''", '"""'} |
+ {f"{prefix}'''" for prefix in _strprefixes} |
+ {f'{prefix}"""' for prefix in _strprefixes}
+)
+single_quoted = (
+ {"'", '"'} |
+ {f"{prefix}'" for prefix in _strprefixes} |
+ {f'{prefix}"' for prefix in _strprefixes}
+)
tabsize = 8
ut = Untokenizer()
return ut.untokenize(iterable)
-def generate_tokens(readline):
+def generate_tokens(readline, grammar=None):
"""
The generate_tokens() generator requires one argument, readline, which
must be a callable object which provides the same interface as the
logical line; continuation lines are included.
"""
lnum = parenlev = continued = 0
- namechars, numchars = string.ascii_letters + '_', '0123456789'
+ numchars = '0123456789'
contstr, needcont = '', 0
contline = None
indents = [0]
+ # If we know we're parsing 3.7+, we can unconditionally parse `async` and
+ # `await` as keywords.
+ async_keywords = False if grammar is None else grammar.async_keywords
# 'stashed' and 'async_*' are used for async/await parsing
stashed = None
async_def = False
yield stashed
stashed = None
- if line[pos] in '#\r\n': # skip comments or blank lines
- if line[pos] == '#':
- comment_token = line[pos:].rstrip('\r\n')
- nl_pos = pos + len(comment_token)
- yield (COMMENT, comment_token,
- (lnum, pos), (lnum, pos + len(comment_token)), line)
- yield (NL, line[nl_pos:],
- (lnum, nl_pos), (lnum, len(line)), line)
- else:
- yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
- (lnum, pos), (lnum, len(line)), line)
+ if line[pos] in '\r\n': # skip blank lines
+ yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
continue
- if column > indents[-1]: # count indents or dedents
+ if line[pos] == '#': # skip comments
+ comment_token = line[pos:].rstrip('\r\n')
+ nl_pos = pos + len(comment_token)
+ yield (COMMENT, comment_token,
+ (lnum, pos), (lnum, pos + len(comment_token)), line)
+ yield (NL, line[nl_pos:],
+ (lnum, nl_pos), (lnum, len(line)), line)
+ continue
+
+ if column > indents[-1]: # count indents
indents.append(column)
yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
- while column < indents[-1]:
+
+ while column < indents[-1]: # count dedents
if column not in indents:
raise IndentationError(
"unindent does not match any outer indentation level",
yield stashed
stashed = None
yield (STRING, token, spos, epos, line)
- elif initial in namechars: # ordinary name
+ elif initial.isidentifier(): # ordinary name
if token in ('async', 'await'):
- if async_def:
+ if async_keywords or async_def:
yield (ASYNC if token == 'async' else AWAIT,
token, spos, epos, line)
continue
stashed = tok
continue
- if token == 'def':
+ if token in ('def', 'for'):
if (stashed
and stashed[0] == NAME
and stashed[1] == 'async'):
- async_def = True
- async_def_indent = indents[-1]
+ if token == 'def':
+ async_def = True
+ async_def_indent = indents[-1]
yield (ASYNC, stashed[1],
stashed[2], stashed[3],