function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
-__author__ = 'Ka-Ping Yee <ping@lfw.org>'
-__credits__ = \
- 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
+__author__ = "Ka-Ping Yee <ping@lfw.org>"
+__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
-import string, re, unicodedata
+import regex as re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *
from . import token
-__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
- "generate_tokens", "untokenize"]
+
+__all__ = [x for x in dir(token) if x[0] != "_"] + [
+ "tokenize",
+ "generate_tokens",
+ "untokenize",
+]
del token
try:
# valid Python 3 code.
bytes = str
-def group(*choices): return '(' + '|'.join(choices) + ')'
-def any(*choices): return group(*choices) + '*'
-def maybe(*choices): return group(*choices) + '?'
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'[^\d\W]\w*'
def group(*choices):
    """Return regex source matching any one of *choices*.

    The alternatives are joined with ``|`` and wrapped in a capturing group.
    """
    alternation = "|".join(choices)
    return f"({alternation})"


def any(*choices):
    """Return regex source matching zero or more repetitions of *choices*.

    NOTE: this deliberately keeps its historical name, which shadows the
    builtin ``any`` inside this module.
    """
    return group(*choices) + "*"


def maybe(*choices):
    """Return regex source matching *choices* zero or one time."""
    return group(*choices) + "?"
+
+
def _combinations(*prefixes):
    """Return the set of one- and two-letter strings buildable from *prefixes*.

    Every letter is paired with every letter and with the empty string (which
    yields the letter on its own). Pairs whose two halves are equal when
    case-folded, such as "rR" or "rr", are left out.
    """
    return {
        first + second
        for first in prefixes
        for second in (*prefixes, "")
        if first.casefold() != second.casefold()
    }
+
+
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
# Leading whitespace, any number of backslash-newline continuations, then an
# optional comment.
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
# `\w+` also matches runs that start with a digit, which are not valid
# identifiers. That is harmless because Number is listed before Name in every
# alternation that uses it (PlainToken and PseudoToken).
Name = r"\w+"
# Integer literals. Digit groups may be separated by single underscores; the
# hex, octal and decimal forms also accept an optional trailing l/L.
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Float literals: either a form containing a decimal point with an optional
# exponent, or a plain digit run with a mandatory exponent.
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: a digit run or a float followed by j/J.
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: one of u/r/b/f, r followed by f or b, or one of
# f/b/u followed by r (any letter case).
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string, including any prefix.
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    # The ":" in this class lets ":=" match as a single operator.
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)
# Number precedes Name so that the permissive Name pattern never claims a
# numeric literal.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string: the string either closes on this
# line or the line ends with a backslash-newline continuation.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
# Group 1 of PseudoToken is the token text without its leading whitespace.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
# Every accepted string prefix: r/f and r/b alone or paired in either order
# and any letter case, plus u and the u-then-r pairs.
_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

# Maps a string opener to the compiled pattern matching the remainder of that
# string. Bare prefixes map to None so that generate_tokens' lookup
# ``endprogs[initial] or endprogs[token[1]] or endprogs[token[2]]`` falls
# through to the entry for the quote character itself.
endprogs = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
    **{prefix: None for prefix in _strprefixes},
}

# All triple-quote openers, with and without a prefix.
triple_quoted = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
# All single-quote openers, with and without a prefix.
single_quoted = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)
tabsize = 8
class TokenError(Exception):
    """Raised when input ends inside an unfinished string or statement."""


class StopTokenizing(Exception):
    """Raised to end tokenizing early; tokenize() catches and discards it."""
+
def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line):  # for testing
    """Print one token as "srow,scol-erow,ecol:", token type name and repr.

    The three fields are tab-separated. *xxx_todo_changeme* is the
    (row, column) where the token starts and *xxx_todo_changeme1* the
    (row, column) where it ends. *line* is accepted so the signature matches
    the five-argument tokeneater interface, but it is not printed.
    """
    (srow, scol) = xxx_todo_changeme
    (erow, ecol) = xxx_todo_changeme1
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )
+
def tokenize(readline, tokeneater=printtoken):
"""
except StopTokenizing:
pass
+
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Pass every token generated from *readline* to *tokeneater*.

    Each tuple yielded by generate_tokens() is unpacked into positional
    arguments of the *tokeneater* call.
    """
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
-class Untokenizer:
+class Untokenizer:
def __init__(self):
self.tokens = []
self.prev_row = 1
toks_append = self.tokens.append
toknum, tokval = token
if toknum in (NAME, NUMBER):
- tokval += ' '
+ tokval += " "
if toknum in (NEWLINE, NL):
startline = True
for tok in iterable:
toknum, tokval = tok[:2]
if toknum in (NAME, NUMBER, ASYNC, AWAIT):
- tokval += ' '
+ tokval += " "
if toknum == INDENT:
indents.append(tokval)
startline = False
toks_append(tokval)
# Matches a comment line carrying an encoding declaration; group 1 is the
# declared encoding name.
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# Matches a bytes line that is empty, whitespace-only, or holds only a comment.
blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
+
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c.

    Returns "utf-8" or "iso-8859-1" when *orig_enc* is a recognised spelling
    of one of those: case-insensitive, with ``_`` treated as ``-``, and
    optionally followed by a ``-suffix``. Any other value is returned
    unchanged.
    """
    # Only the first 12 characters take part in the comparison.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    latin_names = ("latin-1", "iso-8859-1", "iso-latin-1")
    if enc in latin_names or enc.startswith(tuple(name + "-" for name in latin_names)):
        return "iso-8859-1"
    return orig_enc
+
def detect_encoding(readline):
"""
The detect_encoding() function is used to detect the encoding that should
"""
bom_found = False
encoding = None
- default = 'utf-8'
+ default = "utf-8"
+
def read_or_stop():
try:
return readline()
def find_cookie(line):
try:
- line_string = line.decode('ascii')
+ line_string = line.decode("ascii")
except UnicodeDecodeError:
return None
match = cookie_re.match(line_string)
raise SyntaxError("unknown encoding: " + encoding)
if bom_found:
- if codec.name != 'utf-8':
+ if codec.name != "utf-8":
# This behaviour mimics the Python interpreter
- raise SyntaxError('encoding problem: utf-8')
- encoding += '-sig'
+ raise SyntaxError("encoding problem: utf-8")
+ encoding += "-sig"
return encoding
first = read_or_stop()
if first.startswith(BOM_UTF8):
bom_found = True
first = first[3:]
- default = 'utf-8-sig'
+ default = "utf-8-sig"
if not first:
return default, []
return default, [first, second]
+
def untokenize(iterable):
"""Transform tokens back into Python source code.
ut = Untokenizer()
return ut.untokenize(iterable)
-InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
-def generate_tokens(readline):
+def generate_tokens(readline, grammar=None):
"""
The generate_tokens() generator requires one argument, readline, which
must be a callable object which provides the same interface as the
logical line; continuation lines are included.
"""
lnum = parenlev = continued = 0
- namechars, numchars = string.ascii_letters + '_', '0123456789'
- contstr, needcont = '', 0
+ numchars = "0123456789"
+ contstr, needcont = "", 0
contline = None
indents = [0]
+ # If we know we're parsing 3.7+, we can unconditionally parse `async` and
+ # `await` as keywords.
+ async_keywords = False if grammar is None else grammar.async_keywords
# 'stashed' and 'async_*' are used for async/await parsing
stashed = None
async_def = False
async_def_indent = 0
async_def_nl = False
- while 1: # loop over lines in stream
+ while 1: # loop over lines in stream
try:
line = readline()
except StopIteration:
- line = ''
+ line = ""
lnum = lnum + 1
pos, max = 0, len(line)
- if contstr: # continued string
+ if contstr: # continued string
if not line:
raise TokenError("EOF in multi-line string", strstart)
endmatch = endprog.match(line)
if endmatch:
pos = end = endmatch.end(0)
- yield (STRING, contstr + line[:end],
- strstart, (lnum, end), contline + line)
- contstr, needcont = '', 0
+ yield (
+ STRING,
+ contstr + line[:end],
+ strstart,
+ (lnum, end),
+ contline + line,
+ )
+ contstr, needcont = "", 0
contline = None
- elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
- yield (ERRORTOKEN, contstr + line,
- strstart, (lnum, len(line)), contline)
- contstr = ''
+ elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
+ yield (
+ ERRORTOKEN,
+ contstr + line,
+ strstart,
+ (lnum, len(line)),
+ contline,
+ )
+ contstr = ""
contline = None
continue
else:
continue
elif parenlev == 0 and not continued: # new statement
- if not line: break
+ if not line:
+ break
column = 0
- while pos < max: # measure leading whitespace
- if line[pos] == ' ': column = column + 1
- elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
- elif line[pos] == '\f': column = 0
- else: break
+ while pos < max: # measure leading whitespace
+ if line[pos] == " ":
+ column = column + 1
+ elif line[pos] == "\t":
+ column = (column // tabsize + 1) * tabsize
+ elif line[pos] == "\f":
+ column = 0
+ else:
+ break
pos = pos + 1
- if pos == max: break
+ if pos == max:
+ break
if stashed:
yield stashed
stashed = None
- if line[pos] in '\r\n': # skip blank lines
+ if line[pos] in "\r\n": # skip blank lines
yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
continue
- if column > indents[-1]: # count indents
- indents.append(column)
- yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-
- if line[pos] == '#': # skip comments
- comment_token = line[pos:].rstrip('\r\n')
+ if line[pos] == "#": # skip comments
+ comment_token = line[pos:].rstrip("\r\n")
nl_pos = pos + len(comment_token)
- yield (COMMENT, comment_token,
- (lnum, pos), (lnum, pos + len(comment_token)), line)
- yield (NL, line[nl_pos:],
- (lnum, nl_pos), (lnum, len(line)), line)
+ yield (
+ COMMENT,
+ comment_token,
+ (lnum, pos),
+ (lnum, pos + len(comment_token)),
+ line,
+ )
+ yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
continue
- while column < indents[-1]: # count dedents
+ if column > indents[-1]: # count indents
+ indents.append(column)
+ yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+
+ while column < indents[-1]: # count dedents
if column not in indents:
raise IndentationError(
"unindent does not match any outer indentation level",
- ("<tokenize>", lnum, pos, line))
+ ("<tokenize>", lnum, pos, line),
+ )
indents = indents[:-1]
if async_def and async_def_indent >= indents[-1]:
async_def_nl = False
async_def_indent = 0
- yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
+ yield (DEDENT, "", (lnum, pos), (lnum, pos), line)
if async_def and async_def_nl and async_def_indent >= indents[-1]:
async_def = False
async_def_nl = False
async_def_indent = 0
- else: # continued statement
+ else: # continued statement
if not line:
raise TokenError("EOF in multi-line statement", (lnum, 0))
continued = 0
while pos < max:
pseudomatch = pseudoprog.match(line, pos)
- if not pseudomatch:
- print('no pseudomatch')
- if pseudomatch: # scan for tokens
+ if pseudomatch: # scan for tokens
start, end = pseudomatch.span(1)
spos, epos, pos = (lnum, start), (lnum, end), end
token, initial = line[start:end], line[start]
- if initial in numchars or \
- (initial == '.' and token != '.'): # ordinary number
+ if initial in numchars or (
+ initial == "." and token != "."
+ ): # ordinary number
yield (NUMBER, token, spos, epos, line)
- elif initial in '\r\n':
+ elif initial in "\r\n":
newline = NEWLINE
if parenlev > 0:
newline = NL
stashed = None
yield (newline, token, spos, epos, line)
- elif initial == '#':
+ elif initial == "#":
assert not token.endswith("\n")
if stashed:
yield stashed
elif token in triple_quoted:
endprog = endprogs[token]
endmatch = endprog.match(line, pos)
- if endmatch: # all on one line
+ if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
if stashed:
stashed = None
yield (STRING, token, spos, (lnum, pos), line)
else:
- strstart = (lnum, start) # multiple lines
+ strstart = (lnum, start) # multiple lines
contstr = line[start:]
contline = line
break
- elif initial in single_quoted or \
- token[:2] in single_quoted or \
- token[:3] in single_quoted:
- if token[-1] == '\n': # continued string
+ elif (
+ initial in single_quoted
+ or token[:2] in single_quoted
+ or token[:3] in single_quoted
+ ):
+ if token[-1] == "\n": # continued string
strstart = (lnum, start)
- endprog = (endprogs[initial] or endprogs[token[1]] or
- endprogs[token[2]])
+ endprog = (
+ endprogs[initial]
+ or endprogs[token[1]]
+ or endprogs[token[2]]
+ )
contstr, needcont = line[start:], 1
contline = line
break
- else: # ordinary string
+ else: # ordinary string
if stashed:
yield stashed
stashed = None
yield (STRING, token, spos, epos, line)
- elif (initial in namechars or # ordinary name
- unicodedata.category(initial) in InitialCategories):
- if token in ('async', 'await'):
- if async_def:
- yield (ASYNC if token == 'async' else AWAIT,
- token, spos, epos, line)
+ elif initial.isidentifier(): # ordinary name
+ if token in ("async", "await"):
+ if async_keywords or async_def:
+ yield (
+ ASYNC if token == "async" else AWAIT,
+ token,
+ spos,
+ epos,
+ line,
+ )
continue
tok = (NAME, token, spos, epos, line)
- if token == 'async' and not stashed:
+ if token == "async" and not stashed:
stashed = tok
continue
- if token == 'def':
- if (stashed
- and stashed[0] == NAME
- and stashed[1] == 'async'):
+ if token in ("def", "for"):
+ if stashed and stashed[0] == NAME and stashed[1] == "async":
- async_def = True
- async_def_indent = indents[-1]
+ if token == "def":
+ async_def = True
+ async_def_indent = indents[-1]
- yield (ASYNC, stashed[1],
- stashed[2], stashed[3],
- stashed[4])
+ yield (
+ ASYNC,
+ stashed[1],
+ stashed[2],
+ stashed[3],
+ stashed[4],
+ )
stashed = None
if stashed:
stashed = None
yield tok
- elif initial == '\\': # continued stmt
+ elif initial == "\\": # continued stmt
# This yield is new; needed for better idempotency:
if stashed:
yield stashed
yield (NL, token, spos, (lnum, pos), line)
continued = 1
else:
- if initial in '([{': parenlev = parenlev + 1
- elif initial in ')]}': parenlev = parenlev - 1
+ if initial in "([{":
+ parenlev = parenlev + 1
+ elif initial in ")]}":
+ parenlev = parenlev - 1
if stashed:
yield stashed
stashed = None
yield (OP, token, spos, epos, line)
else:
- yield (ERRORTOKEN, line[pos],
- (lnum, pos), (lnum, pos+1), line)
+ yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
pos = pos + 1
if stashed:
yield stashed
stashed = None
- for indent in indents[1:]: # pop remaining indent levels
- yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
- yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+ for indent in indents[1:]: # pop remaining indent levels
+ yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
+ yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
if __name__ == "__main__":  # testing
    import sys

    # Tokenize the file named on the command line, or stdin when no argument
    # is given; tokens go to tokenize()'s default tokeneater.
    if len(sys.argv) > 1:
        # Close the file once tokenizing is finished instead of leaking the
        # handle until interpreter exit.
        with open(sys.argv[1]) as source:
            tokenize(source.readline)
    else:
        tokenize(sys.stdin.readline)