__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
-import string, re
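+# unicodedata is used below to classify the first character of a candidate
+# identifier by its Unicode general category.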
+import string, re, unicodedata
from codecs import BOM_UTF8, lookup
-from lib2to3.pgen2.token import *
+from blib2to3.pgen2.token import *
from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'[a-zA-Z_]\w*'
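+# NOTE: with re.UNICODE, [^\d\W] matches any "word" character that is not a
+# digit -- roughly, any Unicode letter or an underscore -- so identifiers are
+# no longer limited to ASCII.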
+Name = r'[^\d\W]\w*'
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-tokenprog, pseudoprog, single3prog, double3prog = list(map(
-    re.compile, (Token, PseudoToken, Single3, Double3)))
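+# Token and PseudoToken embed Name, so they are compiled with re.UNICODE to
+# let \w cover non-ASCII identifier characters; the plain string-delimiter
+# patterns need no flag.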
+tokenprog = re.compile(Token, re.UNICODE)
+pseudoprog = re.compile(PseudoToken, re.UNICODE)
+single3prog = re.compile(Single3)
+double3prog = re.compile(Double3)
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
"'''": single3prog, '"""': double3prog,
"r'''": single3prog, 'r"""': double3prog,
    ut = Untokenizer()
    return ut.untokenize(iterable)
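+# Unicode general categories accepted for the first character of a name.
+# NOTE: Mn, Mc, Nd and Pc are "continue" rather than "start" categories under
+# PEP 3131; keeping them makes this a permissive superset, which is harmless
+# because the Name regex has already constrained what can reach this check.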
+InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
+
def generate_tokens(readline):
"""
The generate_tokens() generator requires one argument, readline, which
                yield stashed
                stashed = None
-            if line[pos] in '#\r\n':            # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    nl_pos = pos + len(comment_token)
-                    yield (COMMENT, comment_token,
-                           (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    yield (NL, line[nl_pos:],
-                           (lnum, nl_pos), (lnum, len(line)), line)
-                else:
-                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
-                           (lnum, pos), (lnum, len(line)), line)
+            if line[pos] in '\r\n':             # skip blank lines
+                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue
-            if column > indents[-1]:            # count indents or dedents
+            if column > indents[-1]:            # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
+
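+            # Comments are handled after the INDENT bookkeeping above but
+            # before the DEDENT loop below: an over-indented comment line
+            # still emits INDENT, while dedents are deferred to the next
+            # line of real code.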
+            if line[pos] == '#':                # skip comments
+                comment_token = line[pos:].rstrip('\r\n')
+                nl_pos = pos + len(comment_token)
+                yield (COMMENT, comment_token,
+                       (lnum, pos), (lnum, pos + len(comment_token)), line)
+                yield (NL, line[nl_pos:],
+                       (lnum, nl_pos), (lnum, len(line)), line)
+                continue
+
+            while column < indents[-1]:         # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                     # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
-                elif initial in namechars:      # ordinary name
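+                # A name may start with an ASCII letter or underscore (the
+                # namechars fast path) or with any character whose Unicode
+                # category is in InitialCategories.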
+                elif (initial in namechars or   # ordinary name
+                      unicodedata.category(initial) in InitialCategories):
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,