From e36b8c71bb78f70735b8b4f239b0f574a6e0f277 Mon Sep 17 00:00:00 2001 From: =?utf8?q?=C5=81ukasz=20Langa?= Date: Wed, 4 Apr 2018 21:38:25 -0700 Subject: [PATCH] [blib2to3] Support non-ASCII identifiers This support isn't *exactly* right per PEP 3131 as the regex engine is a bit too limited for that and I didn't want to spend time on Other_ID_Start and Other_ID_Continue unless they're actually needed. Hopefully this doesn't slow it down too much. --- blib2to3/pgen2/tokenize.py | 17 ++++++++++++----- tests/expression.diff | 7 ++++--- tests/expression.py | 4 ++++ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/blib2to3/pgen2/tokenize.py b/blib2to3/pgen2/tokenize.py index b6bbf4e..6b8a5cb 100644 --- a/blib2to3/pgen2/tokenize.py +++ b/blib2to3/pgen2/tokenize.py @@ -29,7 +29,7 @@ __author__ = 'Ka-Ping Yee ' __credits__ = \ 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' -import string, re +import string, re, unicodedata from codecs import BOM_UTF8, lookup from blib2to3.pgen2.token import * @@ -52,7 +52,7 @@ def maybe(*choices): return group(*choices) + '?' Whitespace = r'[ \f\t]*' Comment = r'#[^\r\n]*' Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) -Name = r'[a-zA-Z_]\w*' +Name = r'[^\d\W]\w*' Binnumber = r'0[bB]_?[01]+(?:_[01]+)*' Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?' @@ -103,8 +103,10 @@ ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + PseudoExtras = group(r'\\\r?\n', Comment, Triple) PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) -tokenprog, pseudoprog, single3prog, double3prog = list(map( - re.compile, (Token, PseudoToken, Single3, Double3))) +tokenprog = re.compile(Token, re.UNICODE) +pseudoprog = re.compile(PseudoToken, re.UNICODE) +single3prog = re.compile(Single3) +double3prog = re.compile(Double3) endprogs = {"'": re.compile(Single), '"': re.compile(Double), "'''": single3prog, '"""': double3prog, "r'''": single3prog, 'r"""': double3prog, @@ -358,6 +360,8 @@ def untokenize(iterable): ut = Untokenizer() return ut.untokenize(iterable) +InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'} + def generate_tokens(readline): """ The generate_tokens() generator requires one argument, readline, which @@ -473,6 +477,8 @@ def generate_tokens(readline): while pos < max: pseudomatch = pseudoprog.match(line, pos) + if not pseudomatch: + print('no pseudomatch') if pseudomatch: # scan for tokens start, end = pseudomatch.span(1) spos, epos, pos = (lnum, start), (lnum, end), end @@ -528,7 +534,8 @@ def generate_tokens(readline): yield stashed stashed = None yield (STRING, token, spos, epos, line) - elif initial in namechars: # ordinary name + elif (initial in namechars or # ordinary name + unicodedata.category(initial) in InitialCategories): if token in ('async', 'await'): if async_def: yield (ASYNC if token == 'async' else AWAIT, diff --git a/tests/expression.diff b/tests/expression.diff index 4cdf803..f37b16b 100644 --- a/tests/expression.diff +++ b/tests/expression.diff @@ -103,7 +103,7 @@ ] slice[0] slice[0:1] -@@ -114,71 +123,90 @@ +@@ -114,73 +123,92 @@ numpy[-(c + 1):, d] numpy[:, l[-2]] numpy[:, ::-1] @@ -142,8 +142,10 @@ +).order_by( + models.Customer.id.asc() +).all() -+ + Ø = set() + authors.łukasz.say_thanks() ++ def gen(): yield from outside_of_generator + @@ -235,4 +237,3 @@ + last_call() # standalone comment at ENDMARKER - diff --git a/tests/expression.py b/tests/expression.py index e0c819b..3cd0c61 100644 --- a/tests/expression.py +++ b/tests/expression.py @@ -135,6 +135,8 @@ e = (1,).count(1) what_is_up_with_those_new_coord_names = (coord_names + set(vars_to_create)) + set(vars_to_remove) what_is_up_with_those_new_coord_names = (coord_names | set(vars_to_create)) - set(vars_to_remove) result = session.query(models.Customer.id).filter(models.Customer.account_id == account_id, models.Customer.email == email_address).order_by(models.Customer.id.asc(),).all() +Ø = set() +authors.łukasz.say_thanks() def gen(): yield from outside_of_generator @@ -340,6 +342,8 @@ result = session.query(models.Customer.id).filter( ).order_by( models.Customer.id.asc() ).all() +Ø = set() +authors.łukasz.say_thanks() def gen(): -- 2.39.5