[blib2to3] Support non-ASCII identifiers

author Łukasz Langa <lukasz@langa.pl>

Thu, 5 Apr 2018 04:38:25 +0000 (21:38 -0700)

committer Łukasz Langa <lukasz@langa.pl>

Thu, 5 Apr 2018 09:29:01 +0000 (02:29 -0700)
author Łukasz Langa <lukasz@langa.pl>
Thu, 5 Apr 2018 04:38:25 +0000 (21:38 -0700)
committer Łukasz Langa <lukasz@langa.pl>
Thu, 5 Apr 2018 09:29:01 +0000 (02:29 -0700)
diff --git a/blib2to3/pgen2/tokenize.py b/blib2to3/pgen2/tokenize.py

index b6bbf4ec7dde2d912690be28a938bfb7f0742cc0..6b8a5cb2ef54fb0bdbd98f2d2e20ac73f7ae3c3c 100644 (file)
--- a/blib2to3/pgen2/tokenize.py
+++ b/blib2to3/pgen2/tokenize.py
@@ -29,7 +29,7 @@ __author__ = 'Ka-Ping Yee <ping@lfw.org>'
  __credits__ = \
      'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
  
-import string, re
+import string, re, unicodedata
  from codecs import BOM_UTF8, lookup
  from blib2to3.pgen2.token import *
  
@@ -52,7 +52,7 @@ def maybe(*choices): return group(*choices) + '?'
  Whitespace = r'[ \f\t]*'
  Comment = r'#[^\r\n]*'
  Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'[a-zA-Z_]\w*'
+Name = r'[^\d\W]\w*'
  
  Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
  Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
@@ -103,8 +103,10 @@ ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
  PseudoExtras = group(r'\\\r?\n', Comment, Triple)
  PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
  
-tokenprog, pseudoprog, single3prog, double3prog = list(map(
-    re.compile, (Token, PseudoToken, Single3, Double3)))
+tokenprog = re.compile(Token, re.UNICODE)
+pseudoprog = re.compile(PseudoToken, re.UNICODE)
+single3prog = re.compile(Single3)
+double3prog = re.compile(Double3)
  endprogs = {"'": re.compile(Single), '"': re.compile(Double),
              "'''": single3prog, '"""': double3prog,
              "r'''": single3prog, 'r"""': double3prog,
@@ -358,6 +360,8 @@ def untokenize(iterable):
      ut = Untokenizer()
      return ut.untokenize(iterable)
  
+InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
+
  def generate_tokens(readline):
      """
      The generate_tokens() generator requires one argument, readline, which
@@ -473,6 +477,8 @@ def generate_tokens(readline):
  
          while pos < max:
              pseudomatch = pseudoprog.match(line, pos)
+            if not pseudomatch:
+                print('no pseudomatch')
              if pseudomatch:                                # scan for tokens
                  start, end = pseudomatch.span(1)
                  spos, epos, pos = (lnum, start), (lnum, end), end
@@ -528,7 +534,8 @@ def generate_tokens(readline):
                              yield stashed
                              stashed = None
                          yield (STRING, token, spos, epos, line)
-                elif initial in namechars:                 # ordinary name
+                elif (initial in namechars or              # ordinary name
+                      unicodedata.category(initial) in InitialCategories):
                      if token in ('async', 'await'):
                          if async_def:
                              yield (ASYNC if token == 'async' else AWAIT,
diff --git a/tests/expression.diff b/tests/expression.diff

index 4cdf803e8889d263ef98fdf40b0485301fc49f9c..f37b16bde745c601f6c692bcdb951c4161a12267 100644 (file)
--- a/tests/expression.diff
+++ b/tests/expression.diff
@@ -103,7 +103,7 @@
   ]
   slice[0]
   slice[0:1]
-@@ -114,71 +123,90 @@
+@@ -114,73 +123,92 @@
   numpy[-(c + 1):, d]
   numpy[:, l[-2]]
   numpy[:, ::-1]
@@ -142,8 +142,10 @@
  +).order_by(
  +    models.Customer.id.asc()
  +).all()
-+
+ Ø = set()
+ authors.łukasz.say_thanks()
   
++
   def gen():
       yield from outside_of_generator
  +
@@ -235,4 +237,3 @@
  +
   last_call()
   # standalone comment at ENDMARKER
-
diff --git a/tests/expression.py b/tests/expression.py

index e0c819b68cb322a2c24b8f6007b0a0bf71e2360e..3cd0c61984b684b8bdf1ad33efd5516c62f36cf5 100644 (file)
--- a/tests/expression.py
+++ b/tests/expression.py
@@ -135,6 +135,8 @@ e = (1,).count(1)
  what_is_up_with_those_new_coord_names = (coord_names + set(vars_to_create)) + set(vars_to_remove)
  what_is_up_with_those_new_coord_names = (coord_names | set(vars_to_create)) - set(vars_to_remove)
  result = session.query(models.Customer.id).filter(models.Customer.account_id == account_id, models.Customer.email == email_address).order_by(models.Customer.id.asc(),).all()
+Ø = set()
+authors.łukasz.say_thanks()
  
  def gen():
      yield from outside_of_generator
@@ -340,6 +342,8 @@ result = session.query(models.Customer.id).filter(
  ).order_by(
      models.Customer.id.asc()
  ).all()
+Ø = set()
+authors.łukasz.say_thanks()
  
  
  def gen():
author	Łukasz Langa <lukasz@langa.pl>
	Thu, 5 Apr 2018 04:38:25 +0000 (21:38 -0700)
committer	Łukasz Langa <lukasz@langa.pl>
	Thu, 5 Apr 2018 09:29:01 +0000 (02:29 -0700)
blib2to3/pgen2/tokenize.py		patch | blob | history
tests/expression.diff		patch | blob | history
tests/expression.py		patch | blob | history