Do not load incompatible cache (#875) (#1034)

[etc/vim.git] / blib2to3 / pgen2 / tokenize.py
diff --git a/blib2to3/pgen2/tokenize.py b/blib2to3/pgen2/tokenize.py

index 6b8a5cb2ef54fb0bdbd98f2d2e20ac73f7ae3c3c..8c1c4cfac680a2f4815d880c989d6e3b3d115bb0 100644 (file)
--- a/blib2to3/pgen2/tokenize.py
+++ b/blib2to3/pgen2/tokenize.py
@@ -25,17 +25,20 @@ are the same, except instead of generating tokens, tokeneater is a callback
  function to which the 5 fields described above are passed as 5 arguments,
  each time a new token is found."""
  
  function to which the 5 fields described above are passed as 5 arguments,
  each time a new token is found."""
  
-__author__ = 'Ka-Ping Yee <ping@lfw.org>'
-__credits__ = \
-    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
+__author__ = "Ka-Ping Yee <ping@lfw.org>"
+__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
  
  
-import string, re, unicodedata
+import regex as re
  from codecs import BOM_UTF8, lookup
  from blib2to3.pgen2.token import *
  
  from . import token
  from codecs import BOM_UTF8, lookup
  from blib2to3.pgen2.token import *
  
  from . import token
-__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
-           "generate_tokens", "untokenize"]
+
+__all__ = [x for x in dir(token) if x[0] != "_"] + [
+    "tokenize",
+    "generate_tokens",
+    "untokenize",
+]
  del token
  
  try:
  del token
  
  try:
@@ -45,25 +48,40 @@ except NameError:
      # valid Python 3 code.
      bytes = str
  
      # valid Python 3 code.
      bytes = str
  
-def group(*choices): return '(' + '|'.join(choices) + ')'
-def any(*choices): return group(*choices) + '*'
-def maybe(*choices): return group(*choices) + '?'
  
  
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'[^\d\W]\w*'
+def group(*choices):
+    return "(" + "|".join(choices) + ")"
+
+
+def any(*choices):
+    return group(*choices) + "*"
+
+
+def maybe(*choices):
+    return group(*choices) + "?"
+
+
+def _combinations(*l):
+    return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
+
+
+Whitespace = r"[ \f\t]*"
+Comment = r"#[^\r\n]*"
+Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
+Name = r"\w+"  # this is invalid but it's fine because Name comes after Number in all groups
  
  
-Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
-Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
-Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
-Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
+Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
+Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
+Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
+Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
  Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
  Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?\d+(?:_\d+)*'
-Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
-Expfloat = r'\d+(?:_\d+)*' + Exponent
+Exponent = r"[eE][-+]?\d+(?:_\d+)*"
+Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
+    Exponent
+)
+Expfloat = r"\d+(?:_\d+)*" + Exponent
  Floatnumber = group(Pointfloat, Expfloat)
  Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
+Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
  Number = group(Imagnumber, Floatnumber, Intnumber)
  
  # Tail end of ' string.
  Number = group(Imagnumber, Floatnumber, Intnumber)
  
  # Tail end of ' string.
@@ -74,104 +92,94 @@ Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
  Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
  # Tail end of """ string.
  Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
  Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
  # Tail end of """ string.
  Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
+_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
  Triple = group(_litprefix + "'''", _litprefix + '"""')
  # Single-line ' or " string.
  Triple = group(_litprefix + "'''", _litprefix + '"""')
  # Single-line ' or " string.
-String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
-               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+String = group(
+    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
+)
  
  # Because of leftmost-then-longest match semantics, be sure to put the
  # longest operators first (e.g., if = came before ==, == would get
  # recognized as two instances of =).
  
  # Because of leftmost-then-longest match semantics, be sure to put the
  # longest operators first (e.g., if = came before ==, == would get
  # recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
-                 r"//=?", r"->",
-                 r"[+\-*/%&@|^=<>]=?",
-                 r"~")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'[:;.,`@]')
+Operator = group(
+    r"\*\*=?",
+    r">>=?",
+    r"<<=?",
+    r"<>",
+    r"!=",
+    r"//=?",
+    r"->",
+    r"[+\-*/%&@|^=<>:]=?",
+    r"~",
+)
+
+Bracket = "[][(){}]"
+Special = group(r"\r?\n", r"[:;.,`@]")
  Funny = group(Operator, Bracket, Special)
  
  PlainToken = group(Number, Funny, String, Name)
  Token = Ignore + PlainToken
  
  # First (or only) line of ' or " string.
  Funny = group(Operator, Bracket, Special)
  
  PlainToken = group(Number, Funny, String, Name)
  Token = Ignore + PlainToken
  
  # First (or only) line of ' or " string.
-ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
-                group("'", r'\\\r?\n'),
-                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
-                group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n', Comment, Triple)
+ContStr = group(
+    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
+    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
+)
+PseudoExtras = group(r"\\\r?\n", Comment, Triple)
  PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
  
  tokenprog = re.compile(Token, re.UNICODE)
  pseudoprog = re.compile(PseudoToken, re.UNICODE)
  single3prog = re.compile(Single3)
  double3prog = re.compile(Double3)
  PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
  
  tokenprog = re.compile(Token, re.UNICODE)
  pseudoprog = re.compile(PseudoToken, re.UNICODE)
  single3prog = re.compile(Single3)
  double3prog = re.compile(Double3)
-endprogs = {"'": re.compile(Single), '"': re.compile(Double),
-            "'''": single3prog, '"""': double3prog,
-            "r'''": single3prog, 'r"""': double3prog,
-            "u'''": single3prog, 'u"""': double3prog,
-            "b'''": single3prog, 'b"""': double3prog,
-            "f'''": single3prog, 'f"""': double3prog,
-            "ur'''": single3prog, 'ur"""': double3prog,
-            "br'''": single3prog, 'br"""': double3prog,
-            "rb'''": single3prog, 'rb"""': double3prog,
-            "R'''": single3prog, 'R"""': double3prog,
-            "U'''": single3prog, 'U"""': double3prog,
-            "B'''": single3prog, 'B"""': double3prog,
-            "F'''": single3prog, 'F"""': double3prog,
-            "uR'''": single3prog, 'uR"""': double3prog,
-            "Ur'''": single3prog, 'Ur"""': double3prog,
-            "UR'''": single3prog, 'UR"""': double3prog,
-            "bR'''": single3prog, 'bR"""': double3prog,
-            "Br'''": single3prog, 'Br"""': double3prog,
-            "BR'''": single3prog, 'BR"""': double3prog,
-            "rB'''": single3prog, 'rB"""': double3prog,
-            "Rb'''": single3prog, 'Rb"""': double3prog,
-            "RB'''": single3prog, 'RB"""': double3prog,
-            'r': None, 'R': None,
-            'u': None, 'U': None,
-            'f': None, 'F': None,
-            'b': None, 'B': None}
-
-triple_quoted = {}
-for t in ("'''", '"""',
-          "r'''", 'r"""', "R'''", 'R"""',
-          "u'''", 'u"""', "U'''", 'U"""',
-          "b'''", 'b"""', "B'''", 'B"""',
-          "f'''", 'f"""', "F'''", 'F"""',
-          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
-          "uR'''", 'uR"""', "UR'''", 'UR"""',
-          "br'''", 'br"""', "Br'''", 'Br"""',
-          "bR'''", 'bR"""', "BR'''", 'BR"""',
-          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
-          "rB'''", 'rB"""', "RB'''", 'RB"""',):
-    triple_quoted[t] = t
-single_quoted = {}
-for t in ("'", '"',
-          "r'", 'r"', "R'", 'R"',
-          "u'", 'u"', "U'", 'U"',
-          "b'", 'b"', "B'", 'B"',
-          "f'", 'f"', "F'", 'F"',
-          "ur'", 'ur"', "Ur'", 'Ur"',
-          "uR'", 'uR"', "UR'", 'UR"',
-          "br'", 'br"', "Br'", 'Br"',
-          "bR'", 'bR"', "BR'", 'BR"',
-          "rb'", 'rb"', "Rb'", 'Rb"',
-          "rB'", 'rB"', "RB'", 'RB"',):
-    single_quoted[t] = t
+
+_strprefixes = (
+    _combinations("r", "R", "f", "F")
+    | _combinations("r", "R", "b", "B")
+    | {"u", "U", "ur", "uR", "Ur", "UR"}
+)
+
+endprogs = {
+    "'": re.compile(Single),
+    '"': re.compile(Double),
+    "'''": single3prog,
+    '"""': double3prog,
+    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
+    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
+    **{prefix: None for prefix in _strprefixes},
+}
+
+triple_quoted = (
+    {"'''", '"""'}
+    | {f"{prefix}'''" for prefix in _strprefixes}
+    | {f'{prefix}"""' for prefix in _strprefixes}
+)
+single_quoted = (
+    {"'", '"'}
+    | {f"{prefix}'" for prefix in _strprefixes}
+    | {f'{prefix}"' for prefix in _strprefixes}
+)
  
  tabsize = 8
  
  
  tabsize = 8
  
-class TokenError(Exception): pass
  
  
-class StopTokenizing(Exception): pass
+class TokenError(Exception):
+    pass
+
+
+class StopTokenizing(Exception):
+    pass
+
  
  
-def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
+def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line):  # for testing
      (srow, scol) = xxx_todo_changeme
      (erow, ecol) = xxx_todo_changeme1
      (srow, scol) = xxx_todo_changeme
      (erow, ecol) = xxx_todo_changeme1
-    print("%d,%d-%d,%d:\t%s\t%s" % \
-        (srow, scol, erow, ecol, tok_name[type], repr(token)))
+    print(
+        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
+    )
+
  
  def tokenize(readline, tokeneater=printtoken):
      """
  
  def tokenize(readline, tokeneater=printtoken):
      """
@@ -191,13 +199,14 @@ def tokenize(readline, tokeneater=printtoken):
      except StopTokenizing:
          pass
  
      except StopTokenizing:
          pass
  
+
  # backwards compatible interface
  def tokenize_loop(readline, tokeneater):
      for token_info in generate_tokens(readline):
          tokeneater(*token_info)
  
  # backwards compatible interface
  def tokenize_loop(readline, tokeneater):
      for token_info in generate_tokens(readline):
          tokeneater(*token_info)
  
-class Untokenizer:
  
  
+class Untokenizer:
      def __init__(self):
          self.tokens = []
          self.prev_row = 1
      def __init__(self):
          self.tokens = []
          self.prev_row = 1
@@ -230,14 +239,14 @@ class Untokenizer:
          toks_append = self.tokens.append
          toknum, tokval = token
          if toknum in (NAME, NUMBER):
          toks_append = self.tokens.append
          toknum, tokval = token
          if toknum in (NAME, NUMBER):
-            tokval += ' '
+            tokval += " "
          if toknum in (NEWLINE, NL):
              startline = True
          for tok in iterable:
              toknum, tokval = tok[:2]
  
              if toknum in (NAME, NUMBER, ASYNC, AWAIT):
          if toknum in (NEWLINE, NL):
              startline = True
          for tok in iterable:
              toknum, tokval = tok[:2]
  
              if toknum in (NAME, NUMBER, ASYNC, AWAIT):
-                tokval += ' '
+                tokval += " "
  
              if toknum == INDENT:
                  indents.append(tokval)
  
              if toknum == INDENT:
                  indents.append(tokval)
@@ -252,8 +261,10 @@ class Untokenizer:
                  startline = False
              toks_append(tokval)
  
                  startline = False
              toks_append(tokval)
  
-cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
-blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
+
+cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
+blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
+
  
  def _get_normal_name(orig_enc):
      """Imitates get_normal_name in tokenizer.c."""
  
  def _get_normal_name(orig_enc):
      """Imitates get_normal_name in tokenizer.c."""
@@ -261,11 +272,13 @@ def _get_normal_name(orig_enc):
      enc = orig_enc[:12].lower().replace("_", "-")
      if enc == "utf-8" or enc.startswith("utf-8-"):
          return "utf-8"
      enc = orig_enc[:12].lower().replace("_", "-")
      if enc == "utf-8" or enc.startswith("utf-8-"):
          return "utf-8"
-    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
-       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
+        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
+    ):
          return "iso-8859-1"
      return orig_enc
  
          return "iso-8859-1"
      return orig_enc
  
+
  def detect_encoding(readline):
      """
      The detect_encoding() function is used to detect the encoding that should
  def detect_encoding(readline):
      """
      The detect_encoding() function is used to detect the encoding that should
@@ -286,7 +299,8 @@ def detect_encoding(readline):
      """
      bom_found = False
      encoding = None
      """
      bom_found = False
      encoding = None
-    default = 'utf-8'
+    default = "utf-8"
+
      def read_or_stop():
          try:
              return readline()
      def read_or_stop():
          try:
              return readline()
@@ -295,7 +309,7 @@ def detect_encoding(readline):
  
      def find_cookie(line):
          try:
  
      def find_cookie(line):
          try:
-            line_string = line.decode('ascii')
+            line_string = line.decode("ascii")
          except UnicodeDecodeError:
              return None
          match = cookie_re.match(line_string)
          except UnicodeDecodeError:
              return None
          match = cookie_re.match(line_string)
@@ -309,17 +323,17 @@ def detect_encoding(readline):
              raise SyntaxError("unknown encoding: " + encoding)
  
          if bom_found:
              raise SyntaxError("unknown encoding: " + encoding)
  
          if bom_found:
-            if codec.name != 'utf-8':
+            if codec.name != "utf-8":
                  # This behaviour mimics the Python interpreter
                  # This behaviour mimics the Python interpreter
-                raise SyntaxError('encoding problem: utf-8')
-            encoding += '-sig'
+                raise SyntaxError("encoding problem: utf-8")
+            encoding += "-sig"
          return encoding
  
      first = read_or_stop()
      if first.startswith(BOM_UTF8):
          bom_found = True
          first = first[3:]
          return encoding
  
      first = read_or_stop()
      if first.startswith(BOM_UTF8):
          bom_found = True
          first = first[3:]
-        default = 'utf-8-sig'
+        default = "utf-8-sig"
      if not first:
          return default, []
  
      if not first:
          return default, []
  
@@ -339,6 +353,7 @@ def detect_encoding(readline):
  
      return default, [first, second]
  
  
      return default, [first, second]
  
+
  def untokenize(iterable):
      """Transform tokens back into Python source code.
  
  def untokenize(iterable):
      """Transform tokens back into Python source code.
  
@@ -360,9 +375,8 @@ def untokenize(iterable):
      ut = Untokenizer()
      return ut.untokenize(iterable)
  
      ut = Untokenizer()
      return ut.untokenize(iterable)
  
-InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
  
  
-def generate_tokens(readline):
+def generate_tokens(readline, grammar=None):
      """
      The generate_tokens() generator requires one argument, readline, which
      must be a callable object which provides the same interface as the
      """
      The generate_tokens() generator requires one argument, readline, which
      must be a callable object which provides the same interface as the
@@ -379,39 +393,52 @@ def generate_tokens(readline):
      logical line; continuation lines are included.
      """
      lnum = parenlev = continued = 0
      logical line; continuation lines are included.
      """
      lnum = parenlev = continued = 0
-    namechars, numchars = string.ascii_letters + '_', '0123456789'
-    contstr, needcont = '', 0
+    numchars = "0123456789"
+    contstr, needcont = "", 0
      contline = None
      indents = [0]
  
      contline = None
      indents = [0]
  
+    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
+    # `await` as keywords.
+    async_keywords = False if grammar is None else grammar.async_keywords
      # 'stashed' and 'async_*' are used for async/await parsing
      stashed = None
      async_def = False
      async_def_indent = 0
      async_def_nl = False
  
      # 'stashed' and 'async_*' are used for async/await parsing
      stashed = None
      async_def = False
      async_def_indent = 0
      async_def_nl = False
  
-    while 1:                                   # loop over lines in stream
+    while 1:  # loop over lines in stream
          try:
              line = readline()
          except StopIteration:
          try:
              line = readline()
          except StopIteration:
-            line = ''
+            line = ""
          lnum = lnum + 1
          pos, max = 0, len(line)
  
          lnum = lnum + 1
          pos, max = 0, len(line)
  
-        if contstr:                            # continued string
+        if contstr:  # continued string
              if not line:
                  raise TokenError("EOF in multi-line string", strstart)
              endmatch = endprog.match(line)
              if endmatch:
                  pos = end = endmatch.end(0)
              if not line:
                  raise TokenError("EOF in multi-line string", strstart)
              endmatch = endprog.match(line)
              if endmatch:
                  pos = end = endmatch.end(0)
-                yield (STRING, contstr + line[:end],
-                       strstart, (lnum, end), contline + line)
-                contstr, needcont = '', 0
+                yield (
+                    STRING,
+                    contstr + line[:end],
+                    strstart,
+                    (lnum, end),
+                    contline + line,
+                )
+                contstr, needcont = "", 0
                  contline = None
                  contline = None
-            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield (ERRORTOKEN, contstr + line,
-                           strstart, (lnum, len(line)), contline)
-                contstr = ''
+            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
+                yield (
+                    ERRORTOKEN,
+                    contstr + line,
+                    strstart,
+                    (lnum, len(line)),
+                    contline,
+                )
+                contstr = ""
                  contline = None
                  continue
              else:
                  contline = None
                  continue
              else:
@@ -420,42 +447,53 @@ def generate_tokens(readline):
                  continue
  
          elif parenlev == 0 and not continued:  # new statement
                  continue
  
          elif parenlev == 0 and not continued:  # new statement
-            if not line: break
+            if not line:
+                break
              column = 0
              column = 0
-            while pos < max:                   # measure leading whitespace
-                if line[pos] == ' ': column = column + 1
-                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
-                elif line[pos] == '\f': column = 0
-                else: break
+            while pos < max:  # measure leading whitespace
+                if line[pos] == " ":
+                    column = column + 1
+                elif line[pos] == "\t":
+                    column = (column // tabsize + 1) * tabsize
+                elif line[pos] == "\f":
+                    column = 0
+                else:
+                    break
                  pos = pos + 1
                  pos = pos + 1
-            if pos == max: break
+            if pos == max:
+                break
  
              if stashed:
                  yield stashed
                  stashed = None
  
  
              if stashed:
                  yield stashed
                  stashed = None
  
-            if line[pos] in '\r\n':            # skip blank lines
+            if line[pos] in "\r\n":  # skip blank lines
                  yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                  continue
  
                  yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                  continue
  
-            if column > indents[-1]:           # count indents
-                indents.append(column)
-                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-
-            if line[pos] == '#':               # skip comments
-                comment_token = line[pos:].rstrip('\r\n')
+            if line[pos] == "#":  # skip comments
+                comment_token = line[pos:].rstrip("\r\n")
                  nl_pos = pos + len(comment_token)
                  nl_pos = pos + len(comment_token)
-                yield (COMMENT, comment_token,
-                        (lnum, pos), (lnum, pos + len(comment_token)), line)
-                yield (NL, line[nl_pos:],
-                        (lnum, nl_pos), (lnum, len(line)), line)
+                yield (
+                    COMMENT,
+                    comment_token,
+                    (lnum, pos),
+                    (lnum, pos + len(comment_token)),
+                    line,
+                )
+                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                  continue
  
                  continue
  
-            while column < indents[-1]:        # count dedents
+            if column > indents[-1]:  # count indents
+                indents.append(column)
+                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+
+            while column < indents[-1]:  # count dedents
                  if column not in indents:
                      raise IndentationError(
                          "unindent does not match any outer indentation level",
                  if column not in indents:
                      raise IndentationError(
                          "unindent does not match any outer indentation level",
-                        ("<tokenize>", lnum, pos, line))
+                        ("<tokenize>", lnum, pos, line),
+                    )
                  indents = indents[:-1]
  
                  if async_def and async_def_indent >= indents[-1]:
                  indents = indents[:-1]
  
                  if async_def and async_def_indent >= indents[-1]:
@@ -463,31 +501,30 @@ def generate_tokens(readline):
                      async_def_nl = False
                      async_def_indent = 0
  
                      async_def_nl = False
                      async_def_indent = 0
  
-                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
+                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)
  
              if async_def and async_def_nl and async_def_indent >= indents[-1]:
                  async_def = False
                  async_def_nl = False
                  async_def_indent = 0
  
  
              if async_def and async_def_nl and async_def_indent >= indents[-1]:
                  async_def = False
                  async_def_nl = False
                  async_def_indent = 0
  
-        else:                                  # continued statement
+        else:  # continued statement
              if not line:
                  raise TokenError("EOF in multi-line statement", (lnum, 0))
              continued = 0
  
          while pos < max:
              pseudomatch = pseudoprog.match(line, pos)
              if not line:
                  raise TokenError("EOF in multi-line statement", (lnum, 0))
              continued = 0
  
          while pos < max:
              pseudomatch = pseudoprog.match(line, pos)
-            if not pseudomatch:
-                print('no pseudomatch')
-            if pseudomatch:                                # scan for tokens
+            if pseudomatch:  # scan for tokens
                  start, end = pseudomatch.span(1)
                  spos, epos, pos = (lnum, start), (lnum, end), end
                  token, initial = line[start:end], line[start]
  
                  start, end = pseudomatch.span(1)
                  spos, epos, pos = (lnum, start), (lnum, end), end
                  token, initial = line[start:end], line[start]
  
-                if initial in numchars or \
-                   (initial == '.' and token != '.'):      # ordinary number
+                if initial in numchars or (
+                    initial == "." and token != "."
+                ):  # ordinary number
                      yield (NUMBER, token, spos, epos, line)
                      yield (NUMBER, token, spos, epos, line)
-                elif initial in '\r\n':
+                elif initial in "\r\n":
                      newline = NEWLINE
                      if parenlev > 0:
                          newline = NL
                      newline = NEWLINE
                      if parenlev > 0:
                          newline = NL
@@ -498,7 +535,7 @@ def generate_tokens(readline):
                          stashed = None
                      yield (newline, token, spos, epos, line)
  
                          stashed = None
                      yield (newline, token, spos, epos, line)
  
-                elif initial == '#':
+                elif initial == "#":
                      assert not token.endswith("\n")
                      if stashed:
                          yield stashed
                      assert not token.endswith("\n")
                      if stashed:
                          yield stashed
@@ -507,7 +544,7 @@ def generate_tokens(readline):
                  elif token in triple_quoted:
                      endprog = endprogs[token]
                      endmatch = endprog.match(line, pos)
                  elif token in triple_quoted:
                      endprog = endprogs[token]
                      endmatch = endprog.match(line, pos)
-                    if endmatch:                           # all on one line
+                    if endmatch:  # all on one line
                          pos = endmatch.end(0)
                          token = line[start:pos]
                          if stashed:
                          pos = endmatch.end(0)
                          token = line[start:pos]
                          if stashed:
@@ -515,49 +552,61 @@ def generate_tokens(readline):
                              stashed = None
                          yield (STRING, token, spos, (lnum, pos), line)
                      else:
                              stashed = None
                          yield (STRING, token, spos, (lnum, pos), line)
                      else:
-                        strstart = (lnum, start)           # multiple lines
+                        strstart = (lnum, start)  # multiple lines
                          contstr = line[start:]
                          contline = line
                          break
                          contstr = line[start:]
                          contline = line
                          break
-                elif initial in single_quoted or \
-                    token[:2] in single_quoted or \
-                    token[:3] in single_quoted:
-                    if token[-1] == '\n':                  # continued string
+                elif (
+                    initial in single_quoted
+                    or token[:2] in single_quoted
+                    or token[:3] in single_quoted
+                ):
+                    if token[-1] == "\n":  # continued string
                          strstart = (lnum, start)
                          strstart = (lnum, start)
-                        endprog = (endprogs[initial] or endprogs[token[1]] or
-                                   endprogs[token[2]])
+                        endprog = (
+                            endprogs[initial]
+                            or endprogs[token[1]]
+                            or endprogs[token[2]]
+                        )
                          contstr, needcont = line[start:], 1
                          contline = line
                          break
                          contstr, needcont = line[start:], 1
                          contline = line
                          break
-                    else:                                  # ordinary string
+                    else:  # ordinary string
                          if stashed:
                              yield stashed
                              stashed = None
                          yield (STRING, token, spos, epos, line)
                          if stashed:
                              yield stashed
                              stashed = None
                          yield (STRING, token, spos, epos, line)
-                elif (initial in namechars or              # ordinary name
-                      unicodedata.category(initial) in InitialCategories):
-                    if token in ('async', 'await'):
-                        if async_def:
-                            yield (ASYNC if token == 'async' else AWAIT,
-                                   token, spos, epos, line)
+                elif initial.isidentifier():  # ordinary name
+                    if token in ("async", "await"):
+                        if async_keywords or async_def:
+                            yield (
+                                ASYNC if token == "async" else AWAIT,
+                                token,
+                                spos,
+                                epos,
+                                line,
+                            )
                              continue
  
                      tok = (NAME, token, spos, epos, line)
                              continue
  
                      tok = (NAME, token, spos, epos, line)
-                    if token == 'async' and not stashed:
+                    if token == "async" and not stashed:
                          stashed = tok
                          continue
  
                          stashed = tok
                          continue
  
-                    if token == 'def':
-                        if (stashed
-                                and stashed[0] == NAME
-                                and stashed[1] == 'async'):
+                    if token in ("def", "for"):
+                        if stashed and stashed[0] == NAME and stashed[1] == "async":
  
  
-                            async_def = True
-                            async_def_indent = indents[-1]
+                            if token == "def":
+                                async_def = True
+                                async_def_indent = indents[-1]
  
  
-                            yield (ASYNC, stashed[1],
-                                   stashed[2], stashed[3],
-                                   stashed[4])
+                            yield (
+                                ASYNC,
+                                stashed[1],
+                                stashed[2],
+                                stashed[3],
+                                stashed[4],
+                            )
                              stashed = None
  
                      if stashed:
                              stashed = None
  
                      if stashed:
@@ -565,7 +614,7 @@ def generate_tokens(readline):
                          stashed = None
  
                      yield tok
                          stashed = None
  
                      yield tok
-                elif initial == '\\':                      # continued stmt
+                elif initial == "\\":  # continued stmt
                      # This yield is new; needed for better idempotency:
                      if stashed:
                          yield stashed
                      # This yield is new; needed for better idempotency:
                      if stashed:
                          yield stashed
@@ -573,26 +622,31 @@ def generate_tokens(readline):
                      yield (NL, token, spos, (lnum, pos), line)
                      continued = 1
                  else:
                      yield (NL, token, spos, (lnum, pos), line)
                      continued = 1
                  else:
-                    if initial in '([{': parenlev = parenlev + 1
-                    elif initial in ')]}': parenlev = parenlev - 1
+                    if initial in "([{":
+                        parenlev = parenlev + 1
+                    elif initial in ")]}":
+                        parenlev = parenlev - 1
                      if stashed:
                          yield stashed
                          stashed = None
                      yield (OP, token, spos, epos, line)
              else:
                      if stashed:
                          yield stashed
                          stashed = None
                      yield (OP, token, spos, epos, line)
              else:
-                yield (ERRORTOKEN, line[pos],
-                           (lnum, pos), (lnum, pos+1), line)
+                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                  pos = pos + 1
  
      if stashed:
          yield stashed
          stashed = None
  
                  pos = pos + 1
  
      if stashed:
          yield stashed
          stashed = None
  
-    for indent in indents[1:]:                 # pop remaining indent levels
-        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+    for indent in indents[1:]:  # pop remaining indent levels
+        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
+    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
  
  
-if __name__ == '__main__':                     # testing
+
+if __name__ == "__main__":  # testing
      import sys
      import sys
-    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
-    else: tokenize(sys.stdin.readline)
+
+    if len(sys.argv) > 1:
+        tokenize(open(sys.argv[1]).readline)
+    else:
+        tokenize(sys.stdin.readline)