git.madduck.net Git - etc/vim.git/blobdiff - blib2to3/pgen2/tokenize.py
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.
+# mypy: allow-untyped-defs, allow-untyped-calls
+
"""Tokenization help for Python programs.
generate_tokens(readline) is a generator that breaks a stream of
"""Tokenization help for Python programs.
generate_tokens(readline) is a generator that breaks a stream of
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
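As the module docstring above describes, generate_tokens() yields 5-tuples of (token type, token string, start coordinate, end coordinate, physical line). A minimal usage sketch, assuming the blib2to3 package is importable (it ships inside black's source tree); the sample source string is purely illustrative:

import io

from blib2to3.pgen2.tokenize import generate_tokens

source = "x = 1 + 2\n"
for tok_type, tok_str, start, end, line in generate_tokens(io.StringIO(source).readline):
    # Each iteration yields the 5-tuple described in the docstring above.
    print(tok_type, repr(tok_str), start, end)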
+from typing import (
+ Callable,
+ Iterable,
+ Iterator,
+ List,
+ Optional,
+ Text,
+ Tuple,
+ Pattern,
+ Union,
+ cast,
+)
+from blib2to3.pgen2.token import *
+from blib2to3.pgen2.grammar import Grammar
+
__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
-try:
- bytes
-except NameError:
- # Support bytes type in Python <= 2.5, so 2to3 turns itself into
- # valid Python 3 code.
- bytes = str
-
def group(*choices):
return "(" + "|".join(choices) + ")"
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
-Name = r"\w+" # this is invalid but it's fine because Name comes after Number in all groups
+Name = ( # this is invalid but it's fine because Name comes after Number in all groups
+ r"\w+"
+)
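The comment above deserves a word of explanation: r"\w+" also matches pure digit runs, so Name on its own would be too permissive; it is only safe because every combined pattern lists Number before Name, and Python's re module tries alternation branches left to right. A small illustration with stand-in patterns (not part of the diff):

import re

number_demo = r"\d+"  # stand-in for the much longer Number pattern
name_demo = r"\w+"

# With Number listed first, a digit run is claimed by the Number branch ...
print(re.match(f"(?P<Number>{number_demo})|(?P<Name>{name_demo})", "123").lastgroup)  # 'Number'
# ... whereas Name listed first would swallow it.
print(re.match(f"(?P<Name>{name_demo})|(?P<Number>{number_demo})", "123").lastgroup)  # 'Name'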
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)
-PlainToken = group(Number, Funny, String, Name)
-Token = Ignore + PlainToken
-
# First (or only) line of ' or " string.
ContStr = group(
_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-tokenprog = re.compile(Token, re.UNICODE)
pseudoprog = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)
-def tokenize(readline, tokeneater=printtoken):
+Coord = Tuple[int, int]
+TokenEater = Callable[[int, Text, Coord, Coord, Text], None]
+
+
+def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
"""
The tokenize() function accepts two parameters: one representing the
input stream, and one providing an output mechanism for tokenize().
"""
The tokenize() function accepts two parameters: one representing the
input stream, and one providing an output mechanism for tokenize().
+GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
+TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
+
+
+
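The Coord and TokenEater aliases added above make the callback protocol explicit: a token eater is called with the token type, its text, start and end coordinates, and the physical line. A hedged sketch of such a callback (tok_name is assumed to be exported by blib2to3.pgen2.token, as in lib2to3; the tokenize() call matches the signature shown in this diff):

import io
from typing import Tuple

from blib2to3.pgen2.token import tok_name
from blib2to3.pgen2.tokenize import tokenize

Coord = Tuple[int, int]  # same shape as the Coord alias introduced above

def print_token(tok_type: int, tok_str: str, start: Coord, end: Coord, line: str) -> None:
    # The five parameters are exactly the five fields a TokenEater receives.
    print(tok_name[tok_type], repr(tok_str), start, end)

tokenize(io.StringIO("x = 1\n").readline, print_token)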
+ tokens: List[Text]
+ prev_row: int
+ prev_col: int
+
+ def __init__(self) -> None:
self.tokens = []
self.prev_row = 1
self.prev_col = 0
- def add_whitespace(self, start):
+ def add_whitespace(self, start: Coord) -> None:
row, col = start
assert row <= self.prev_row
col_offset = col - self.prev_col
if col_offset:
self.tokens.append(" " * col_offset)
- def untokenize(self, iterable):
+ def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
for t in iterable:
if len(t) == 2:
- self.compat(t, iterable)
+ self.compat(cast(Tuple[int, str], t), iterable)
- tok_type, token, start, end, line = t
+ tok_type, token, start, end, line = cast(
+ Tuple[int, Text, Coord, Coord, Text], t
+ )
self.add_whitespace(start)
self.tokens.append(token)
self.prev_row, self.prev_col = end
self.prev_col = 0
return "".join(self.tokens)
- def compat(self, token, iterable):
+ def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
startline = False
indents = []
toks_append = self.tokens.append
blank_re = re.compile(br"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
-def _get_normal_name(orig_enc):
+def _get_normal_name(orig_enc: str) -> str:
"""Imitates get_normal_name in tokenizer.c."""
# Only care about the first 12 characters.
enc = orig_enc[:12].lower().replace("_", "-")
"""Imitates get_normal_name in tokenizer.c."""
# Only care about the first 12 characters.
enc = orig_enc[:12].lower().replace("_", "-")
-def detect_encoding(readline):
+def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
"""
The detect_encoding() function is used to detect the encoding that should
be used to decode a Python source file. It requires one argument, readline,
"""
The detect_encoding() function is used to detect the encoding that should
be used to decode a Python source file. It requires one argument, readline,
encoding = None
default = "utf-8"
+ def read_or_stop() -> bytes:
try:
return readline()
except StopIteration:
return bytes()
+ def find_cookie(line: bytes) -> Optional[str]:
try:
line_string = line.decode("ascii")
except UnicodeDecodeError:
return default, [first, second]
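To make the contract in the docstring concrete, here is a hedged usage sketch: detect_encoding() takes a bytes-returning readline and returns the detected (normalized) encoding plus the lines it had to read to find it. The sample input and the expected outputs are illustrative, not part of the diff:

import io

from blib2to3.pgen2.tokenize import detect_encoding

raw = io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
encoding, consumed = detect_encoding(raw.readline)
print(encoding)  # expected: 'iso-8859-1' (the normalized name for latin-1)
print(consumed)  # expected: [b'# -*- coding: latin-1 -*-\n']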
-def untokenize(iterable):
+def untokenize(iterable: Iterable[TokenInfo]) -> Text:
"""Transform tokens back into Python source code.
Each element returned by the iterable must be a token sequence
"""Transform tokens back into Python source code.
Each element returned by the iterable must be a token sequence
Round-trip invariant for full input:
Untokenized source will match input source exactly
- Round-trip invariant for limited intput:
+ Round-trip invariant for limited input:
# Output text will tokenize the back to the input
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
newcode = untokenize(t1)
return ut.untokenize(iterable)
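The "limited input" round trip quoted in the docstring can be run directly; this sketch substitutes io.StringIO for the open file f used there, everything else follows the docstring:

import io

from blib2to3.pgen2.tokenize import generate_tokens, untokenize

source = "if x:\n    y = 1\n"
t1 = [tok[:2] for tok in generate_tokens(io.StringIO(source).readline)]
newcode = untokenize(t1)  # only (type, string) pairs, so compat mode is used
t2 = [tok[:2] for tok in generate_tokens(io.StringIO(newcode).readline)]
assert t1 == t2  # the round-trip invariant stated above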
-def generate_tokens(readline, grammar=None):
+def generate_tokens(
+ readline: Callable[[], Text], grammar: Optional[Grammar] = None
+) -> Iterator[GoodTokenInfo]:
"""
The generate_tokens() generator requires one argument, readline, which
must be a callable object which provides the same interface as the
"""
The generate_tokens() generator requires one argument, readline, which
must be a callable object which provides the same interface as the
lnum = parenlev = continued = 0
numchars = "0123456789"
contstr, needcont = "", 0
+ contline: Optional[str] = None
indents = [0]
# If we know we're parsing 3.7+, we can unconditionally parse `async` and
async_def_indent = 0
async_def_nl = False
+ strstart: Tuple[int, int]
+ endprog: Pattern[str]
+
while 1: # loop over lines in stream
try:
line = readline()
pos, max = 0, len(line)
if contstr: # continued string
+ assert contline is not None
if not line:
raise TokenError("EOF in multi-line string", strstart)
endmatch = endprog.match(line)
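Finally, regarding the readline argument that the generate_tokens() docstring describes: any zero-argument callable that returns one line of text per call satisfies it, for example the readline method of an open file. A hedged sketch (the filename is purely illustrative, and the new optional grammar parameter can simply be omitted):

from blib2to3.pgen2.tokenize import generate_tokens

with open("example.py") as f:  # hypothetical file
    for tok_type, tok_str, start, end, line in generate_tokens(f.readline):
        pass  # each iteration yields the GoodTokenInfo 5-tuple annotated above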