# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt

"""Better tokenizing for coverage.py."""

from __future__ import annotations

import ast
import io
import keyword
import re
import sys
import token
import tokenize

from typing import Iterable, List, Optional, Set, Tuple

from coverage import env
from coverage.types import TLineNo, TSourceTokenLines


TokenInfos = Iterable[tokenize.TokenInfo]


def _phys_tokens(toks: TokenInfos) -> TokenInfos:
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line: Optional[str] = None
    last_lineno = -1
    last_ttext: str = ""
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttext.endswith("\\"):
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split("\n", 1)[0][-1] == "\\":
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield tokenize.TokenInfo(
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
        last_line = ltext
        if ttype not in (tokenize.NEWLINE, tokenize.NL):
            last_ttext = ttext
        yield tokenize.TokenInfo(ttype, ttext, (slineno, scol), (elineno, ecol), ltext)
        last_lineno = elineno


class MatchCaseFinder(ast.NodeVisitor):
    """Helper for finding match/case lines."""
    def __init__(self, source: str) -> None:
        # This will be the set of line numbers that start match or case statements.
        self.match_case_lines: Set[TLineNo] = set()
        self.visit(ast.parse(source))

    if sys.version_info >= (3, 10):
        def visit_Match(self, node: ast.Match) -> None:
            """Invoked by ast.NodeVisitor.visit"""
            self.match_case_lines.add(node.lineno)
            for case in node.cases:
                self.match_case_lines.add(case.pattern.lineno)
            self.generic_visit(node)


def source_token_lines(source: str) -> TSourceTokenLines:
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing white space is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = {token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL}
    line: List[Tuple[str, str]] = []
    col = 0

    source = source.expandtabs(8).replace("\r\n", "\n")
    tokgen = generate_tokens(source)

    if env.PYBEHAVIOR.soft_keywords:
        match_case_lines = MatchCaseFinder(source).match_case_lines

    for ttype, ttext, (sline, scol), (_, ecol), _ in _phys_tokens(tokgen):
        mark_start = True
        for part in re.split("(\n)", ttext):
            if part == "\n":
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == "":
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, "xx").lower()[:3]
                if ttype == token.NAME:
                    if keyword.iskeyword(ttext):
                        # Hard keywords are always keywords.
                        tok_class = "key"
                    elif sys.version_info >= (3, 10):  # PYVERSIONS
                        # Need the version_info check to keep mypy from borking
                        # on issoftkeyword here.
                        if env.PYBEHAVIOR.soft_keywords and keyword.issoftkeyword(ttext):
                            # Soft keywords appear at the start of the line,
                            # on lines that start match or case statements.
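                            # (Illustrative note: "match" is only classed as
                            # "key" when it starts a line that MatchCaseFinder
                            # recorded; a variable named "match" used anywhere
                            # else keeps the "nam" class.)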
                            if len(line) == 0:
                                is_start_of_line = True
                            elif (len(line) == 1) and line[0][0] == "ws":
                                is_start_of_line = True
                            else:
                                is_start_of_line = False
                            if is_start_of_line and sline in match_case_lines:
                                tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line


class CachedTokenizer:
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it. Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self) -> None:
        self.last_text: Optional[str] = None
        self.last_tokens: List[tokenize.TokenInfo] = []

    def generate_tokens(self, text: str) -> TokenInfos:
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = io.StringIO(text).readline
            try:
                self.last_tokens = list(tokenize.generate_tokens(readline))
            except:
                self.last_text = None
                raise
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens
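# For example (illustrative only), calling the cached replacement twice in a
# row with the same source text re-uses the same token list instead of
# re-tokenizing (`src` here is any source string)::
#
#     toks_a = generate_tokens(src)
#     toks_b = generate_tokens(src)
#     assert toks_a is toks_b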


def source_encoding(source: bytes) -> str:
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iter(source.splitlines(True)).__next__
    return tokenize.detect_encoding(readline)[0]
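

if __name__ == "__main__":  # pragma: no cover
    # Minimal illustrative check, not part of coverage.py's public interface:
    # tokenize this file and print the classified tokens for each source line.
    with open(__file__, encoding="utf-8") as demo_file:
        for demo_line in source_token_lines(demo_file.read()):
            print(demo_line)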