
Testing Python Source Code Tokenization in Black Formatter

This test suite validates the blib2to3 tokenizer used by the Black code formatter, checking that token types, strings, and start/end positions are generated correctly when parsing Python source code.

Test Coverage Overview

The test suite provides comprehensive coverage of tokenizer functionality, focusing on basic Python syntax elements and f-strings.

Key areas tested include:
  • Basic token generation for numbers, strings, and identifiers
  • F-string parsing with embedded expressions and format specifiers
  • Multi-line string handling
  • Token start and end position (line, column) tracking (illustrated in the sketch after this list)
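
As a concrete illustration (taken from test_simple in the file below), tokenizing the one-character input "1" is expected to produce a NUMBER token followed by an ENDMARKER, each carrying (line, column) start and end coordinates with 1-based lines and 0-based columns:

    # Expected token stream for the input "1", mirroring test_simple below.
    expected = [
        Token("NUMBER", "1", (1, 0), (1, 1)),
        Token("ENDMARKER", "", (2, 0), (2, 0)),
    ]
    assert get_tokens("1") == expected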

Implementation Analysis

The tests use a dataclass-based Token structure for consistent verification of tokenizer output. Two helper functions streamline the work: get_tokens collects the tokens produced for a piece of source text, and assert_tokenizes compares them against an expected list, covering token type, string content, and start/end coordinates.

Notable patterns include:
  • Token comparison using dataclass equality (see the sketch after this list)
  • Structured test case organization
  • Comprehensive f-string parsing validation
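
Because Token is a @dataclass, Python generates field-by-field equality for it, which is what lets assert_tokenizes compare whole token lists with a single ==. A minimal sketch of that behaviour (Tok here is a hypothetical stand-in, not the suite's actual class):

    from dataclasses import dataclass

    @dataclass
    class Tok:  # hypothetical stand-in for the suite's Token dataclass
        type: str
        string: str
        start: tuple[int, int]
        end: tuple[int, int]

    # @dataclass generates __eq__, so instances with identical fields compare
    # equal, and lists of them can be compared with a plain ==.
    assert Tok("NUMBER", "1", (1, 0), (1, 1)) == Tok("NUMBER", "1", (1, 0), (1, 1))
    assert [Tok("NAME", "a", (1, 0), (1, 1))] == [Tok("NAME", "a", (1, 0), (1, 1))]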

Technical Details

Testing tools and components:
  • blib2to3.pgen2 tokenizer library
  • Python’s built-in io, sys, textwrap, and dataclasses modules
  • Custom Token dataclass implementation
  • Black formatter integration for test case generation
  • StringIO for input stream simulation (illustrated after this list)
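
The StringIO entry refers to how get_tokens feeds source text to the tokenizer: io.StringIO(text).readline supplies the readline callable the tokenizer pulls lines from, so no file on disk is needed. The same pattern is sketched below with the standard-library tokenizer purely as an illustration; blib2to3's tokenize.tokenize is callback-based rather than generator-based:

    import io
    import tokenize  # stdlib tokenizer, used here only to illustrate the pattern

    source = "x = 1\n"
    readline = io.StringIO(source).readline  # line-by-line input without a file
    for tok in tokenize.generate_tokens(readline):
        print(tok.type, tok.string, tok.start, tok.end)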

Best Practices Demonstrated

The test suite exemplifies solid testing practices through modular test organization and coverage of tricky f-string cases such as format specifiers and escaped newlines.

Notable practices include:
  • Helper functions for test case setup and verification
  • Explicit token position tracking
  • Comprehensive f-string parsing scenarios
  • Command-line test case generation support (usage noted after this list)
  • Integration with Black formatting standards
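
The last item refers to the __main__ block at the bottom of the file: piping Python source into the module tokenizes it and prints a Black-formatted assert_tokenizes(...) call ready to paste into a new test. Per the comment in the file itself, a typical invocation is:

    echo some code | python tests/test_tokenize.py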

psf/black

tests/test_tokenize.py

"""Tests for the blib2to3 tokenizer."""

import io
import sys
import textwrap
from dataclasses import dataclass

import black
from blib2to3.pgen2 import token, tokenize


@dataclass
class Token:
    type: str
    string: str
    start: tokenize.Coord
    end: tokenize.Coord


def get_tokens(text: str) -> list[Token]:
    """Return the tokens produced by the tokenizer."""
    readline = io.StringIO(text).readline
    tokens: list[Token] = []

    def tokeneater(
        type: int, string: str, start: tokenize.Coord, end: tokenize.Coord, line: str
    ) -> None:
        tokens.append(Token(token.tok_name[type], string, start, end))

    tokenize.tokenize(readline, tokeneater)
    return tokens


def assert_tokenizes(text: str, tokens: list[Token]) -> None:
    """Assert that the tokenizer produces the expected tokens."""
    actual_tokens = get_tokens(text)
    assert actual_tokens == tokens


def test_simple() -> None:
    assert_tokenizes(
        "1",
        [Token("NUMBER", "1", (1, 0), (1, 1)), Token("ENDMARKER", "", (2, 0), (2, 0))],
    )
    assert_tokenizes(
        "'a'",
        [
            Token("STRING", "'a'", (1, 0), (1, 3)),
            Token("ENDMARKER", "", (2, 0), (2, 0)),
        ],
    )
    assert_tokenizes(
        "a",
        [Token("NAME", "a", (1, 0), (1, 1)), Token("ENDMARKER", "", (2, 0), (2, 0))],
    )


def test_fstring() -> None:
    assert_tokenizes(
        'f"x"',
        [
            Token("FSTRING_START", 'f"', (1, 0), (1, 2)),
            Token("FSTRING_MIDDLE", "x", (1, 2), (1, 3)),
            Token("FSTRING_END", '"', (1, 3), (1, 4)),
            Token("ENDMARKER", "", (2, 0), (2, 0)),
        ],
    )
    assert_tokenizes(
        'f"{x}"',
        [
            Token("FSTRING_START", 'f"', (1, 0), (1, 2)),
            Token("FSTRING_MIDDLE", "", (1, 2), (1, 2)),
            Token("LBRACE", "{", (1, 2), (1, 3)),
            Token("NAME", "x", (1, 3), (1, 4)),
            Token("RBRACE", "}", (1, 4), (1, 5)),
            Token("FSTRING_MIDDLE", "", (1, 5), (1, 5)),
            Token("FSTRING_END", '"', (1, 5), (1, 6)),
            Token("ENDMARKER", "", (2, 0), (2, 0)),
        ],
    )
    assert_tokenizes(
        'f"{x:y}"\n',
        [
            Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
            Token(type="FSTRING_MIDDLE", string="", start=(1, 2), end=(1, 2)),
            Token(type="LBRACE", string="{", start=(1, 2), end=(1, 3)),
            Token(type="NAME", string="x", start=(1, 3), end=(1, 4)),
            Token(type="OP", string=":", start=(1, 4), end=(1, 5)),
            Token(type="FSTRING_MIDDLE", string="y", start=(1, 5), end=(1, 6)),
            Token(type="RBRACE", string="}", start=(1, 6), end=(1, 7)),
            Token(type="FSTRING_MIDDLE", string="", start=(1, 7), end=(1, 7)),
            Token(type="FSTRING_END", string='"', start=(1, 7), end=(1, 8)),
            Token(type="NEWLINE", string="\n", start=(1, 8), end=(1, 9)),
            Token(type="ENDMARKER", string="", start=(2, 0), end=(2, 0)),
        ],
    )
    assert_tokenizes(
        'f"x\\\n{a}"\n',
        [
            Token(type="FSTRING_START", string='f"', start=(1, 0), end=(1, 2)),
            Token(type="FSTRING_MIDDLE", string="x\\\n", start=(1, 2), end=(2, 0)),
            Token(type="LBRACE", string="{", start=(2, 0), end=(2, 1)),
            Token(type="NAME", string="a", start=(2, 1), end=(2, 2)),
            Token(type="RBRACE", string="}", start=(2, 2), end=(2, 3)),
            Token(type="FSTRING_MIDDLE", string="", start=(2, 3), end=(2, 3)),
            Token(type="FSTRING_END", string='"', start=(2, 3), end=(2, 4)),
            Token(type="NEWLINE", string="\n", start=(2, 4), end=(2, 5)),
            Token(type="ENDMARKER", string="", start=(3, 0), end=(3, 0)),
        ],
    )


# Run "echo some code | python tests/test_tokenize.py" to generate test cases.
if __name__ == "__main__":
    code = sys.stdin.read()
    tokens = get_tokens(code)
    text = f"assert_tokenizes({code!r}, {tokens!r})"
    text = black.format_str(text, mode=black.Mode())
    print(textwrap.indent(text, "    "))