pydis-qualifier-25/tests/test_tokenizer.py

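"""Tests for the CSS selector tokenizer (tokenize_selector) and the TokenStream helper."""
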
import pytest
from hypothesis import given
from hypothesis import strategies as st
from src.tokenizer import Token, TokenStream, TokenType, tokenize_selector
from tests.hypot import css_class, css_class_multi, css_id, css_tag, selector
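# css_tag, css_id, css_class, css_class_multi, and selector are Hypothesis
# strategies that generate valid selector fragments (see tests.hypot).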


# region: Tokenization tests
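# Note: combinator and separator tokens carry normalized spacing in their values
# (e.g. "," -> ", ", ">" -> " > "), so joining token values reproduces a readable
# selector string (see test_str_and_raw_str below).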
@pytest.mark.parametrize(
("input_str", "expected_type", "expected_value"),
[
("div", TokenType.TAG, "div"),
("#id", TokenType.ID, "#id"),
(".class", TokenType.CLASS, ".class"),
("123", TokenType.NUMBER, "123"),
(",", TokenType.COMMA, ", "),
(">", TokenType.DIRECT_CHILD, " > "),
(" ", TokenType.DESCENDANT, " "),
(":hover", TokenType.PSEUDO_CLASS, ":hover"),
("(", TokenType.LPARENS, "("),
(")", TokenType.RPARENS, ")"),
("::after", TokenType.PSEUDO_ELEMENT, "::after"),
("+", TokenType.ADJACENT_SIBLING, " + "),
("~", TokenType.SUBSEQUENT_SIBLING, " ~ "),
("$", TokenType.UNKNOWN, "$"),
],
)
def test_individual_tokens(input_str: str, expected_type: TokenType, expected_value: str) -> None:
"""Test each token type in isolation."""
tokens = list(tokenize_selector(input_str))
assert len(tokens) == 1
token = tokens[0]
assert token.type == expected_type
assert token.value == expected_value


@pytest.mark.parametrize(
("selector", "expected"),
[
("div.class", [TokenType.TAG, TokenType.CLASS]),
("div > .class", [TokenType.TAG, TokenType.DIRECT_CHILD, TokenType.CLASS]),
("div, span", [TokenType.TAG, TokenType.COMMA, TokenType.TAG]),
("a:b::c", [TokenType.TAG, TokenType.PSEUDO_CLASS, TokenType.PSEUDO_ELEMENT]),
("a + b", [TokenType.TAG, TokenType.ADJACENT_SIBLING, TokenType.TAG]),
("a ~ b", [TokenType.TAG, TokenType.SUBSEQUENT_SIBLING, TokenType.TAG]),
("div (", [TokenType.TAG, TokenType.DESCENDANT, TokenType.LPARENS]),
],
)
def test_token_combinations(selector: str, expected: list[TokenType]) -> None:
"""Test combinations of tokens (not necessarily valid ones)."""
tokens = list(tokenize_selector(selector))
assert [t.type for t in tokens] == expected


def test_empty_string() -> None:
    """Test that tokenizing an empty string yields no tokens."""
tokens = list(tokenize_selector(""))
assert len(tokens) == 0


@given(css_tag)
def test_valid_tags(tag: str) -> None:
"""Test valid tag names."""
tokens = list(tokenize_selector(tag))
assert len(tokens) == 1
assert tokens[0].type == TokenType.TAG
assert tokens[0].value == tag


@given(css_id)
def test_valid_ids(id_val: str) -> None:
"""Test valid ID values."""
tokens = list(tokenize_selector(id_val))
assert len(tokens) == 1
assert tokens[0].type == TokenType.ID
assert tokens[0].value == id_val


@given(css_class)
def test_valid_class(val: str) -> None:
"""Test valid single class values."""
tokens = list(tokenize_selector(val))
assert len(tokens) == 1
assert tokens[0].type == TokenType.CLASS
assert tokens[0].value == val


@given(css_class_multi)
def test_valid_class_multi(val: str) -> None:
    """Test selectors made up of multiple chained class values."""
tokens = list(tokenize_selector(val))
assert all(tok.type == TokenType.CLASS for tok in tokens)


@given(selector)
def test_arbitrary_valid_selector(selector: str) -> None:
    """Ensure any valid selector tokenizes without producing UNKNOWN tokens."""
tokens = list(tokenize_selector(selector))
tok_types = {tok.type for tok in tokens}
assert TokenType.UNKNOWN not in tok_types


@given(st.text())
def test_no_crashes_on_arbitrary_text(s: str) -> None:
    """Ensure the tokenizer never raises on arbitrary input.

    Unrecognized characters should surface as UNKNOWN tokens rather than exceptions.
    """
_ = list(tokenize_selector(s))


# endregion


# region: TokenStream tests
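# TokenStream wraps a list of Token objects behind a cursor-style API:
# peek/pop, peek_trusted, has_more, consume_while, reset, and raw_str.
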
def test_peek_and_pop() -> None:
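    """peek() returns the current token without advancing; pop() returns it and advances."""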
tokens = [Token(TokenType.TAG, "div"), Token(TokenType.CLASS, ".main")]
stream = TokenStream(tokens)
# Initial peek
assert stream.peek() == tokens[0]
# Pop moves the stream
assert stream.pop() == tokens[0]
assert stream.peek() == tokens[1]


def test_peek_trusted() -> None:
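    """peek_trusted() returns a token while one is available and asserts otherwise."""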
stream = TokenStream([Token(TokenType.TAG, "div")])
tok = stream.peek_trusted()
assert tok.type == TokenType.TAG
_ = stream.pop()
with pytest.raises(AssertionError):
_ = stream.peek_trusted()


def test_has_more() -> None:
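    """has_more() reports whether unconsumed tokens remain."""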
stream = TokenStream([Token(TokenType.TAG, "div")])
assert stream.has_more()
_ = stream.pop()
assert not stream.has_more()


def test_pop_exhausted_raises() -> None:
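    """pop() on an exhausted stream raises StopIteration."""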
stream = TokenStream([Token(TokenType.TAG, "div")])
_ = stream.pop()
with pytest.raises(StopIteration):
_ = stream.pop()


def test_consume_while() -> None:
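    """consume_while() yields tokens while the predicate holds, stopping at the first failure."""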
tokens = [Token(TokenType.TAG, "div"), Token(TokenType.CLASS, ".main"), Token(TokenType.ID, "#id")]
stream = TokenStream(tokens)
# Consume until we see an ID token
consumed = list(stream.consume_while(lambda t: t.type != TokenType.ID))
assert consumed == tokens[:2]
assert stream.peek() == tokens[2]


def test_consume_while_all() -> None:
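    """consume_while() consumes the whole stream when the predicate never fails."""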
tokens = [Token(TokenType.TAG, "div"), Token(TokenType.CLASS, ".main"), Token(TokenType.ID, "#id")]
stream = TokenStream(tokens)
    # No LPARENS token exists, so the predicate never fails and everything is consumed
consumed = list(stream.consume_while(lambda t: t.type != TokenType.LPARENS))
assert consumed == tokens
assert stream.peek() is None


def test_reset() -> None:
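    """reset() rewinds the stream back to its first token."""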
tokens = [Token(TokenType.TAG, "div"), Token(TokenType.CLASS, ".main")]
stream = TokenStream(tokens)
_ = stream.pop()
assert stream.peek() == tokens[1]
stream.reset()
assert stream.peek() == tokens[0]


def test_str_and_raw_str() -> None:
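    """str() and the raw_str property reproduce the selector text from the token values."""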
tokens = [
Token(TokenType.TAG, "div"),
Token(TokenType.CLASS, ".main"),
Token(TokenType.COMMA, ", "),
Token(TokenType.TAG, "a"),
]
stream = TokenStream(tokens)
assert str(stream) == "div.main, a"
assert stream.raw_str == "div.main, a"


# endregion