tenets

Overview Schema Related Servers Score Discussions

test_tokens.py•11 KiB

"""Tests for token utilities."""

from unittest.mock import patch

import pytest

from tenets.utils.tokens import (
    _HAS_TIKTOKEN,
    _get_encoding_for_model,
    chunk_text,
    count_tokens,
    get_model_max_tokens,
)


class TestTokenCounting:
    """Test suite for token counting functionality."""

    def test_count_tokens_empty(self):
        """Test counting tokens in empty string."""
        assert count_tokens("") == 0
        assert count_tokens("", model="gpt-4") == 0

    def test_count_tokens_heuristic(self):
        """Test token counting with heuristic (no tiktoken)."""
        # Mock tiktoken not being available
        with patch("tenets.utils.tokens._HAS_TIKTOKEN", False):
            # Heuristic is ~4 chars per token
            text = "This is a test string with some words."
            token_count = count_tokens(text)

            # Should be approximately len(text) / 4
            expected = max(1, len(text) // 4)
            assert token_count == expected

    def test_count_tokens_short_text(self):
        """Test token counting with short text."""
        text = "Hello"
        count = count_tokens(text)

        # Short text should be at least 1 token
        assert count >= 1

    @pytest.mark.skipif(not _HAS_TIKTOKEN, reason="tiktoken not installed")
    def test_count_tokens_with_tiktoken(self):
        """Test token counting with tiktoken."""
        text = "The quick brown fox jumps over the lazy dog."

        # Test with different models
        count_gpt4 = count_tokens(text, model="gpt-4")
        count_gpt4o = count_tokens(text, model="gpt-4o")

        assert count_gpt4 > 0
        assert count_gpt4o > 0

        # Both should give reasonable counts
        assert 5 <= count_gpt4 <= 20
        assert 5 <= count_gpt4o <= 20

    @pytest.mark.skipif(not _HAS_TIKTOKEN, reason="tiktoken not installed")
    def test_count_tokens_model_specific(self):
        """Test model-specific token counting."""
        text = "Test text for tokenization"

        # Different models might have different tokenizations
        models = ["gpt-4", "gpt-4o", "gpt-4o-mini"]
        counts = {model: count_tokens(text, model=model) for model in models}

        # All should produce counts
        for model, count in counts.items():
            assert count > 0, f"no tokens counted for model {model!r}"

    def test_count_tokens_unicode(self):
        """Test token counting with unicode text."""
        texts = [
            "Hello 世界",  # Chinese
            "Привет мир",  # Russian
            "🚀 Emoji text 🎉",  # Emojis
            "Ñoño",  # Spanish
        ]

        for text in texts:
            count = count_tokens(text)
            assert count > 0, f"no tokens counted for {text!r}"

    def test_count_tokens_code(self):
        """Test token counting with code."""
        code = """
def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)
"""
        count = count_tokens(code)
        assert count > 10  # Code should have reasonable token count

    @pytest.mark.skipif(not _HAS_TIKTOKEN, reason="tiktoken not installed")
    def test_get_encoding_for_model(self):
        """Test getting encoding for different models."""
        # Known models
        enc = _get_encoding_for_model("gpt-4")
        assert enc is not None

        enc = _get_encoding_for_model("gpt-4o")
        assert enc is not None

        # Unknown model should fall back to cl100k_base
        enc = _get_encoding_for_model("unknown-model")
        assert enc is not None

        # No model should still return an encoding
        enc = _get_encoding_for_model(None)
        assert enc is not None

    # patch("tiktoken.get_encoding") imports the ``tiktoken`` module to resolve
    # its target, so without the package this test would error with
    # ModuleNotFoundError instead of exercising the fallback path.
    @pytest.mark.skipif(not _HAS_TIKTOKEN, reason="tiktoken not installed")
    def test_get_encoding_error_handling(self):
        """Test encoding error handling."""
        with patch("tenets.utils.tokens._HAS_TIKTOKEN", True):
            with patch("tiktoken.get_encoding", side_effect=Exception("Encoding error")):
                enc = _get_encoding_for_model("gpt-4")
                assert enc is None

                # Should fall back to heuristic
                count = count_tokens("test text")
                assert count > 0


class TestModelMaxTokens:
    """Test suite for model max tokens functionality."""

    def test_get_model_max_tokens_known_models(self):
        """Test getting max tokens for known models."""
        test_cases = [
            ("gpt-4", 8_192),
            ("gpt-4.1", 128_000),
            ("gpt-4o", 128_000),
            ("gpt-4o-mini", 128_000),
            # ("gpt-3.5-turbo", 16_385),  # legacy
            ("claude-3-opus", 200_000),
            ("claude-3-5-sonnet", 200_000),
            ("claude-3-haiku", 200_000),
        ]

        for model, expected in test_cases:
            actual = get_model_max_tokens(model)
            assert actual == expected, f"unexpected max tokens for model {model!r}"

    def test_get_model_max_tokens_unknown_model(self):
        """Test getting max tokens for unknown model."""
        assert get_model_max_tokens("unknown-model") == 100_000
        assert get_model_max_tokens("gpt-5-future") == 100_000

    def test_get_model_max_tokens_none(self):
        """Test getting max tokens with None model."""
        assert get_model_max_tokens(None) == 100_000


class TestTextChunking:
    """Test suite for text chunking functionality."""

    def test_chunk_text_empty(self):
        """Test chunking empty text."""
        chunks = chunk_text("", max_tokens=100)
        assert chunks == [""]

    def test_chunk_text_small(self):
        """Test chunking text smaller than limit."""
        text = "This is a small text."
        chunks = chunk_text(text, max_tokens=100)

        assert len(chunks) == 1
        assert chunks[0] == text

    def test_chunk_text_multiline(self):
        """Test chunking multiline text."""
        text = "\n".join(f"Line {i}" for i in range(1, 6))

        # Force small chunks
        chunks = chunk_text(text, max_tokens=5)

        assert len(chunks) > 1

        # Chunks should be line-aware
        for chunk in chunks:
            # Each chunk should contain complete lines
            assert chunk.strip() != ""

    def test_chunk_text_preserves_lines(self):
        """Test that chunking preserves line boundaries."""
        lines = [f"This is line number {i} with some content" for i in range(20)]
        text = "\n".join(lines)

        chunks = chunk_text(text, max_tokens=50)

        # Reconstruct text from chunks
        reconstructed = "".join(chunks)

        # Should preserve all content
        assert reconstructed == text

        # Every non-empty chunk except the last must end on a line boundary;
        # only the final chunk may lack a trailing newline.
        for chunk in chunks[:-1]:
            if chunk:
                assert chunk.endswith("\n"), f"chunk split mid-line: {chunk!r}"

    def test_chunk_text_with_model(self):
        """Test chunking with specific model."""
        text = "Test text " * 100  # Repeat to make longer

        chunks = chunk_text(text, max_tokens=50, model="gpt-4")

        assert len(chunks) > 1

        # Each chunk should be within token limit
        for chunk in chunks:
            token_count = count_tokens(chunk, model="gpt-4")
            # Allow some flexibility for edge cases
            assert token_count <= 60  # Small buffer for boundary issues

    def test_chunk_text_zero_max_tokens(self):
        """Test chunking with zero or negative max tokens."""
        text = "Some text content"

        chunks = chunk_text(text, max_tokens=0)
        assert chunks == [text]

        chunks = chunk_text(text, max_tokens=-10)
        assert chunks == [text]

    def test_chunk_text_single_long_line(self):
        """Test chunking a single very long line."""
        # Single line that's too long for one chunk
        text = "word " * 1000  # No newlines

        chunks = chunk_text(text, max_tokens=100)

        # Should still chunk even without line breaks
        assert len(chunks) > 1

        # Reconstruct should preserve content
        assert "".join(chunks) == text

    def test_chunk_text_code(self):
        """Test chunking code with proper line preservation."""
        code = """
def example_function(param1, param2):
    '''This is a docstring.'''
    result = param1 + param2

    if result > 100:
        print("Large result")
        return result * 2
    else:
        print("Small result")
        return result

    # This should not be reached
    raise ValueError("Unexpected")

class ExampleClass:
    def __init__(self):
        self.value = 42

    def method(self):
        return self.value * 2
"""

        chunks = chunk_text(code, max_tokens=50)

        # Should preserve code structure
        reconstructed = "".join(chunks)
        assert reconstructed == code

        # Each chunk should be valid code lines
        for chunk in chunks:
            for line in chunk.split("\n"):
                if line.strip():  # Non-empty lines
                    # A line must start with indentation, a comment marker,
                    # a quote, or a letter (keyword / identifier).
                    first = line[0]
                    assert first in " #'\"" or first.isalpha()

    def test_chunk_text_mixed_content(self):
        """Test chunking mixed content (prose and code)."""
        content = (
            """
# Documentation

This is some documentation about the following code:

```python
def hello():
    print("Hello, world!")
```

And here's more text after the code block.
"""
            * 10
        )  # Repeat to make longer

        chunks = chunk_text(content, max_tokens=100)

        assert len(chunks) > 1

        # Content should be preserved
        assert "".join(chunks) == content

    def test_chunk_text_boundary_cases(self):
        """Test chunk boundaries with exact token limits."""
        # Create text with known character count
        # The heuristic uses ~4 chars per token
        text_small = "x" * 40  # ~10 tokens by heuristic
        text_large = "x" * 100  # ~25 tokens by heuristic

        # Small text should fit in one chunk with reasonable limit
        chunks = chunk_text(text_small, max_tokens=15)
        # Should fit in one chunk
        assert len(chunks) == 1

        # Large text should need multiple chunks with small limit
        chunks = chunk_text(text_large, max_tokens=10)
        # Should need multiple chunks (100 chars / (10 tokens * 4 chars/token) = 2.5)
        assert len(chunks) >= 2

        # Verify concatenation preserves content
        assert "".join(chunks) == text_large

    def test_chunk_text_unicode_handling(self):
        """Test chunking with unicode characters."""
        text = (
            """
English text
中文文本
Текст на русском
日本語のテキスト
한국어 텍스트
عربي نص
🚀 Emoji line 🎉
"""
            * 5
        )

        chunks = chunk_text(text, max_tokens=50)

        # Should handle unicode properly
        reconstructed = "".join(chunks)
        assert reconstructed == text

        # No corruption of unicode
        assert "中文文本" in reconstructed
        assert "🚀" in reconstructed

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jddunn/tenets'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_tokens.py•11 KiB