Markdown RAG Documentation

Overview Schema Related Servers Score Discussions

ragdocs-mcp
tests
performance

test_indexing_speed.py•8.76 KiB

"""Performance benchmarks for document indexing speed.

Each benchmark writes a synthetic markdown corpus, indexes it through an
``IndexManager``, prints timing metrics, and asserts a loose upper bound on
the total elapsed time.
"""

import time
from pathlib import Path

import pytest

from src.config import Config, IndexingConfig, LLMConfig, SearchConfig, ServerConfig
from src.indexing.manager import IndexManager
from src.indices.graph import GraphStore
from src.indices.keyword import KeywordIndex
from src.indices.vector import VectorIndex

# (number of sections, sentences per section) for each named document size.
# Unknown size names fall back to the "medium" shape.
_DOC_SHAPES = {
    "small": (2, 3),
    "medium": (5, 5),
    "large": (10, 8),
}

# Subject phrases rotated through the generated sentences.
_TOPICS = (
    "machine learning algorithms and neural network architectures",
    "distributed systems design and microservices patterns",
    "database optimization techniques and query performance",
    "cloud infrastructure scaling and container orchestration",
    "software testing strategies and continuous integration",
    "API design principles and RESTful architecture",
    "security best practices and authentication mechanisms",
    "data structures and algorithm complexity analysis",
)


@pytest.fixture
def config(tmp_path) -> Config:
    """Build a Config whose documents and index paths live under ``tmp_path``."""
    docs_path = tmp_path / "docs"
    docs_path.mkdir()
    return Config(
        server=ServerConfig(),
        indexing=IndexingConfig(
            documents_path=str(docs_path),
            index_path=str(tmp_path / "indices"),
        ),
        parsers={"**/*.md": "MarkdownParser"},
        search=SearchConfig(),
        llm=LLMConfig(),
    )


@pytest.fixture(scope="module")
def indices(shared_embedding_model):
    """
    Module-scoped indices with shared embedding model.

    Scope changed to 'module' to avoid redundant embedding model loading.
    Each test creates its own documents in tmp_path, providing isolation
    despite shared indices.

    Returns:
        A ``(vector, keyword, graph)`` tuple of index objects.
    """
    # NOTE(review): these objects are reused by every test in the module;
    # confirm that document counts do not accumulate across tests.
    vector = VectorIndex(embedding_model=shared_embedding_model)
    keyword = KeywordIndex()
    graph = GraphStore()
    return vector, keyword, graph


@pytest.fixture
def manager(config, indices) -> IndexManager:
    """Create an IndexManager over the per-test config and the shared indices."""
    vector, keyword, graph = indices
    return IndexManager(config, vector, keyword, graph)


def create_realistic_document(doc_id: int, size: str = "medium") -> str:
    """Generate synthetic markdown content for one benchmark document.

    Args:
        doc_id: Number used in the title and aliases, and to rotate topics.
        size: "small", "medium" or "large"; any other value is treated as
            "medium".

    Returns:
        The markdown text: a title, tag/alias lines, and numbered sections of
        generated sentences, with a wikilink to the previous document after
        every even-indexed section when ``doc_id > 1``.
    """
    paragraphs, sentences_per_para = _DOC_SHAPES.get(size, _DOC_SHAPES["medium"])

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        f"# Document {doc_id}: Performance Testing\n\n",
        "tags: [performance, test, benchmark]\n",
        f"aliases: [doc-{doc_id}, test-{doc_id}]\n\n",
    ]

    for i in range(paragraphs):
        parts.append(f"## Section {i + 1}\n\n")
        for j in range(sentences_per_para):
            topic = _TOPICS[(doc_id + i + j) % len(_TOPICS)]
            parts.append(
                f"This paragraph discusses {topic} in the context of modern software development. "
            )
        parts.append("\n\n")

        # Add a wikilink after every second section.
        # NOTE(review): `doc_id > 1` means document 1 never links to
        # document 0 - confirm whether `doc_id > 0` was intended.
        if i % 2 == 0 and doc_id > 1:
            parts.append(f"See also [[Document {doc_id - 1}]] for related information.\n\n")

    return "".join(parts)


def create_test_corpus(docs_path: Path, num_docs: int, doc_size: str = "medium"):
    """Write ``num_docs`` generated documents into ``docs_path``.

    Files are named ``doc_0000.md``, ``doc_0001.md``, ... and are written as
    UTF-8 so the corpus does not depend on the platform's default encoding.
    """
    for i in range(num_docs):
        doc_file = docs_path / f"doc_{i:04d}.md"
        doc_file.write_text(create_realistic_document(i, doc_size), encoding="utf-8")


def _run_indexing_benchmark(
    manager,
    docs_path: Path,
    num_docs: int,
    *,
    title: str,
    max_seconds_per_doc: float,
    corpus_note: str = "",
    slow_label: str = "docs",
) -> float:
    """Index every ``*.md`` file in ``docs_path``, report metrics, and assert bounds.

    Args:
        manager: Object providing ``index_document(path)`` and
            ``get_document_count()``.
        docs_path: Directory holding the markdown corpus.
        num_docs: Number of documents expected to be indexed.
        title: Text placed inside the report header's parentheses.
        max_seconds_per_doc: Per-document time budget; the total budget is
            ``num_docs * max_seconds_per_doc``.
        corpus_note: Optional extra line printed right after the header.
        slow_label: Noun phrase used in the "too slow" failure message.

    Returns:
        Elapsed indexing time in seconds.
    """
    start_time = time.perf_counter()
    for doc_file in sorted(docs_path.glob("*.md")):
        manager.index_document(str(doc_file))
    elapsed = time.perf_counter() - start_time

    # Guard against a zero reading from a coarse clock.
    throughput = num_docs / elapsed if elapsed > 0 else float("inf")
    avg_time_per_doc = elapsed / num_docs

    # Verify indexing completed before reporting any numbers.
    doc_count = manager.get_document_count()
    assert doc_count == num_docs, f"Expected {num_docs} documents, got {doc_count}"

    print(f"\n=== Indexing Performance ({title}) ===")
    if corpus_note:
        print(corpus_note)
    print(f"Total time: {elapsed:.2f}s")
    print(f"Throughput: {throughput:.2f} docs/sec")
    print(f"Avg per doc: {avg_time_per_doc:.3f}s")
    print(f"Documents indexed: {doc_count}")

    # Performance baseline assertion (should complete in reasonable time).
    assert elapsed < (num_docs * max_seconds_per_doc), (
        f"Indexing too slow: {elapsed:.2f}s for {num_docs} {slow_label}"
    )
    return elapsed


def test_indexing_speed_10_documents(config, manager, tmp_path):
    """Benchmark indexing speed for 10 documents."""
    docs_path = Path(config.indexing.documents_path)
    num_docs = 10
    create_test_corpus(docs_path, num_docs, doc_size="medium")

    # Allow up to 5 seconds per document for embedding model load + indexing.
    _run_indexing_benchmark(
        manager,
        docs_path,
        num_docs,
        title="10 documents",
        max_seconds_per_doc=5.0,
    )


def test_indexing_speed_50_documents(config, manager, tmp_path):
    """Benchmark indexing speed for 50 documents."""
    docs_path = Path(config.indexing.documents_path)
    num_docs = 50
    create_test_corpus(docs_path, num_docs, doc_size="medium")

    # Allow up to 2 seconds per document on average after model warmup.
    _run_indexing_benchmark(
        manager,
        docs_path,
        num_docs,
        title="50 documents",
        max_seconds_per_doc=2.0,
    )


def test_indexing_speed_100_documents(config, manager, tmp_path):
    """Benchmark indexing speed for 100 documents."""
    docs_path = Path(config.indexing.documents_path)
    num_docs = 100
    create_test_corpus(docs_path, num_docs, doc_size="medium")

    # Allow up to 2 seconds per document on average after model warmup.
    _run_indexing_benchmark(
        manager,
        docs_path,
        num_docs,
        title="100 documents",
        max_seconds_per_doc=2.0,
    )


def test_indexing_speed_varying_document_sizes(config, tmp_path, shared_embedding_model):
    """
    Benchmark indexing with varied document sizes.

    Indexes one small, one medium, and one large document to identify
    size-dependent performance characteristics.

    Note: Uses isolated indices to avoid state leakage from other tests.
    """
    sizes = ["small", "medium", "large"]
    docs_path = tmp_path / "docs"
    docs_path.mkdir(exist_ok=True)  # the `config` fixture has already created it

    # One document per size.
    num_docs = len(sizes)
    for i, size in enumerate(sizes):
        doc_file = docs_path / f"doc_{size}_{i:02d}.md"
        doc_file.write_text(create_realistic_document(i, size=size), encoding="utf-8")

    # Create isolated indices for this test to avoid state leakage.
    isolated_manager = IndexManager(
        config,
        VectorIndex(embedding_model=shared_embedding_model),
        KeywordIndex(),
        GraphStore(),
    )

    # Describe the corpus that was actually written rather than a fixed count.
    corpus_note = "Corpus: " + " + ".join(f"1 {size}" for size in sizes) + " documents"

    _run_indexing_benchmark(
        isolated_manager,
        docs_path,
        num_docs,
        title="Mixed Document Sizes",
        max_seconds_per_doc=3.0,
        corpus_note=corpus_note,
        slow_label="mixed docs",
    )

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/andnp/ragdocs-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_indexing_speed.py•8.76 KiB