#!/usr/bin/env python3
"""
Generate test data for performance benchmarking.
Creates realistic conversations with varied sizes and content to match
the README claim of 159 conversations totaling 8.8MB.
"""
import json
import random
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, Optional, Tuple
# Add parent directory to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
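# NOTE: this assumes conversation_memory.py (providing ConversationMemoryServer)
# lives in ../src relative to this file; adjust the path if your layout differs.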
from conversation_memory import ConversationMemoryServer
# Constants
TECH_TOPICS = [
    "python", "javascript", "react", "nodejs", "aws", "docker", "kubernetes",
    "terraform", "api", "database", "sql", "mongodb", "redis", "git", "github",
    "authentication", "security", "testing", "deployment", "ci/cd", "microservices",
    "machine learning", "data science", "frontend", "backend", "devops"
]

CODE_SNIPPETS = [
    """
def process_data(items):
    '''Process a list of items and return results'''
    results = []
    for item in items:
        if validate_item(item):
            results.append(transform_item(item))
    return results
""",
    """
async function fetchUserData(userId) {
  try {
    const response = await fetch(`/api/users/${userId}`);
    const data = await response.json();
    return data;
  } catch (error) {
    console.error('Failed to fetch user:', error);
    throw error;
  }
}
""",
    """
SELECT u.name, u.email, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
WHERE u.created_at > '2024-01-01'
GROUP BY u.id, u.name, u.email
HAVING COUNT(o.id) > 5
ORDER BY order_count DESC;
""",
    """
apiVersion: apps/v1
kind: Deployment
metadata:
  name: web-app
spec:
  replicas: 3
  selector:
    matchLabels:
      app: web-app
  template:
    metadata:
      labels:
        app: web-app
    spec:
      containers:
      - name: app
        image: myapp:latest
        ports:
        - containerPort: 8080
"""
]
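
# Note: braces inside these snippets are safe to substitute into the templates
# below; str.format() does not re-scan substituted values for replacement fields.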

CONVERSATION_TEMPLATES = [
    {
        "type": "code_review",
        "title_template": "Code review: {topic} implementation",
        "content_template": """
I'm reviewing the {topic} implementation and have some questions about the architecture.

Current code:
{code_snippet}

My main concerns are:
1. {concern1}
2. {concern2}
3. {concern3}

What would you recommend for improving this? I'm particularly interested in {specific_interest}.

Also, I've been reading about {related_topic} and wondering if that pattern would apply here.
"""
    },
    {
        "type": "debugging",
        "title_template": "Debugging {topic} issue in production",
        "content_template": """
We're experiencing an issue with our {topic} system in production.

Error message:
```
{error_message}
```

Stack trace shows the problem originates from:
{code_snippet}

I've tried the following approaches:
- {approach1}
- {approach2}
- {approach3}

The issue seems to be related to {related_topic}. Any suggestions for troubleshooting this?
"""
    },
    {
        "type": "architecture",
        "title_template": "Architecture decision: {topic} vs {related_topic}",
        "content_template": """
We need to make a decision about our {topic} architecture.

Current situation:
- {situation1}
- {situation2}
- {situation3}

We're considering two approaches:

Option A: {topic} based solution
{code_snippet}

Option B: {related_topic} based solution
- {benefit1}
- {benefit2}
- {tradeoff1}

What factors should we consider? Our main priorities are {priority1} and {priority2}.
"""
    },
    {
        "type": "learning",
        "title_template": "Learning {topic}: best practices and patterns",
        "content_template": """
I'm trying to understand {topic} better and how it relates to {related_topic}.

From what I understand:
- {understanding1}
- {understanding2}
- {understanding3}

Example I found:
{code_snippet}

Questions:
1. {question1}
2. {question2}
3. {question3}

Can you explain how this connects to {another_topic} and what best practices I should follow?
"""
    }
]
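
# Each template above is filled via a single str.format() call in
# TestDataGenerator.generate_conversation_content(). A missing placeholder
# raises KeyError, while extra keyword arguments are silently ignored, which
# is why one call can supply fields for all four templates.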

CONCERNS = [
    "performance implications", "scalability concerns", "security vulnerabilities",
    "maintainability", "testing coverage", "error handling", "memory usage",
    "compatibility issues", "dependency management", "documentation clarity"
]

ERROR_MESSAGES = [
    "TypeError: Cannot read property 'undefined' of null",
    "ConnectionError: Unable to connect to database",
    "MemoryError: Heap out of memory",
    "AuthenticationError: Invalid credentials",
    "TimeoutError: Request exceeded timeout of 30s",
    "ValidationError: Required field missing",
    "PermissionError: Access denied to resource"
]
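
# For a reproducible dataset, seed the module-level RNG before generating
# (a minimal sketch, not part of the original behavior):
#   random.seed(42)  # any fixed seed makes topic/size choices deterministic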


class TestDataGenerator:
    """Generate realistic test conversations for benchmarking."""

    def __init__(self, storage_path: str = "~/claude-memory-test"):
        self.storage_path = Path(storage_path).expanduser()
        self.server = ConversationMemoryServer(str(self.storage_path))

    def generate_conversation_content(self, size_category: str) -> Tuple[str, str]:
        """Generate a single conversation with title and content."""
        template = random.choice(CONVERSATION_TEMPLATES)
        topic = random.choice(TECH_TOPICS)
        related_topic = random.choice([t for t in TECH_TOPICS if t != topic])
        another_topic = random.choice([t for t in TECH_TOPICS if t not in [topic, related_topic]])

        # Generate title
        title = template["title_template"].format(
            topic=topic,
            related_topic=related_topic
        )

        # Generate content
        content = template["content_template"].format(
            topic=topic,
            related_topic=related_topic,
            another_topic=another_topic,
            code_snippet=random.choice(CODE_SNIPPETS),
            concern1=random.choice(CONCERNS),
            concern2=random.choice(CONCERNS),
            concern3=random.choice(CONCERNS),
            specific_interest=random.choice(CONCERNS),
            error_message=random.choice(ERROR_MESSAGES),
            approach1=f"Checking {random.choice(TECH_TOPICS)} configuration",
            approach2=f"Updating {random.choice(TECH_TOPICS)} dependencies",
            approach3=f"Refactoring {random.choice(TECH_TOPICS)} implementation",
            situation1=f"Using {random.choice(TECH_TOPICS)} for {random.choice(['data processing', 'API handling', 'user management'])}",
            situation2=f"Need to scale to {random.choice(['10k', '100k', '1M'])} users",
            situation3=f"Integrating with {random.choice(TECH_TOPICS)} services",
            benefit1=f"Better {random.choice(['performance', 'scalability', 'maintainability'])}",
            benefit2=f"Easier {random.choice(['testing', 'deployment', 'monitoring'])}",
            tradeoff1=f"Increased {random.choice(['complexity', 'cost', 'learning curve'])}",
            priority1=random.choice(['performance', 'reliability', 'developer experience']),
            priority2=random.choice(['cost efficiency', 'scalability', 'security']),
            understanding1=f"{topic} is used for {random.choice(['data processing', 'service communication', 'state management'])}",
            understanding2=f"It integrates well with {random.choice(TECH_TOPICS)}",
            understanding3=f"Common patterns include {random.choice(['singleton', 'factory', 'observer', 'MVC'])}",
            question1=f"How does {topic} handle {random.choice(['concurrency', 'errors', 'state'])}?",
            question2=f"What's the difference between {topic} and {related_topic}?",
            question3=f"When should I use {topic} vs {another_topic}?"
        )

        # Adjust content size based on category
        if size_category == "small":
            # Target 1-5KB
            while len(content) < 1000:
                content += f"\n\nMore details about {topic}: " + content[:500]
            content = content[:5000]  # Cap at 5KB
        elif size_category == "medium":
            # Target 5-50KB
            while len(content) < 5000:
                content += f"\n\nAdditional context about {topic}:\n{content}"
            content = content[:50000]  # Cap at 50KB
        elif size_category == "large":
            # Target 50-100KB
            while len(content) < 50000:
                extra_content = f"\n\nDeep dive into {topic} and {related_topic}:\n{content}"
                content += extra_content
            content = content[:100000]  # Cap at 100KB

        return title, content
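
    # Minimal usage sketch (assumes ConversationMemoryServer can be constructed
    # against the default test path):
    #   gen = TestDataGenerator()
    #   title, content = gen.generate_conversation_content("medium")
    #   assert 5000 <= len(content) <= 50000  # "medium" pads to >=5KB, caps at 50KB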

    async def generate_dataset(self, num_conversations: int, start_date: Optional[datetime] = None) -> Dict[str, Any]:
"""Generate a complete dataset of conversations."""
if start_date is None:
start_date = datetime.now() - timedelta(days=90) # 3 months ago
stats = {
"total_conversations": 0,
"total_size_bytes": 0,
"size_distribution": {"small": 0, "medium": 0, "large": 0},
"topics_covered": set(),
"date_range": {"start": None, "end": None}
}

        for i in range(num_conversations):
            # Determine size category based on distribution
            # Adjust to get average ~55KB per conversation (8.8MB / 159)
            rand = random.random()
            if rand < 0.15:  # 15% small
                size_category = "small"
            elif rand < 0.45:  # 30% medium
                size_category = "medium"
            else:  # 55% large
                size_category = "large"

            # Generate conversation
            title, content = self.generate_conversation_content(size_category)

            # Generate timestamp (spread across the 90-day window)
            days_offset = random.randint(0, 90)
            hours_offset = random.randint(0, 23)
            timestamp = start_date + timedelta(days=days_offset, hours=hours_offset)

            # Add conversation
            result = await self.server.add_conversation(
                content=content,
                title=title,
                conversation_date=timestamp.isoformat()
            )

            if result["status"] == "success":
                stats["total_conversations"] += 1
                stats["size_distribution"][size_category] += 1

                # Track file size
                file_path = Path(result["file_path"])
                if file_path.exists():
                    stats["total_size_bytes"] += file_path.stat().st_size

                # Track topics
                stats["topics_covered"].update(result.get("topics", []))

                # Track date range
                if stats["date_range"]["start"] is None or timestamp < stats["date_range"]["start"]:
                    stats["date_range"]["start"] = timestamp
                if stats["date_range"]["end"] is None or timestamp > stats["date_range"]["end"]:
                    stats["date_range"]["end"] = timestamp

            if (i + 1) % 10 == 0:
                print(f"Generated {i + 1}/{num_conversations} conversations...")

        # Convert the topics set and datetime objects for JSON serialization
        stats["topics_covered"] = list(stats["topics_covered"])
        stats["date_range"]["start"] = stats["date_range"]["start"].isoformat() if stats["date_range"]["start"] else None
        stats["date_range"]["end"] = stats["date_range"]["end"].isoformat() if stats["date_range"]["end"] else None

        return stats
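
    # Standalone sketch (run outside an existing event loop):
    #   import asyncio
    #   stats = asyncio.run(TestDataGenerator().generate_dataset(10))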

    def print_stats(self, stats: Dict[str, Any]) -> None:
        """Print generation statistics."""
        total_mb = stats["total_size_bytes"] / (1024 * 1024)
        avg_kb = (stats["total_size_bytes"] / stats["total_conversations"]) / 1024 if stats["total_conversations"] > 0 else 0
print("\n=== Generation Statistics ===")
print(f"Total conversations: {stats['total_conversations']}")
print(f"Total size: {total_mb:.2f} MB ({stats['total_size_bytes']} bytes)")
print(f"Average size: {avg_kb:.2f} KB per conversation")
print(f"\nSize distribution:")
print(f" Small (1-5KB): {stats['size_distribution']['small']}")
print(f" Medium (5-50KB): {stats['size_distribution']['medium']}")
print(f" Large (50-100KB): {stats['size_distribution']['large']}")
print(f"\nTopics covered: {len(stats['topics_covered'])}")
print(f"Date range: {stats['date_range']['start']} to {stats['date_range']['end']}")
# Check against README claims
print(f"\n=== README Claim Validation ===")
print(f"Claimed: 159 conversations, 8.8MB")
print(f"Generated: {stats['total_conversations']} conversations, {total_mb:.2f}MB")
if stats['total_conversations'] == 159:
size_diff = abs(total_mb - 8.8)
if size_diff < 0.5: # Within 0.5MB
print("✅ Matches README claim!")
else:
print(f"⚠️ Size difference: {size_diff:.2f}MB")


async def main():
    """Main entry point."""
    import argparse

    parser = argparse.ArgumentParser(description="Generate test data for performance benchmarking")
    parser.add_argument("--conversations", "-n", type=int, default=159,
                        help="Number of conversations to generate (default: 159)")
    parser.add_argument("--storage-path", "-p", type=str, default="~/claude-memory-test",
                        help="Storage path for generated data (default: ~/claude-memory-test)")
    parser.add_argument("--clean", action="store_true",
                        help="Clean existing data before generating")
    args = parser.parse_args()

    # Clean if requested
    if args.clean:
        storage_path = Path(args.storage_path).expanduser()
        if storage_path.exists():
            import shutil
            shutil.rmtree(storage_path)
            print(f"Cleaned existing data at {storage_path}")

    # Generate data
    generator = TestDataGenerator(args.storage_path)
    print(f"Generating {args.conversations} conversations...")
    stats = await generator.generate_dataset(args.conversations)
    generator.print_stats(stats)

    # Save stats
    stats_file = Path(args.storage_path).expanduser() / "generation_stats.json"
    with open(stats_file, 'w') as f:
        json.dump(stats, f, indent=2)
    print(f"\nStats saved to: {stats_file}")
if __name__ == "__main__":
import asyncio
asyncio.run(main())