Skip to main content
Glama
dataset-analyzer.ts10.6 kB
/**
 * Dataset Analyzer for Multi-Model Comparison
 *
 * Analyzes evaluation datasets to group them by scenario and extract
 * model responses for comparative evaluation.
 */

import { readFileSync, readdirSync } from 'fs';
import { basename, join } from 'path';

/** Filename segment that separates the scenario prefix from the model name. */
const SDK_MARKER = 'vercel';

/** Extension carried by every dataset file. */
const DATASET_EXTENSION = '.jsonl';

/** One evaluation sample as stored on disk (one JSON document per file). */
export interface DatasetSample {
  input: {
    issue: string;
    [key: string]: any;
  };
  output: string;
  performance: {
    duration_ms: number;
    input_tokens: number;
    output_tokens: number;
    total_tokens: number;
    model_version: string;
    sdk: string;
    iterations?: number;
    tool_calls_executed?: number;
    cache_read_tokens?: number;
    cache_creation_tokens?: number;
  };
  metadata: {
    timestamp: string;
    complexity: string;
    tags: string[];
    source: string;
    tool: string;
    test_scenario: string;
    // Stored as a JSON-encoded string; an empty string means "no failure".
    failure_analysis: string;
  };
}

/** A single model's (possibly combined) response within a scenario. */
export interface ModelResponse {
  model: string;
  response: string;
  performance: {
    duration_ms: number;
    input_tokens: number;
    output_tokens: number;
    total_tokens: number;
    iterations?: number;
    tool_calls_executed?: number;
    cache_read_tokens?: number;
    cache_creation_tokens?: number;
  };
  metadata: {
    timestamp: string;
    complexity: string;
    test_scenario: string;
    issue?: string;
    interaction_count?: number;
    failure_analysis?: {
      failure_type: string;
      failure_reason: string;
      time_to_failure: number;
    };
    all_failures?: Array<{
      failure_type: string;
      failure_reason: string;
      time_to_failure: number;
    }>;
  };
}

/** All model responses collected for one scenario of one tool. */
export interface ComparisonScenario {
  issue: string;
  interaction_id: string;
  tool: string;
  models: ModelResponse[];
}

/** Parsed failure analysis attached to a model response. */
type FailureAnalysis = NonNullable<ModelResponse['metadata']['failure_analysis']>;

/** Failure analysis annotated with the interaction it came from. */
type IndexedFailure = FailureAnalysis & {
  interaction_number: number;
  issue?: string;
};

export class DatasetAnalyzer {
  private datasetDir: string;

  /**
   * @param datasetDir Directory that holds the `.jsonl` dataset files.
   */
  constructor(datasetDir: string = './eval/datasets') {
    this.datasetDir = datasetDir;
  }

  /**
   * Find all available datasets for a specific tool.
   *
   * @param tool Tool name; files must be named `{tool}_...jsonl`.
   * @returns Paths (dataset directory joined with file name) of matching files.
   */
  findDatasets(tool: string): string[] {
    const prefix = `${tool}_`;
    return readdirSync(this.datasetDir)
      .filter(file => file.startsWith(prefix) && file.endsWith(DATASET_EXTENSION))
      .map(file => join(this.datasetDir, file));
  }

  /**
   * Parse dataset filename to extract components.
   * Format: {tool}_{interaction_id}_{sdk}_{model}_{timestamp}.jsonl
   *
   * For remediate datasets the interaction id itself contains underscores:
   * remediate_{phase}_{action}_vercel_{model}_{timestamp}
   *
   * @param filename File name or path of a dataset file.
   * @returns The parsed components, or `null` when the name has fewer than
   *          five segments or does not contain the SDK marker.
   */
  parseDatasetFilename(filename: string): {
    tool: string;
    interaction_id: string;
    sdk: string;
    model: string;
    timestamp: string;
  } | null {
    // basename() understands the platform's separators, so paths produced by
    // path.join on Windows are stripped correctly as well.
    const stem = basename(filename).replace(/\.jsonl$/, '');
    const parts = stem.split('_');
    if (parts.length < 5) return null;

    // The SDK segment is the anchor that splits the scenario from the model.
    const sdkIndex = parts.indexOf(SDK_MARKER);
    if (sdkIndex === -1) return null;

    return {
      tool: parts[0],
      // interaction_id is everything between tool and sdk
      interaction_id: parts.slice(1, sdkIndex).join('_'),
      sdk: parts[sdkIndex],
      // model is everything between sdk and the trailing timestamp
      model: parts.slice(sdkIndex + 1, -1).join('_'),
      timestamp: parts[parts.length - 1]
    };
  }

  /**
   * Load and parse a dataset file.
   *
   * @param filepath Path of the dataset file; its whole content is parsed as
   *                 a single JSON document.
   * @returns The parsed sample, or `null` when the file is empty, unreadable
   *          or not valid JSON (a warning is logged in the latter two cases).
   */
  loadDataset(filepath: string): DatasetSample | null {
    try {
      const content = readFileSync(filepath, 'utf8').trim();
      if (!content) return null;
      return JSON.parse(content) as DatasetSample;
    } catch (error) {
      console.warn(`Failed to load dataset ${filepath}:`, error);
      return null;
    }
  }

  /**
   * Group datasets by scenario for comparative evaluation.
   *
   * The scenario key is the part of the file name that precedes the SDK
   * marker (tool + interaction id), so each phase gets its own evaluation.
   * Every scenario is returned, including those with only one model; several
   * samples from the same model are merged into a single response.
   *
   * @param tool Tool whose datasets should be grouped.
   * @returns One entry per scenario, with one response per model.
   */
  groupByScenario(tool: string): ComparisonScenario[] {
    const scenarioGroups = new Map<string, Map<string, ModelResponse[]>>();

    for (const filepath of this.findDatasets(tool)) {
      const sample = this.loadDataset(filepath);
      if (!sample) continue;

      // Scenario key: every filename segment before the SDK marker. When the
      // marker is absent the full file name is used as the key.
      const filenameParts = basename(filepath).split('_');
      const sdkIndex = filenameParts.indexOf(SDK_MARKER);
      const scenarioKey = (sdkIndex === -1 ? filenameParts : filenameParts.slice(0, sdkIndex)).join('_');

      // Within a scenario, samples are bucketed by the model recorded in the
      // sample itself (not the one encoded in the file name).
      const modelKey = `${sample.performance.sdk}_${sample.performance.model_version}`;

      let modelGroups = scenarioGroups.get(scenarioKey);
      if (!modelGroups) {
        modelGroups = new Map<string, ModelResponse[]>();
        scenarioGroups.set(scenarioKey, modelGroups);
      }

      let interactions = modelGroups.get(modelKey);
      if (!interactions) {
        interactions = [];
        modelGroups.set(modelKey, interactions);
      }

      interactions.push({
        model: modelKey,
        response: sample.output,
        performance: sample.performance,
        metadata: {
          timestamp: sample.metadata.timestamp,
          complexity: sample.metadata.complexity,
          test_scenario: sample.metadata.test_scenario,
          issue: sample.input.issue,
          failure_analysis: this.parseFailureAnalysis(sample.metadata.failure_analysis)
        }
      });
    }

    const scenarios: ComparisonScenario[] = [];
    for (const [scenarioKey, modelGroups] of scenarioGroups) {
      // Flatten model groups: a model with several interactions is collapsed
      // into one combined response.
      const allModelResponses: ModelResponse[] = [];
      for (const [modelKey, interactions] of modelGroups) {
        allModelResponses.push(
          interactions.length === 1
            ? interactions[0]
            : this.combineModelInteractions(modelKey, interactions)
        );
      }

      // Representative issue: the first model's (chronologically first)
      // interaction, falling back to the scenario key when it has none.
      const issue = allModelResponses[0]?.metadata.issue || scenarioKey;

      scenarios.push({
        issue,
        interaction_id: scenarioKey,
        tool,
        models: allModelResponses
      });
    }

    return scenarios;
  }

  /**
   * Decode the failure analysis stored on a sample.
   *
   * @param raw Value of `metadata.failure_analysis`; typed as a string, but
   *            already-decoded values are passed through unchanged.
   * @returns The decoded analysis, or `undefined` when the value is empty or
   *          cannot be parsed as JSON.
   */
  private parseFailureAnalysis(raw: unknown): FailureAnalysis | undefined {
    if (!raw) return undefined;
    if (typeof raw !== 'string') return raw as FailureAnalysis;
    try {
      return JSON.parse(raw) as FailureAnalysis;
    } catch {
      // Best effort: an unparseable analysis is treated as "no failure".
      return undefined;
    }
  }

  /**
   * Combine multiple interactions per model into a single response for evaluation.
   *
   * Interactions are ordered by timestamp; durations and token counts are
   * summed, while every other field is taken from the earliest interaction.
   *
   * @param modelKey Model identifier to put on the combined response.
   * @param interactions Interactions of that model (left unmodified).
   * @returns One response describing all interactions.
   */
  private combineModelInteractions(modelKey: string, interactions: ModelResponse[]): ModelResponse {
    // Sort a copy so the caller's array keeps its original order.
    const sorted = [...interactions].sort(
      (a, b) => new Date(a.metadata.timestamp).getTime() - new Date(b.metadata.timestamp).getTime()
    );
    const first = sorted[0];

    // Create combined response showing all interactions
    const combinedResponse = sorted
      .map((interaction, index) =>
        `**Interaction ${index + 1}:**\n` +
        `Issue: ${interaction.metadata.issue}\n` +
        `Response: ${interaction.response}\n`
      )
      .join('\n---\n');

    // Aggregate performance metrics
    let totalDuration = 0;
    let totalInputTokens = 0;
    let totalOutputTokens = 0;
    for (const { performance } of sorted) {
      totalDuration += performance.duration_ms;
      totalInputTokens += performance.input_tokens;
      totalOutputTokens += performance.output_tokens;
    }

    // Collect the failure analyses of every interaction that has one,
    // remembering which interaction each came from.
    const allFailures: IndexedFailure[] = [];
    sorted.forEach((interaction, index) => {
      if (interaction.metadata.failure_analysis) {
        allFailures.push({
          interaction_number: index + 1,
          issue: interaction.metadata.issue,
          ...interaction.metadata.failure_analysis
        });
      }
    });

    return {
      model: modelKey,
      response: combinedResponse,
      performance: {
        ...first.performance,
        duration_ms: totalDuration,
        input_tokens: totalInputTokens,
        output_tokens: totalOutputTokens,
        total_tokens: totalInputTokens + totalOutputTokens
      },
      metadata: {
        ...first.metadata,
        issue: first.metadata.issue, // Use first interaction's issue as primary
        interaction_count: interactions.length,
        // The first failure is the primary one; the full list is preserved too.
        failure_analysis: allFailures.length > 0 ? allFailures[0] : undefined,
        all_failures: allFailures.length > 0 ? allFailures : undefined
      }
    };
  }

  /**
   * Get summary of available models across all scenarios for a tool.
   *
   * @param tool Tool whose dataset file names should be inspected.
   * @returns Sorted, de-duplicated `{sdk}_{model}` identifiers taken from the
   *          file names; files whose names cannot be parsed are ignored.
   */
  getAvailableModels(tool: string): string[] {
    const models = new Set<string>();
    for (const filepath of this.findDatasets(tool)) {
      const parsed = this.parseDatasetFilename(filepath);
      if (parsed) {
        models.add(`${parsed.sdk}_${parsed.model}`);
      }
    }
    return Array.from(models).sort();
  }

  /**
   * Get statistics about dataset availability.
   *
   * @param tool Tool whose datasets should be summarised.
   * @returns Number of dataset files, the models and interaction ids found in
   *          their names, and how many scenarios have responses from more
   *          than one model.
   */
  getDatasetStats(tool: string): {
    totalDatasets: number;
    availableModels: string[];
    scenariosWithMultipleModels: number;
    interactionTypes: string[];
  } {
    const scenarios = this.groupByScenario(tool);
    const datasets = this.findDatasets(tool);

    const interactionTypes = new Set<string>();
    for (const filepath of datasets) {
      const parsed = this.parseDatasetFilename(filepath);
      if (parsed) {
        interactionTypes.add(parsed.interaction_id);
      }
    }

    return {
      totalDatasets: datasets.length,
      availableModels: this.getAvailableModels(tool),
      // groupByScenario returns every scenario, so single-model ones must be
      // filtered out here for the count to match its name.
      scenariosWithMultipleModels: scenarios.filter(scenario => scenario.models.length > 1).length,
      interactionTypes: Array.from(interactionTypes).sort()
    };
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/vfarcic/dot-ai'

If you have feedback or need assistance with the MCP directory API, please join our Discord server