/**
* MySQL Descriptive Statistics Tools
*
* Tools for basic statistical analysis: descriptive stats, percentiles,
* distribution, time series, and sampling.
* 5 tools total.
*/
import { z } from 'zod';
import type { MySQLAdapter } from '../../MySQLAdapter.js';
import type { ToolDefinition, RequestContext } from '../../../../types/index.js';
// =============================================================================
// Schemas
// =============================================================================
const DescriptiveStatsSchema = z.object({
table: z.string().describe('Table name'),
column: z.string().describe('Numeric column name'),
where: z.string().optional().describe('Optional WHERE clause condition')
});
const PercentilesSchema = z.object({
table: z.string().describe('Table name'),
column: z.string().describe('Numeric column name'),
percentiles: z.array(z.number().min(0).max(100)).default([25, 50, 75, 90, 95, 99]).describe('Percentiles to calculate'),
where: z.string().optional().describe('Optional WHERE clause condition')
});
const DistributionSchema = z.object({
table: z.string().describe('Table name'),
column: z.string().describe('Column to analyze'),
buckets: z.number().default(10).describe('Number of histogram buckets'),
where: z.string().optional().describe('Optional WHERE clause condition')
});
const TimeSeriesSchema = z.object({
table: z.string().describe('Table name'),
valueColumn: z.string().describe('Numeric column for values'),
timeColumn: z.string().describe('Timestamp/datetime column'),
interval: z.enum(['minute', 'hour', 'day', 'week', 'month']).default('day').describe('Aggregation interval'),
aggregation: z.enum(['avg', 'sum', 'count', 'min', 'max']).default('avg').describe('Aggregation function'),
where: z.string().optional().describe('Optional WHERE clause condition'),
limit: z.number().default(100).describe('Maximum number of data points')
});
const SamplingSchema = z.object({
table: z.string().describe('Table name'),
sampleSize: z.number().default(100).describe('Number of rows to sample'),
columns: z.array(z.string()).optional().describe('Columns to include (all if not specified)'),
seed: z.number().optional().describe('Random seed for reproducibility'),
where: z.string().optional().describe('Optional WHERE clause condition')
});
// =============================================================================
// Tool Creation Functions
// =============================================================================
/**
* Calculate descriptive statistics
*/
export function createDescriptiveStatsTool(adapter: MySQLAdapter): ToolDefinition {
return {
name: 'mysql_stats_descriptive',
title: 'MySQL Descriptive Statistics',
description: 'Calculate descriptive statistics (mean, median, stddev, min, max, count) for a numeric column.',
group: 'stats',
inputSchema: DescriptiveStatsSchema,
requiredScopes: ['read'],
annotations: {
readOnlyHint: true,
idempotentHint: true
},
handler: async (params: unknown, _context: RequestContext) => {
const { table, column, where } = DescriptiveStatsSchema.parse(params);
// Validate identifiers
if (!/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(table)) {
throw new Error('Invalid table name');
}
if (!/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(column)) {
throw new Error('Invalid column name');
}
const whereClause = where ? `WHERE ${where}` : '';
// Get basic count for median calculation
const countResult = await adapter.executeQuery(
`SELECT COUNT(*) as count FROM \`${table}\` ${whereClause}`
);
const totalCount = (countResult.rows?.[0])?.['count'] as number ?? 0;
if (totalCount === 0) {
return {
column,
count: 0,
mean: null,
median: null,
stddev: null,
variance: null,
min: null,
max: null,
range: null,
sum: null
};
}
// Calculate median offset/limit
const limit = 2 - (totalCount % 2);
const offset = Math.floor((totalCount - 1) / 2);
const medianQuery = `
SELECT AVG(val) as median
FROM (
SELECT \`${column}\` as val
FROM \`${table}\`
${whereClause}
ORDER BY \`${column}\`
LIMIT ${String(limit)}
OFFSET ${String(offset)}
) as median_calc
`;
const query = `
SELECT
COUNT(\`${column}\`) as count,
AVG(\`${column}\`) as mean,
STD(\`${column}\`) as stddev,
VAR_POP(\`${column}\`) as variance,
MIN(\`${column}\`) as min,
MAX(\`${column}\`) as max,
MAX(\`${column}\`) - MIN(\`${column}\`) as \`range\`,
SUM(\`${column}\`) as sum
FROM \`${table}\`
${whereClause}
`;
const [statsResult, medianResult] = await Promise.all([
adapter.executeQuery(query),
adapter.executeQuery(medianQuery)
]);
const stats = statsResult.rows?.[0];
const medianRow = medianResult.rows?.[0];
return {
column,
count: stats?.['count'] ?? 0,
mean: stats?.['mean'] ?? null,
median: medianRow?.['median'] ?? null,
stddev: stats?.['stddev'] ?? null,
variance: stats?.['variance'] ?? null,
min: stats?.['min'] ?? null,
max: stats?.['max'] ?? null,
range: stats?.['range'] ?? null,
sum: stats?.['sum'] ?? null
};
}
};
}
/**
* Calculate percentiles
*/
export function createPercentilesTool(adapter: MySQLAdapter): ToolDefinition {
return {
name: 'mysql_stats_percentiles',
title: 'MySQL Percentiles',
description: 'Calculate percentile values for a numeric column.',
group: 'stats',
inputSchema: PercentilesSchema,
requiredScopes: ['read'],
annotations: {
readOnlyHint: true,
idempotentHint: true
},
handler: async (params: unknown, _context: RequestContext) => {
const { table, column, percentiles, where } = PercentilesSchema.parse(params);
// Validate identifiers
if (!/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(table)) {
throw new Error('Invalid table name');
}
if (!/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(column)) {
throw new Error('Invalid column name');
}
const whereClause = where ? `WHERE ${where}` : '';
// Get total count
const countResult = await adapter.executeQuery(
`SELECT COUNT(*) as cnt FROM \`${table}\` ${whereClause}`
);
const totalCount = (countResult.rows?.[0])?.['cnt'] as number ?? 0;
if (totalCount === 0) {
return {
column,
totalCount: 0,
percentiles: {}
};
}
// Calculate each percentile
const percentileResults: Record<string, unknown> = {};
for (const p of percentiles) {
const offset = Math.floor((p / 100) * (totalCount - 1));
const query = `
SELECT \`${column}\` as value
FROM \`${table}\`
${whereClause}
ORDER BY \`${column}\`
LIMIT 1 OFFSET ${String(offset)}
`;
const result = await adapter.executeQuery(query);
const valueRow = result.rows?.[0];
percentileResults[`p${String(p)}`] = valueRow?.['value'] ?? null;
}
return {
column,
totalCount,
percentiles: percentileResults
};
}
};
}
/**
* Analyze data distribution
*/
export function createDistributionTool(adapter: MySQLAdapter): ToolDefinition {
return {
name: 'mysql_stats_distribution',
title: 'MySQL Distribution Analysis',
description: 'Analyze the distribution of values in a column with histogram buckets.',
group: 'stats',
inputSchema: DistributionSchema,
requiredScopes: ['read'],
annotations: {
readOnlyHint: true,
idempotentHint: true
},
handler: async (params: unknown, _context: RequestContext) => {
const { table, column, buckets, where } = DistributionSchema.parse(params);
// Validate identifiers
if (!/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(table)) {
throw new Error('Invalid table name');
}
if (!/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(column)) {
throw new Error('Invalid column name');
}
const whereClause = where ? `WHERE ${where}` : '';
// Get min/max for bucket calculation
const rangeResult = await adapter.executeQuery(
`SELECT MIN(\`${column}\`) as min_val, MAX(\`${column}\`) as max_val FROM \`${table}\` ${whereClause}`
);
const rangeRow = rangeResult.rows?.[0];
const minVal = rangeRow?.['min_val'] as number ?? 0;
const maxVal = rangeRow?.['max_val'] as number ?? 0;
if (minVal === maxVal) {
return {
column,
distribution: [{ bucket: 0, rangeStart: minVal, rangeEnd: maxVal, count: 1 }],
bucketCount: 1,
minValue: minVal,
maxValue: maxVal
};
}
const bucketSize = (maxVal - minVal) / buckets;
// Generate distribution query with WIDTH_BUCKET emulation
const query = `
SELECT
FLOOR((\`${column}\` - ${String(minVal)}) / ${String(bucketSize)}) as bucket,
COUNT(*) as count,
MIN(\`${column}\`) as bucket_min,
MAX(\`${column}\`) as bucket_max
FROM \`${table}\`
${whereClause}
GROUP BY bucket
ORDER BY bucket
`;
const result = await adapter.executeQuery(query);
// Format buckets with proper ranges
const distribution = (result.rows ?? []).map((row) => {
const r = row;
const bucketNum = r['bucket'] as number;
return {
bucket: bucketNum,
rangeStart: minVal + (bucketNum * bucketSize),
rangeEnd: minVal + ((bucketNum + 1) * bucketSize),
count: r['count'] as number,
bucketMin: r['bucket_min'],
bucketMax: r['bucket_max']
};
});
return {
column,
distribution,
bucketCount: buckets,
bucketSize,
minValue: minVal,
maxValue: maxVal
};
}
};
}
/**
* Time series analysis with moving averages
*/
export function createTimeSeriesToolStats(adapter: MySQLAdapter): ToolDefinition {
return {
name: 'mysql_stats_time_series',
title: 'MySQL Time Series Analysis',
description: 'Aggregate and analyze time series data with specified intervals.',
group: 'stats',
inputSchema: TimeSeriesSchema,
requiredScopes: ['read'],
annotations: {
readOnlyHint: true,
idempotentHint: true
},
handler: async (params: unknown, _context: RequestContext) => {
const { table, valueColumn, timeColumn, interval, aggregation, where, limit } = TimeSeriesSchema.parse(params);
// Validate identifiers
if (!/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(table)) {
throw new Error('Invalid table name');
}
if (!/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(valueColumn) || !/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(timeColumn)) {
throw new Error('Invalid column name');
}
let dateFormat: string;
switch (interval) {
case 'minute': dateFormat = '%Y-%m-%d %H:%i:00'; break;
case 'hour': dateFormat = '%Y-%m-%d %H:00:00'; break;
case 'day': dateFormat = '%Y-%m-%d'; break;
case 'week': dateFormat = '%Y-%u'; break;
case 'month': dateFormat = '%Y-%m'; break;
}
const whereClause = where ? `WHERE ${where}` : '';
const aggFunc = aggregation.toUpperCase();
const query = `
SELECT
DATE_FORMAT(\`${timeColumn}\`, '${dateFormat}') as period,
${aggFunc}(\`${valueColumn}\`) as value,
COUNT(*) as data_points,
MIN(\`${valueColumn}\`) as period_min,
MAX(\`${valueColumn}\`) as period_max
FROM \`${table}\`
${whereClause}
GROUP BY period
ORDER BY period DESC
LIMIT ${String(limit)}
`;
const result = await adapter.executeQuery(query);
return {
interval,
aggregation,
valueColumn,
timeColumn,
dataPoints: result.rows ?? [],
count: result.rows?.length ?? 0
};
}
};
}
/**
* Random sampling
*/
export function createSamplingTool(adapter: MySQLAdapter): ToolDefinition {
return {
name: 'mysql_stats_sampling',
title: 'MySQL Random Sampling',
description: 'Get a random sample of rows from a table.',
group: 'stats',
inputSchema: SamplingSchema,
requiredScopes: ['read'],
annotations: {
readOnlyHint: true,
idempotentHint: false // Random results
},
handler: async (params: unknown, _context: RequestContext) => {
const { table, sampleSize, columns, seed, where } = SamplingSchema.parse(params);
// Validate table name
if (!/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(table)) {
throw new Error('Invalid table name');
}
// Validate column names if provided
const columnList = columns !== undefined && columns.length > 0
? columns.map(c => {
if (!/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(c)) {
throw new Error(`Invalid column name: ${c}`);
}
return `\`${c}\``;
}).join(', ')
: '*';
const whereClause = where ? `WHERE ${where}` : '';
// If seed is provided, use it for reproducibility
let query: string;
if (seed !== undefined) {
query = `
SELECT ${columnList}
FROM \`${table}\`
${whereClause}
ORDER BY RAND(${String(seed)})
LIMIT ${String(sampleSize)}
`;
} else {
query = `
SELECT ${columnList}
FROM \`${table}\`
${whereClause}
ORDER BY RAND()
LIMIT ${String(sampleSize)}
`;
}
const result = await adapter.executeQuery(query);
return {
sample: result.rows ?? [],
sampleSize: result.rows?.length ?? 0,
requestedSize: sampleSize,
seed: seed ?? null
};
}
};
}