/**
* Zod schema for local_ripgrep tool
* Optimized ripgrep implementation with performance enhancements
*/
import { z } from 'zod';
import { BaseQuerySchema, createBulkQuerySchema } from './baseSchema.js';
import { TOOL_NAMES } from '../constants.js';
/**
 * Tool description for MCP registration.
 *
 * This is a single prompt-style string (runtime text, not a comment): it
 * describes the tool's purpose, output shape, pagination model, workflow
 * modes, filters, key parameters, gotchas and usage examples.
 *
 * The byte-offset example is 0-based: in "Hello 世界 World" each CJK character
 * occupies 3 bytes in UTF-8, so "World" starts at byte 13 but character 9.
 */
export const LOCAL_RIPGREP_DESCRIPTION = `PRIMARY SEARCH - Fast ripgrep pattern matching (STRUCTURED OUTPUT)
PURPOSE: Find functions, classes, patterns in code. Returns structured matches with precise locations.
USE_WHEN: Know code patterns | Need fast discovery | Want advanced regex
DECISION_TREE:
2. UNDERSTAND CODE PATTERNS
└─► RIPGREP (mode=discovery, pattern, type filter)
├─► Found → FETCH_CONTENT (use location.charOffset for precision)
└─► Empty → VIEW_STRUCTURE (explore dirs)
WORKFLOW: Discovery (filesOnly) → Detailed (structured matches) → Read (FETCH_CONTENT with charOffset)
OUTPUT FORMAT (TWO-LEVEL PAGINATION):
Returns structured matches grouped by file:
{
"files": [{
"path": "/absolute/path",
"matchCount": 25, // Total matches in this file
"matches": [/* First 10 matches by default */],
"modifiedTime": "2025-01-04T10:00:00Z",
"pagination": { // Per-file pagination (if more than matchesPerPage)
"currentPage": 1,
"totalPages": 3,
"matchesPerPage": 10,
"hasMore": true
}
}],
"totalMatches": 250,
"totalFiles": 50,
"pagination": { // File-level pagination
"currentPage": 1,
"totalPages": 5,
"filesPerPage": 10,
"totalFiles": 50,
"hasMore": true
}
}
PAGINATION MODEL:
- Files sorted by modification time (most recent first)
- File-level: Control with filesPerPage (default 10) + filePageNumber
- Match-level: Control with matchesPerPage (default 10) per file
- Each file shows first matchesPerPage matches with pagination info
LARGE FILES/CODEBASES:
Use filesOnly first, then FETCH_CONTENT with location.charOffset
Pattern: Discovery → Structured matches → Precise extraction
Match values automatically truncated (default 200 chars, configurable 1-800 via matchContentLength)
⚠️ BYTE OFFSETS: location.charOffset/charLength are BYTE offsets, not character offsets!
- For ASCII files: byte offset = character offset ✓
- For UTF-8 with multi-byte chars (é, 中, emoji): byte offset ≠ character offset
- Example: "Hello 世界 World" → "World" is at byte 13 but character 9
- FETCH_CONTENT uses byte offsets, so integration works directly
MODES:
- discovery: filesOnly=true [fastest, minimal output]
- paginated: filesPerPage=10, matchesPerPage=10
- detailed: contextLines=3, filesPerPage=10, matchesPerPage=20
PATTERN_TYPES:
- smartCase (default): lowercase=case-insensitive, mixed=case-sensitive
- fixedString: literal string, no regex
- perlRegex: PCRE2 with lookahead/backreferences
FILTERS:
- type="ts": File type (preferred)
- include=["*.{ts,tsx}"]: Globs (use {} for better performance)
- excludeDir=["node_modules"]: Skip dirs
KEY_PARAMS:
- filesOnly: Just paths [BEST for discovery]
- maxFiles: Limit total files (1-1000, stops after this many files)
- filesPerPage: Files per page (default 10, max 50)
- filePageNumber: File page number (default 1)
- matchesPerPage: Matches per file (default 10, max 100)
- matchContentLength: Max chars per match (default 200, max 800)
- contextLines: Context included in match value
GOTCHAS:
- Files always sorted by modification time (most recent first)
- Match values truncated automatically (default 200 chars, use matchContentLength to adjust)
- Per-file pagination shows first matchesPerPage matches with info if more available
- multiline mode → very slow
- Separate globs ["*.ts","*.tsx"] → slower than ["*.{ts,tsx}"]
NEXT_STEP:
hasResults → FETCH_CONTENT charOffset (STRONG) | FIND_FILES modifiedWithin (MODERATE)
empty → VIEW_STRUCTURE (MODERATE) | Broaden: noIgnore, hidden
EXAMPLES:
mode="discovery", pattern="validateUser", type="ts" # Find files (sorted by date)
mode="paginated", pattern="TODO:", filePageNumber=2 # Page 2 of files
pattern="export.*function", filesPerPage=5, matchesPerPage=15 # Custom pagination
pattern="(?<=export )\\w+", perlRegex=true # Advanced regex
pattern="import", path="/node_modules", filesOnly=true, maxFiles=50 # Limit large dirs`;
/**
 * Ripgrep search content query schema.
 *
 * Extends BaseQuerySchema with the ripgrep-specific options. Only `pattern`
 * and `path` are required; every other field is optional, and several carry
 * schema-level defaults (smartCase, matchContentLength, lineNumbers,
 * filesPerPage, filePageNumber, matchesPerPage, binaryFiles, includeStats,
 * parseStructured, includeDistribution, includeStructured, sort).
 *
 * Conflicting combinations (e.g. fixedString + perlRegex) are not rejected
 * here; they are reported by validateRipgrepQuery. The `mode` presets
 * described on the `mode` field are the ones applied by applyWorkflowMode.
 */
export const RipgrepQuerySchema = BaseQuerySchema.extend({
  // REQUIRED FIELDS
  pattern: z
    .string()
    .min(1)
    .describe('Regex pattern or string to search (use fixedString for literals)'),
  path: z
    .string()
    .describe('Root directory to search'),

  // WORKFLOW MODE (recommended presets)
  // The preset values listed here must stay in sync with applyWorkflowMode.
  mode: z
    .enum(['discovery', 'paginated', 'detailed'])
    .optional()
    .describe(
      'Search workflow mode (auto-configures optimal settings):\n' +
      ' - "discovery" - Fast file discovery (sets filesOnly=true, minimal output)\n' +
      ' - "paginated" - Content with pagination (sets filesPerPage=10, matchesPerPage=10)\n' +
      ' - "detailed" - Full matches with context (sets contextLines=3, filesPerPage=10, matchesPerPage=20)\n' +
      'NOTE: Manual parameters override mode settings.'
    ),

  // PATTERN MODES (mutually exclusive - validated at runtime)
  fixedString: z
    .boolean()
    .optional()
    .describe('Treat pattern as literal string (faster, prevents regex injection). When enabled, ripgrep uses -F flag which treats all characters literally - no regex interpretation, no escaping needed.'),
  perlRegex: z
    .boolean()
    .optional()
    .describe('Use PCRE2 regex engine (advanced: lookahead, backreferences, named groups)'),

  // CASE SENSITIVITY (smart case recommended)
  smartCase: z
    .boolean()
    .optional()
    .default(true)
    .describe('Smart case: lowercase pattern - case-insensitive, otherwise - case-sensitive (RECOMMENDED default)'),
  caseInsensitive: z
    .boolean()
    .optional()
    .describe('Always case-insensitive (overrides smartCase)'),
  caseSensitive: z
    .boolean()
    .optional()
    .describe('Always case-sensitive (overrides smartCase and caseInsensitive)'),

  // MATCH BEHAVIOR
  wholeWord: z
    .boolean()
    .optional()
    .describe('Match whole words only (equivalent to \\b boundaries)'),
  invertMatch: z
    .boolean()
    .optional()
    .describe('Invert matching: show lines that DON\'T match'),

  // FILE FILTERING (optimized strategies)
  type: z
    .string()
    .optional()
    .describe('File type filter (e.g., "ts", "js", "py", "rust") - PREFERRED over globs for known types. Use rg --type-list to see all types'),
  include: z
    .array(z.string())
    .optional()
    .describe('Include globs. TIP: Use alternatives ["*.{ts,tsx}"] instead of ["*.ts","*.tsx"] for better performance'),
  exclude: z
    .array(z.string())
    .optional()
    .describe('Exclude globs (e.g., ["*.test.*", "*.spec.*"])'),
  excludeDir: z
    .array(z.string())
    .optional()
    .describe('Exclude directories (e.g., ["node_modules", ".git", "dist"])'),

  // IGNORE CONTROL (gitignore behavior)
  noIgnore: z
    .boolean()
    .optional()
    .describe('Don\'t respect .gitignore files (search everything)'),
  hidden: z
    .boolean()
    .optional()
    .describe('Search hidden files and directories (starting with .)'),
  followSymlinks: z
    .boolean()
    .optional()
    .describe('Follow symbolic links (default: false for security)'),

  // OUTPUT CONTROL (critical for performance)
  filesOnly: z
    .boolean()
    .optional()
    .describe('List matching files only (RECOMMENDED for discovery: most token-efficient)'),
  filesWithoutMatch: z
    .boolean()
    .optional()
    .describe('List files WITHOUT matches (inverse of filesOnly)'),
  count: z
    .boolean()
    .optional()
    .describe('Count matches per file (shows "file:count" format)'),
  countMatches: z
    .boolean()
    .optional()
    .describe('Count total matches across all occurrences (vs count which is per-line)'),

  // CONTEXT & LINE CONTROL (semantic: defines WHAT to extract)
  contextLines: z
    .number()
    .int()
    .min(0)
    .max(50)
    .optional()
    .describe('Context lines around matches (0-50). Semantic: defines WHAT to extract. WARNING: Multiplies output significantly! Use charLength for pagination (defines HOW MUCH to return)'),
  beforeContext: z
    .number()
    .int()
    .min(0)
    .max(50)
    .optional()
    .describe('Lines before match (0-50). Semantic parameter.'),
  afterContext: z
    .number()
    .int()
    .min(0)
    .max(50)
    .optional()
    .describe('Lines after match (0-50). Semantic parameter.'),
  matchContentLength: z
    .number()
    .int()
    .min(1)
    .max(800)
    .optional()
    .default(200)
    .describe('Maximum characters per match value (1-800, default 200). Controls truncation of match content for token efficiency.'),
  lineNumbers: z
    .boolean()
    .optional()
    .default(true)
    .describe('Show line numbers (default: true)'),
  column: z
    .boolean()
    .optional()
    .describe('Show column numbers (useful for IDE integration)'),

  // MATCH LIMITING (prevents output explosion)
  maxMatchesPerFile: z
    .number()
    .int()
    .min(1)
    .max(100)
    .optional()
    .describe('Max matches per file (legacy, use matchesPerPage instead)'),
  maxFiles: z
    .number()
    .int()
    .min(1)
    .max(1000)
    .optional()
    .describe('Max files to search (1-1000, stops after this many files with matches)'),

  // TWO-LEVEL PAGINATION (file-level + per-file matches)
  filesPerPage: z
    .number()
    .int()
    .min(1)
    .max(50)
    .optional()
    .default(10)
    .describe('Number of files per page (default 10, max 50). Files are sorted by modification time (most recent first).'),
  filePageNumber: z
    .number()
    .int()
    .min(1)
    .optional()
    .default(1)
    .describe('File page number to retrieve (1-based, default 1). Use with filesPerPage for file pagination.'),
  matchesPerPage: z
    .number()
    .int()
    .min(1)
    .max(100)
    .optional()
    .default(10)
    .describe('Number of matches to show per file (default 10, max 100). Each file shows up to this many matches with pagination info.'),

  // ADVANCED FEATURES (use with caution)
  multiline: z
    .boolean()
    .optional()
    .describe('Enable multiline mode (WARNING: slower, memory-intensive, loads entire file into memory). Only use when pattern genuinely spans lines'),
  multilineDotall: z
    .boolean()
    .optional()
    .describe('Make . match newlines in multiline mode (use with multiline=true)'),
  binaryFiles: z
    .enum(['text', 'without-match', 'binary'])
    .optional()
    .default('without-match')
    .describe('Binary file handling: "text" (search as text), "without-match" (skip, default), "binary" (detect and continue)'),

  // OUTPUT FORMAT & METADATA
  includeStats: z
    .boolean()
    .optional()
    .default(true)
    .describe('Include search statistics (matches, files searched, bytes searched, time). Default: true'),
  jsonOutput: z
    .boolean()
    .optional()
    .describe('Output in JSON format (NDJSON - newline delimited, structured data for programmatic parsing)'),
  vimgrepFormat: z
    .boolean()
    .optional()
    .describe('Output in vim-compatible format (file:line:col:text)'),

  // STRUCTURED DATA (NEW - Enhanced response fields)
  parseStructured: z
    .boolean()
    .optional()
    .default(true)
    .describe('Parse JSON output into structured matches (requires jsonOutput). Default: true'),
  includeDistribution: z
    .boolean()
    .optional()
    .default(true)
    .describe('Calculate and include match distribution across files. Default: true'),
  includeStructured: z
    .boolean()
    .optional()
    .default(false)
    .describe('Include structuredMatches array (verbose, rarely needed). Default: false for efficiency'),

  // PERFORMANCE TUNING
  threads: z
    .number()
    .int()
    .min(1)
    .max(32)
    .optional()
    .describe('Number of threads to use (default: auto-detect based on CPU cores)'),
  mmap: z
    .boolean()
    .optional()
    .describe('Use memory mapping (default: true, faster on large files)'),
  noUnicode: z
    .boolean()
    .optional()
    .describe(
      'Disable Unicode mode for all patterns. ' +
      'PERFORMANCE: Faster searches, but \\w only matches ASCII [a-zA-Z0-9_], not Unicode letters. ' +
      'TRADEOFF: Won\'t match Unicode identifiers (café, 世界, etc.). ' +
      'Useful for pure ASCII codebases or when maximum performance is needed.'
    ),
  encoding: z
    .string()
    .optional()
    .describe(
      'Text encoding to use. ' +
      'Values: "auto" (default, BOM detection), "none" (no encoding detection, raw bytes), ' +
      'or specific encoding like "utf-8", "utf-16le", "iso-8859-1", etc. ' +
      'PERFORMANCE: Using "none" can be 10-30% faster on large files by skipping BOM detection. ' +
      'See: https://encoding.spec.whatwg.org/#names'
    ),

  // SORTING
  sort: z
    .enum(['path', 'modified', 'accessed', 'created'])
    .optional()
    .default('path')
    .describe(
      'Sort results for consistent output. ' +
      'Options: "path" (default), "modified", "accessed", "created". ' +
      '⚠️ PERFORMANCE: Sorting disables parallelism and can be 3-10x slower on large directories. ' +
      'Only use when result order matters (e.g., tests, deterministic output).'
    ),
  sortReverse: z
    .boolean()
    .optional()
    .describe('Reverse sort order'),

  // UTILITY FLAGS
  noMessages: z
    .boolean()
    .optional()
    .describe(
      'Suppress error messages (e.g., permission denied, file too large). ' +
      'Useful for automated scripts where errors are expected and should be silent.'
    ),
  lineRegexp: z
    .boolean()
    .optional()
    .describe(
      'Only show matches for entire lines (equivalent to wrapping pattern with ^...$). ' +
      'Example: pattern "foo" with lineRegexp=true only matches line "foo", not "foobar".'
    ),
  passthru: z
    .boolean()
    .optional()
    .describe(
      'Print all lines, whether they match or not, with matches highlighted. ' +
      'Useful for viewing context while highlighting matches. ' +
      'WARNING: Can produce very large output on large files. Conflicts with filesOnly.'
    ),
  debug: z
    .boolean()
    .optional()
    .describe(
      'Show debug information: why files were ignored, configuration loaded, ' +
      'search strategy used, and performance characteristics. ' +
      'Useful for troubleshooting unexpected results or performance issues. ' +
      'Debug output goes to stderr.'
    ),
});
/**
* Bulk ripgrep search schema (1-10 queries per call)
*
* Produced by passing the tool name and RipgrepQuerySchema to
* createBulkQuerySchema. The tool name falls back to the literal
* 'local_ripgrep' when TOOL_NAMES.LOCAL_RIPGREP is falsy.
* NOTE(review): the 1-10 query limit is enforced (if at all) inside
* createBulkQuerySchema, which is defined in ./baseSchema.js — confirm there.
*/
export const BulkRipgrepQuerySchema = createBulkQuerySchema(
TOOL_NAMES.LOCAL_RIPGREP || 'local_ripgrep',
RipgrepQuerySchema
);
// Type of a single ripgrep query, inferred from RipgrepQuerySchema.
export type RipgrepQuery = z.infer<typeof RipgrepQuerySchema>;
// Type of a bulk request, inferred from BulkRipgrepQuerySchema.
export type BulkRipgrepQuery = z.infer<typeof BulkRipgrepQuerySchema>;
/**
 * Apply workflow mode presets to a query.
 *
 * Each mode contributes a small set of preset values; a preset is used only
 * for fields the caller left unset. A field counts as unset when its value is
 * `undefined` - including when the key is present but explicitly `undefined`
 * (a plain object spread would let such a key erase the preset). Any defined
 * value, including `false` and `0`, is treated as explicit and kept.
 *
 * @param query - The ripgrep query, optionally carrying a `mode`.
 * @returns The same `query` object when no mode is set; otherwise a new
 *          object with the mode presets filled in. The input is not mutated.
 */
export function applyWorkflowMode(query: RipgrepQuery): RipgrepQuery {
  if (!query.mode) {
    return query;
  }

  const modeDefaults: Partial<RipgrepQuery> = {};
  switch (query.mode) {
    case 'discovery':
      // Workflow A: Fast file discovery (25x more efficient)
      modeDefaults.filesOnly = true;
      modeDefaults.smartCase = true;
      break;
    case 'paginated':
      // Workflow B: Paginated content with sensible limits
      modeDefaults.filesPerPage = 10;
      modeDefaults.matchesPerPage = 10;
      modeDefaults.smartCase = true;
      break;
    case 'detailed':
      // Full matches with context
      modeDefaults.contextLines = 3;
      modeDefaults.filesPerPage = 10;
      modeDefaults.matchesPerPage = 20;
      modeDefaults.smartCase = true;
      break;
  }

  // Start from a copy of the caller's query and fill in presets only where the
  // caller supplied no defined value, so explicit parameters always win.
  const merged: Record<string, unknown> = { ...query };
  for (const [field, preset] of Object.entries(modeDefaults)) {
    if (merged[field] === undefined) {
      merged[field] = preset;
    }
  }
  return merged as RipgrepQuery;
}
/**
 * Validation helper: check a query for common misconfigurations.
 *
 * Errors are pushed for option pairs that cannot be combined (fixedString +
 * perlRegex, filesOnly + filesWithoutMatch, passthru + filesOnly). Warnings
 * cover combinations that work but are redundant, costly, or could be
 * expressed better (multiple case modes, large context, multiline, separate
 * simple globs, a glob that corresponds to a ripgrep file type, ...).
 *
 * @param query - The ripgrep query to inspect. It is only read, never changed.
 * @returns `isValid` (true when no errors were recorded) together with the
 *          collected `warnings` and `errors` messages.
 */
export function validateRipgrepQuery(query: RipgrepQuery): {
  isValid: boolean;
  warnings: string[];
  errors: string[];
} {
  const warnings: string[] = [];
  const errors: string[] = [];

  // Mutual exclusivity checks
  if (query.fixedString && query.perlRegex) {
    errors.push('fixedString and perlRegex are mutually exclusive. Choose one.');
  }
  if (query.filesOnly && query.count) {
    warnings.push('filesOnly and count are mutually exclusive. Using filesOnly.');
  }
  if (query.filesOnly && query.filesWithoutMatch) {
    errors.push('filesOnly and filesWithoutMatch are mutually exclusive. Choose one.');
  }
  if (query.passthru && query.filesOnly) {
    errors.push('passthru and filesOnly are mutually exclusive.');
  }
  if (query.passthru) {
    warnings.push(
      'passthru prints ALL lines from matched files. ' +
      'This can produce very large output. Consider using context lines instead.'
    );
  }
  if (query.lineRegexp && query.wholeWord) {
    warnings.push('lineRegexp and wholeWord both specified. lineRegexp takes precedence.');
  }

  // Case sensitivity
  const caseModes = [query.caseInsensitive, query.caseSensitive, query.smartCase].filter(Boolean);
  if (caseModes.length > 1) {
    warnings.push('Multiple case sensitivity modes specified. Priority: caseSensitive > caseInsensitive > smartCase');
  }

  // Context: warn when any context setting exceeds 2 lines, and report the
  // largest one (the value that actually crossed the threshold) rather than
  // whichever setting happens to be listed first.
  const widestContext = Math.max(
    query.contextLines ?? 0,
    query.beforeContext ?? 0,
    query.afterContext ?? 0
  );
  if (widestContext > 2) {
    const contentLength = query.matchContentLength ?? 200;
    warnings.push(
      `Context lines enabled (${widestContext} lines). ` +
      `Match values will include context and be truncated to ${contentLength} chars. Use matchesPerPage for pagination.`
    );
  }

  if (query.multiline) {
    warnings.push(
      'Multiline mode is memory-intensive and slower. ' +
      'Entire files are loaded into memory. Only use when pattern genuinely spans multiple lines.'
    );
  }
  if (query.perlRegex && !query.noUnicode && query.multiline) {
    warnings.push(
      'PERFORMANCE TIP: For fastest PCRE2 multiline searches on ASCII codebases, ' +
      'consider using noUnicode=true (2-3x faster).'
    );
  }
  if (!query.filesOnly && !query.count && !query.maxMatchesPerFile) {
    warnings.push(
      'No output limiting specified. Consider setting maxMatchesPerFile (default: 3) to control output size.'
    );
  }

  // Several plain "*.ext" globs can be merged into one brace glob. The
  // pattern below already excludes braces, so no separate "{" check is needed.
  if (query.include && query.include.length > 1) {
    const simpleGlob = /^\*\.[a-zA-Z0-9]+$/;
    if (query.include.every(glob => simpleGlob.test(glob))) {
      const exts = query.include.map(glob => glob.slice(2)).join(',');
      warnings.push(
        `TIP: Consolidate globs for better performance: include=["*.{${exts}}"] instead of separate globs.`
      );
    }
  }

  // Suggest a ripgrep file type when the first include glob is a plain
  // "*.ext". Keys are file extensions, values are ripgrep type names; they
  // differ for Rust ("*.rs" -> type "rust"). A Map avoids accidental hits on
  // Object.prototype keys such as "constructor".
  if (query.include && !query.type) {
    const extensionToType = new Map<string, string>([
      ['ts', 'ts'],
      ['js', 'js'],
      ['py', 'py'],
      ['rs', 'rust'],
      ['rust', 'rust'],
      ['go', 'go'],
      ['java', 'java'],
      ['cpp', 'cpp'],
      ['c', 'c'],
    ]);
    const extension = query.include[0]?.match(/^\*\.([a-z]+)$/)?.[1];
    const suggestedType = extension === undefined ? undefined : extensionToType.get(extension);
    if (suggestedType) {
      warnings.push(
        `TIP: Use type="${suggestedType}" instead of include glob for cleaner syntax.`
      );
    }
  }

  return {
    isValid: errors.length === 0,
    warnings,
    errors,
  };
}
/**
 * Helper: rough match-count estimate used to recommend pagination.
 *
 * The estimate is `fileCount` multiplied by a per-file figure: the query's
 * `maxMatchesPerFile` when it is truthy, otherwise 3. Pagination is flagged
 * once the estimate exceeds 100.
 *
 * @param query - The ripgrep query (reads `maxMatchesPerFile` and `filesOnly`).
 * @param fileCount - Number of files expected to contain matches.
 * @returns The estimate, whether pagination is advised, and a recommendation
 *          message (empty when there is nothing to recommend).
 */
export function estimateMatchCount(query: RipgrepQuery, fileCount: number): {
  estimatedMatches: number;
  needsPagination: boolean;
  recommendation: string;
} {
  const perFileEstimate = query.maxMatchesPerFile || 3;
  const total = fileCount * perFileEstimate;
  const exceedsThreshold = total > 100;

  // The pagination hint takes priority over the filesOnly note.
  const pickRecommendation = (): string => {
    if (exceedsThreshold) {
      return `Estimated ${total} matches. Use matchesPerPage parameter for pagination.`;
    }
    if (query.filesOnly) {
      return 'Using filesOnly mode - optimal for discovery (~25x more token-efficient)';
    }
    return '';
  };

  return {
    estimatedMatches: total,
    needsPagination: exceedsThreshold,
    recommendation: pickRecommendation(),
  };
}