code-guard-ai

code-guard-ai
src
intelligence

batcher.ts•10.1 KiB

/** * Context Batcher * Aggregates workspace files into optimized chunks for LLM processing */ import { vscode } from '../utils/vscode-compat'; import * as path from 'path'; import * as fs from 'fs'; export interface FileContext { path: string; relativePath: string; content: string; language: string; tokenEstimate: number; } export interface BatchedContext { files: FileContext[]; totalTokens: number; batchIndex: number; } /** * Supported file extensions for compliance scanning */ const SUPPORTED_EXTENSIONS = [ '.ts', '.tsx', '.js', '.jsx', // JavaScript/TypeScript '.py', // Python '.java', // Java '.cs', // C# '.go', // Go '.rb', // Ruby '.php', // PHP '.swift', // Swift '.kt', '.kts', // Kotlin '.rs', // Rust '.sql', // SQL '.yaml', '.yml', // Config files '.json', // JSON configs '.env', '.env.example' // Environment files (critical for secrets) ]; /** * Files/folders to always ignore */ const IGNORE_PATTERNS = [ 'node_modules', '.git', 'dist', 'build', 'out', '.next', '.nuxt', 'coverage', '__pycache__', '.pytest_cache', 'vendor', 'packages', '.vscode', '*.min.js', '*.bundle.js', '*.map', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml' ]; /** * Estimate token count (rough approximation: 1 token ≈ 4 characters) */ function estimateTokens(text: string): number { return Math.ceil(text.length / 4); } /** * Get language identifier from file extension */ function getLanguage(filePath: string): string { const ext = path.extname(filePath).toLowerCase(); const langMap: Record<string, string> = { '.ts': 'typescript', '.tsx': 'typescript', '.js': 'javascript', '.jsx': 'javascript', '.py': 'python', '.java': 'java', '.cs': 'csharp', '.go': 'go', '.rb': 'ruby', '.php': 'php', '.swift': 'swift', '.kt': 'kotlin', '.kts': 'kotlin', '.rs': 'rust', '.sql': 'sql', '.yaml': 'yaml', '.yml': 'yaml', '.json': 'json', '.env': 'env' }; return langMap[ext] || 'text'; } /** * Check if file should be ignored */ function shouldIgnore(filePath: string): boolean { const normalized = filePath.replace(/\\/g, '/').toLowerCase(); return IGNORE_PATTERNS.some(pattern => { if (pattern.startsWith('*')) { return normalized.endsWith(pattern.slice(1)); } return normalized.includes(pattern.toLowerCase()); }); } export class ContextBatcher { private maxTokensPerBatch: number; constructor(maxTokensPerBatch: number = 80000) { // Default to 80k tokens to leave room for system prompt and response this.maxTokensPerBatch = maxTokensPerBatch; } /** * Collect all relevant files from the workspace or specific path */ async collectWorkspaceFiles(rootPath?: string): Promise<FileContext[]> { const files: FileContext[] = []; // Use provided path or fallback to workspace const targetPath = rootPath || vscode.workspace.workspaceFolders?.[0]?.uri.fsPath; if (!targetPath) { return []; } // Strategy 1: External Path (Use FS) const isExternal = rootPath && (!vscode.workspace.workspaceFolders || !rootPath.startsWith(vscode.workspace.workspaceFolders[0].uri.fsPath)); if (isExternal) { try { const filePaths = await this.scanDirectory(targetPath); for (const filePath of filePaths) { const content = fs.readFileSync(filePath, 'utf-8'); // Skip very large files if (content.length > 50000) continue; const relativePath = path.relative(targetPath, filePath).replace(/\\/g, '/'); files.push({ path: filePath, relativePath, content, language: getLanguage(filePath), tokenEstimate: estimateTokens(content) }); } } catch (err) { console.error('Error scanning external directory:', err); } } else { // Strategy 2: Internal Workspace (Use VS Code API) try { // Build glob pattern for supported extensions const extensionPattern = `**/*{${SUPPORTED_EXTENSIONS.join(',')}}`; const uris = await vscode.workspace.findFiles( extensionPattern, '**/node_modules/**' ); for (const uri of uris) { // Logic from before const filePath = uri.fsPath; if (shouldIgnore(filePath)) continue; try { const document = await vscode.workspace.openTextDocument(uri); const content = document.getText(); if (content.length > 50000 || content.trim().length === 0) continue; const relativePath = path.relative(targetPath, filePath); files.push({ path: filePath, relativePath: relativePath.replace(/\\/g, '/'), content, language: getLanguage(filePath), tokenEstimate: estimateTokens(content) }); } catch (e) { console.warn('Failed to read file:', e); } } } catch (err) { console.error('Error using workspace.findFiles:', err); } } return files; } private async scanDirectory(dir: string): Promise<string[]> { const files: string[] = []; try { const entries = await fs.promises.readdir(dir, { withFileTypes: true }); for (const entry of entries) { const fullPath = path.join(dir, entry.name); if (shouldIgnore(fullPath)) continue; if (entry.isDirectory()) { files.push(...await this.scanDirectory(fullPath)); } else if (entry.isFile()) { const ext = path.extname(fullPath).toLowerCase(); if (SUPPORTED_EXTENSIONS.includes(ext)) { files.push(fullPath); } } } } catch (e) { console.warn(`Skipping dir ${dir}:`, e); } return files; } /** * Batch files into chunks that fit within token limits */ batchFiles(files: FileContext[]): BatchedContext[] { const batches: BatchedContext[] = []; let currentBatch: FileContext[] = []; let currentTokens = 0; let batchIndex = 0; // Sort files by importance (env files first, then by size) const sortedFiles = [...files].sort((a, b) => { // Prioritize sensitive files const aIsSensitive = a.relativePath.includes('.env') || a.relativePath.includes('config'); const bIsSensitive = b.relativePath.includes('.env') || b.relativePath.includes('config'); if (aIsSensitive && !bIsSensitive) return -1; if (!aIsSensitive && bIsSensitive) return 1; // Then by token count (smaller first) return a.tokenEstimate - b.tokenEstimate; }); for (const file of sortedFiles) { // If adding this file would exceed the limit, start a new batch if (currentTokens + file.tokenEstimate > this.maxTokensPerBatch && currentBatch.length > 0) { batches.push({ files: currentBatch, totalTokens: currentTokens, batchIndex: batchIndex++ }); currentBatch = []; currentTokens = 0; } // If a single file is too large, it gets its own batch if (file.tokenEstimate > this.maxTokensPerBatch) { // Truncate content to fit const maxChars = this.maxTokensPerBatch * 4; const truncatedContent = file.content.substring(0, maxChars) + '\n// ... [TRUNCATED]'; batches.push({ files: [{ ...file, content: truncatedContent, tokenEstimate: this.maxTokensPerBatch }], totalTokens: this.maxTokensPerBatch, batchIndex: batchIndex++ }); continue; } currentBatch.push(file); currentTokens += file.tokenEstimate; } // Don't forget the last batch if (currentBatch.length > 0) { batches.push({ files: currentBatch, totalTokens: currentTokens, batchIndex: batchIndex }); } return batches; } /** * Format batch for LLM prompt */ formatBatchForPrompt(batch: BatchedContext): string { const parts: string[] = []; for (const file of batch.files) { parts.push(` ### File: ${file.relativePath} \`\`\`${file.language} ${file.content} \`\`\` `); } return parts.join('\n---\n'); } /** * Get summary stats for logging */ getStats(files: FileContext[], batches: BatchedContext[]): string { const totalFiles = files.length; const totalTokens = files.reduce((sum, f) => sum + f.tokenEstimate, 0); const totalBatches = batches.length; return `📊 Batcher Stats: ${totalFiles} files, ~${totalTokens.toLocaleString()} tokens, ${totalBatches} batch(es)`; } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/negraodenio/code-guard-ai'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

batcher.ts•10.1 KiB