Skip to main content
Glama
codeChunking.ts (9.96 kB)
import { readFile } from 'fs/promises';
import { extname } from 'path';
import type { CodeChunk } from '../types/memory-v5.js';

/** A chunk as produced by this module, before persistence fields and the embedding vector exist. */
type ChunkDraft = Omit<CodeChunk, '_id' | 'contentVector' | 'createdAt' | 'updatedAt'>;

/** Maximum number of lines scanned for a function-like chunk. */
const FUNCTION_MAX_LINES = 200;
/** Maximum number of lines scanned for a class-like chunk. */
const CLASS_MAX_LINES = 300;
/** Number of leading file lines inspected for imports / header comments. */
const CONTEXT_SCAN_LINES = 50;
/** Number of characters kept for the whole-file fallback chunk. */
const MODULE_PREVIEW_CHARS = 2000;
/**
 * Lines after the definition line during which a dedented line is NOT, by itself,
 * treated as the end of the chunk (multi-line signatures often dedent, e.g. `): T {`).
 */
const INDENT_GRACE_LINES = 5;

/** A (trimmed) line that starts a new top-level definition. */
const DEFINITION_START = /^(export\s+)?(async\s+)?(function|class|interface|def|func|fn)\s+/;

/** A (trimmed) line that brings another module into scope. `\b` keeps `user`, `fromJson`, ... out. */
const IMPORT_LINE = /^(?:#\s*include|import|from|require|use|using|include)\b/;

/**
 * A (trimmed) line that is only a comment. `#` must be followed by whitespace, `!` or
 * end-of-line so that JavaScript private members such as `#handler = () => {}` still count as code.
 */
const COMMENT_LINE = /^(?:\/\/|\/\*|\*(?:\s|\/|$)|#(?:\s|!|$))/;

/** Function-like definitions; capture group 1 is the name. First match wins. */
const FUNCTION_PATTERNS: readonly RegExp[] = [
  /\bfunction\s+(\w+)/, // JavaScript/TypeScript (covers `export function` and `async function`)
  /(\w+)\s*:\s*function\b/, // Object methods
  /(\w+)\s*=\s*function\b/, // Variable functions
  /(\w+)\s*=\s*\([^)]*\)\s*=>/, // Arrow functions
  /\bdef\s+(\w+)/, // Python
  /\bfunc\s+(\w+)/, // Go
  /\bfn\s+(\w+)/, // Rust
  /\bpublic\s+\w+\s+(\w+)\s*\(/, // Java/C#
  /\bprivate\s+\w+\s+(\w+)\s*\(/, // Java/C#
];

/** Class-like definitions; capture group 1 is the name. First match wins. */
const CLASS_PATTERNS: readonly RegExp[] = [
  /\bclass\s+(\w+)/, // Most languages (covers `export class`)
  /\binterface\s+(\w+)/, // TypeScript/Java/C#
  /\bstruct\s+(\w+)/, // C/Rust
  /\benum\s+(\w+)/, // Various languages
  /\btype\s+(\w+)\s*=/, // TypeScript type aliases
  /\btype\s+(\w+)\s+(?:struct|interface)\b/, // Go type declarations
];

/** Name-based tags: a tag is added when the lower-cased name contains any keyword. */
const NAME_RULES: ReadonlyArray<readonly [string, readonly string[]]> = [
  ['event-handler', ['handle']], // also matches "handler"
  ['middleware', ['middleware']],
  ['controller', ['controller']],
  ['service', ['service']],
  ['repository', ['repo']], // also matches "repository"
  ['error-handler', ['error', 'exception']],
  ['authentication', ['auth', 'login']],
  ['test', ['test', 'spec']],
  ['utility', ['util']], // also matches "utils"
  ['helper', ['helper']],
  ['model', ['model', 'schema']],
  ['router', ['route']], // also matches "router"
  ['api', ['api']],
  ['database', ['db', 'database']],
  ['cache', ['cache']],
  ['queue', ['queue']],
  ['logging', ['log']], // also matches "logger"
  ['configuration', ['config']],
  ['validation', ['validator', 'validate']],
];

/** Content-based tags, evaluated after the error-handling rule, in this order. */
const CONTENT_RULES: ReadonlyArray<readonly [string, readonly string[]]> = [
  ['async', ['async', 'await']],
  ['promise', ['promise']],
  ['module', ['export']], // also matches "module.exports"
  ['dependency', ['import', 'require']],
  ['class-based', ['class']],
  ['functional', ['function']],
  ['typescript', ['interface', 'type']],
];

/**
 * Reads a file and splits it into function/class chunks using regular-expression
 * heuristics (no AST parsing).
 *
 * @param filePath - Path of the file to read (decoded as UTF-8).
 * @param projectId - Copied verbatim onto every chunk.
 * @param codebaseMapId - Copied verbatim onto every chunk.
 * @returns One draft per detected definition, or a single `module` draft when nothing matched.
 * @throws Whatever `readFile` rejects with (missing file, permissions, ...).
 */
export async function chunkFile(
  filePath: string,
  projectId: string,
  codebaseMapId: string
): Promise<ChunkDraft[]> {
  const content = await readFile(filePath, 'utf-8');
  // NOTE(review): this is the time of chunking, not the file's mtime — confirm that is intended.
  const lastModified = new Date();

  return chunkByPatterns(content, filePath, projectId, codebaseMapId, lastModified);
}

/** Width of the leading whitespace of `line`, in characters. */
function indentWidth(line: string): number {
  return /^\s*/.exec(line)?.[0].length ?? 0;
}

/**
 * Heuristically finds where the definition starting at `startIdx` ends.
 *
 * A line ends the chunk when, in order of precedence:
 *  1. at least one `{` has been seen and braces and parentheses are balanced again
 *     (the line is included);
 *  2. the line is non-blank, not the start line, not indented deeper than the start line, and
 *     a. it is past the grace window, or
 *     b. no brace has been seen and no bracket was open when the line began
 *        (indentation-delimited body, e.g. Python), or
 *     c. it starts a new definition
 *     (the line is excluded).
 *
 * Braces inside strings or comments are counted too, so this stays a heuristic.
 *
 * @param lines - All lines of the file.
 * @param startIdx - 0-based index of the definition line.
 * @param maxLines - Upper bound on the number of lines scanned.
 * @returns Exclusive 0-based end index, i.e. the chunk is `lines.slice(startIdx, result)`.
 */
function findSemanticBoundary(lines: string[], startIdx: number, maxLines: number = 200): number {
  const limit = Math.min(lines.length, startIdx + maxLines);
  const baseIndent = indentWidth(lines[startIdx] ?? '');
  let braceDepth = 0;
  let parenDepth = 0;
  let sawBrace = false;

  for (let i = startIdx; i < limit; i++) {
    const line = lines[i] ?? '';
    // A bracket still open from an earlier line means this line continues a signature/expression.
    const openBefore = braceDepth !== 0 || parenDepth !== 0;

    for (const ch of line) {
      switch (ch) {
        case '{':
          braceDepth++;
          sawBrace = true;
          break;
        case '}':
          braceDepth--;
          break;
        case '(':
          parenDepth++;
          break;
        case ')':
          parenDepth--;
          break;
        default:
          break;
      }
    }

    if (sawBrace && braceDepth === 0 && parenDepth === 0) {
      return i + 1;
    }

    // Everything below only concerns non-blank lines at or left of the definition's indent.
    // Nested definitions (deeper indent) must never terminate the enclosing chunk.
    if (i === startIdx || line.trim().length === 0 || indentWidth(line) > baseIndent) {
      continue;
    }

    if (i > startIdx + INDENT_GRACE_LINES) return i;
    if (!sawBrace && !openBefore) return i;
    if (DEFINITION_START.test(line.trim())) return i;
  }

  return limit;
}

/**
 * Collects import-like lines and `//` / `/*` comment lines from the top of a file.
 *
 * @param lines - All lines of the file.
 * @param upToLine - Number of leading lines to inspect.
 * @returns The matching lines (untrimmed), joined with `\n`.
 */
function extractContext(lines: string[], upToLine: number): string {
  return lines
    .slice(0, Math.max(0, upToLine))
    .filter((raw) => {
      const text = raw.trim();
      return IMPORT_LINE.test(text) || text.startsWith('//') || text.startsWith('/*');
    })
    .join('\n');
}

/** Returns capture group 1 of the first pattern that matches `line`, if any. */
function firstCapture(line: string, patterns: readonly RegExp[]): string | undefined {
  for (const pattern of patterns) {
    const name = pattern.exec(line)?.[1];
    if (name) return name;
  }
  return undefined;
}

/**
 * Language-agnostic chunker: every line that looks like a function or class definition
 * starts a chunk that runs to the boundary found by `findSemanticBoundary`.
 * Comment-only lines are ignored so that prose such as "the class starts here" does not
 * create chunks. A line may yield both a function chunk and a class chunk.
 *
 * @returns The detected chunks in file order, or one `module` chunk holding the first
 *          2000 characters when no definition was detected.
 */
function chunkByPatterns(
  content: string,
  filePath: string,
  projectId: string,
  codebaseMapId: string,
  lastModified: Date
): ChunkDraft[] {
  const chunks: ChunkDraft[] = [];
  const lines = content.split('\n');
  const fileContext = extractContext(lines, CONTEXT_SCAN_LINES);
  const extension = extname(filePath);

  // Builds the draft for a definition found on line `index` (0-based).
  const buildChunk = (
    kind: 'function' | 'class',
    name: string,
    index: number,
    maxLines: number
  ): ChunkDraft => {
    const endLine = findSemanticBoundary(lines, index, maxLines);
    const body = lines.slice(index, endLine).join('\n');
    return {
      projectId,
      // NOTE(review): CodeChunk presumably declares a non-string id type here — confirm and convert properly.
      codebaseMapId: codebaseMapId as any,
      filePath,
      lastModified,
      chunk: {
        type: kind,
        name,
        signature: (lines[index] ?? '').trim(),
        content: body,
        context: fileContext,
        startLine: index + 1, // 1-based, inclusive
        endLine, // exclusive 0-based index == 1-based inclusive last line
      },
      searchableText: `${name} ${kind} ${extension}`,
      metadata: {
        dependencies: [],
        exports: [name],
        patterns: detectPatterns(name, body),
        size: endLine - index, // number of lines in the chunk
      },
    };
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i] ?? '';
    if (COMMENT_LINE.test(line.trim())) continue;

    const functionName = firstCapture(line, FUNCTION_PATTERNS);
    if (functionName !== undefined) {
      chunks.push(buildChunk('function', functionName, i, FUNCTION_MAX_LINES));
    }

    const className = firstCapture(line, CLASS_PATTERNS);
    if (className !== undefined) {
      chunks.push(buildChunk('class', className, i, CLASS_MAX_LINES));
    }
  }

  // If no specific chunks were found, fall back to a single file-level chunk.
  if (chunks.length === 0) {
    chunks.push({
      projectId,
      codebaseMapId: codebaseMapId as any,
      filePath,
      lastModified,
      chunk: {
        type: 'module',
        // Accept both POSIX and Windows separators.
        name: filePath.split(/[\\/]/).pop() || filePath,
        signature: '',
        content: content.substring(0, MODULE_PREVIEW_CHARS),
        context: '',
        startLine: 1,
        endLine: lines.length,
      },
      searchableText: `${filePath} file content`,
      metadata: {
        dependencies: [],
        exports: [],
        patterns: [],
        size: lines.length,
      },
    });
  }

  return chunks;
}

/**
 * Derives coarse tags from a definition's name and, optionally, its source text.
 * Matching is case-insensitive substring search, so e.g. "login" also yields `logging`.
 *
 * @param name - Identifier of the function/class.
 * @param content - Source text of the chunk; content-based tags are skipped when empty or omitted.
 * @returns Unique tags: name-based tags first, then content-based tags.
 */
function detectPatterns(name: string, content?: string): string[] {
  const found = new Set<string>();
  const lowerName = name.toLowerCase();

  for (const [tag, keywords] of NAME_RULES) {
    if (keywords.some((keyword) => lowerName.includes(keyword))) found.add(tag);
  }

  if (content) {
    const lowerContent = content.toLowerCase();
    const has = (keyword: string): boolean => lowerContent.includes(keyword);

    if ((has('try') && has('catch')) || has('throw') || has('error')) {
      found.add('error-handling');
    }
    for (const [tag, keywords] of CONTENT_RULES) {
      if (keywords.some(has)) found.add(tag);
    }
  }

  return [...found];
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/romiluz13/memory-engineering-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.