import { readFile } from "fs/promises"
import { createHash } from "crypto"
import * as path from "path"
import * as treeSitter from "web-tree-sitter"
import { LanguageParser, loadRequiredLanguageParsers } from "../../tree-sitter/languageParser"
import { parseMarkdown } from "../../tree-sitter/markdownParser"
import { ICodeParser, CodeBlock, ParentContainer } from "../interfaces"
import { scannerExtensions, shouldUseFallbackChunking } from "../shared/supported-extensions"
import { MAX_BLOCK_CHARS, MIN_BLOCK_CHARS, MIN_CHUNK_REMAINDER_CHARS, MAX_CHARS_TOLERANCE_FACTOR } from "../constants"
/**
* Node types that represent "leaf" definitions - functions/methods whose
* children should not be drilled into when the node is oversized. Instead,
* they are chunked by lines so the full implementation is preserved.
*
* Container types like class_declaration, module, and namespace are NOT
* included here; those continue to drill down and process their members.
*/
const LEAF_DEFINITION_TYPES = new Set([
// JavaScript/TypeScript
'function_declaration', // also Go
'function_definition', // also Python, C/C++, PHP
'method_definition',
'arrow_function',
'function_expression',
// Go / Java / C# / PHP
'method_declaration',
// Rust
'function_item',
// Java / C#
'constructor_declaration',
// Ruby
'method',
])
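// Illustrative walkthrough (hypothetical sizes): an oversized class_declaration
// is not in this set, so the queue below drills into its children; an oversized
// method_definition IS in the set, so it is chunked by lines as a whole and every
// chunk keeps the method's identifier and parent chain.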
/**
* Markdown header information for building parent chains
*/
interface MarkdownHeader {
level: number
text: string
line: number
}
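// Illustrative (the exact text depends on parseMarkdown's captures): a "## Setup"
// header on line 12 would be represented as { level: 2, text: "Setup", line: 12 }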
/**
* Implementation of the code parser interface
*/
export class CodeParser implements ICodeParser {
private loadedParsers: LanguageParser = {}
private pendingLoads: Map<string, Promise<LanguageParser>> = new Map()
// Markdown files are supported via the custom markdown parser,
// which extracts headers and sections for semantic indexing
/**
* Parses a code file into code blocks
* @param filePath Path to the file to parse
* @param options Optional parsing options
* @returns Promise resolving to array of code blocks
*/
async parseFile(
filePath: string,
options?: {
content?: string
fileHash?: string
},
): Promise<CodeBlock[]> {
// Get file extension
const ext = path.extname(filePath).toLowerCase()
// Skip if not a supported language
if (!this.isSupportedLanguage(ext)) {
return []
}
// Get file content
let content: string
let fileHash: string
if (options?.content !== undefined) {
content = options.content
fileHash = options.fileHash || this.createFileHash(content)
} else {
try {
content = await readFile(filePath, "utf8")
fileHash = this.createFileHash(content)
} catch (error) {
console.error(`Error reading file ${filePath}:`, error)
return []
}
}
// Parse the file
return this.parseContent(filePath, content, fileHash)
}
/**
* Checks if a language is supported
* @param extension File extension
* @returns Boolean indicating if the language is supported
*/
private isSupportedLanguage(extension: string): boolean {
return scannerExtensions.includes(extension)
}
/**
* Creates a hash for a file
* @param content File content
* @returns Hash string
*/
private createFileHash(content: string): string {
return createHash("sha256").update(content).digest("hex")
}
/**
* Parses file content into code blocks
* @param filePath Path to the file
* @param content File content
* @param fileHash File hash
* @returns Array of code blocks
*/
private async parseContent(filePath: string, content: string, fileHash: string): Promise<CodeBlock[]> {
const ext = path.extname(filePath).slice(1).toLowerCase()
const seenSegmentHashes = new Set<string>()
// Handle markdown files specially
if (ext === "md" || ext === "markdown") {
return this.parseMarkdownContent(filePath, content, fileHash, seenSegmentHashes)
}
// Check if this extension should use fallback chunking
if (shouldUseFallbackChunking(`.${ext}`)) {
return this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes)
}
// Check if we already have the parser loaded
if (!this.loadedParsers[ext]) {
const pendingLoad = this.pendingLoads.get(ext)
if (pendingLoad) {
try {
await pendingLoad
} catch (error) {
console.error(`Error in pending parser load for ${filePath}:`, error)
return []
}
} else {
const loadPromise = loadRequiredLanguageParsers([filePath])
this.pendingLoads.set(ext, loadPromise)
try {
const newParsers = await loadPromise
if (newParsers) {
this.loadedParsers = { ...this.loadedParsers, ...newParsers }
}
} catch (error) {
console.error(`Error loading language parser for ${filePath}:`, error)
return []
} finally {
this.pendingLoads.delete(ext)
}
}
}
const language = this.loadedParsers[ext]
if (!language) {
console.warn(`No parser available for file extension: ${ext}`)
return []
}
const tree = language.parser.parse(content)
// The compiled query is already attached to the loaded language object,
// so there is no need to fetch the query string from languageQueries
const captures = language.query.captures(tree.rootNode)
// Check if captures are empty
if (!captures || captures.length === 0) {
if (content.length >= MIN_BLOCK_CHARS) {
// Perform fallback chunking if the content is large enough
return this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes)
} else {
// Content is too small for fallback chunking
return []
}
}
const results: CodeBlock[] = []
// Process captures if not empty - build a map to track node identifiers
const nodeIdentifierMap = new Map<treeSitter.SyntaxNode, string>()
// Extract identifiers from captures
for (const capture of captures) {
if (capture.name === "name" || capture.name === "property.name.definition") {
// Find the *closest* definition node that fully contains this name node.
// When multiple definition nodes match (e.g. class + method), we prefer the
// one with the smallest span so that method names don't get attached to
// the outer class/container.
const candidateDefinitions = captures.filter((c) => {
if (!c.name.includes("definition")) return false
const defNode = c.node
const nameNode = capture.node
if (!defNode || !nameNode) return false
return (
defNode.startPosition.row <= nameNode.startPosition.row &&
defNode.endPosition.row >= nameNode.endPosition.row
)
})
if (candidateDefinitions.length > 0) {
const definitionCapture = candidateDefinitions.reduce((best, current) => {
const bestSpan = best.node.endPosition.row - best.node.startPosition.row
const currentSpan = current.node.endPosition.row - current.node.startPosition.row
return currentSpan < bestSpan ? current : best
})
// For JSON properties, remove quotes from the identifier
let identifier = capture.node.text
if (
capture.name === "property.name.definition" &&
identifier.startsWith("\"") &&
identifier.endsWith("\"")
) {
identifier = identifier.slice(1, -1)
}
nodeIdentifierMap.set(definitionCapture.node, identifier)
}
}
}
const queue: treeSitter.SyntaxNode[] = captures
.filter((capture) => capture.name.includes('definition'))
.map((capture) => capture.node)
while (queue.length > 0) {
const currentNode = queue.shift()!
// Check if the node meets the minimum character requirement
if (currentNode.text && currentNode.text.length >= MIN_BLOCK_CHARS) {
// If it also exceeds the maximum character limit, try to break it down
if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
// Check if this is a "leaf" definition (function/method) that should not drill down
const isLeafDefinition = LEAF_DEFINITION_TYPES.has(currentNode.type)
if (isLeafDefinition) {
// For functions/methods: chunk by lines instead of drilling down to children
// This ensures implementation is captured, not just docstrings
const chunkedBlocks = this._chunkDefinitionNodeByLines(
currentNode,
filePath,
fileHash,
seenSegmentHashes,
nodeIdentifierMap,
)
results.push(...chunkedBlocks)
} else if (currentNode.children && currentNode.children.length > 0) {
// For containers (classes, modules): drill down to process members
queue.push(...currentNode.children)
} else {
// For other leaf nodes: chunk by lines
const chunkedBlocks = this._chunkLeafNodeByLines(
currentNode,
filePath,
fileHash,
seenSegmentHashes,
nodeIdentifierMap,
)
results.push(...chunkedBlocks)
}
} else {
// Node meets min chars and is within max chars, create a block
const identifier = nodeIdentifierMap.get(currentNode) ||
currentNode.childForFieldName("name")?.text ||
currentNode.children?.find((c) => c.type === "identifier")?.text ||
null
const type = currentNode.type
const start_line = currentNode.startPosition.row + 1
const end_line = currentNode.endPosition.row + 1
const content = currentNode.text
const contentPreview = content.slice(0, 100)
const segmentHash = createHash("sha256")
.update(`${filePath}-${start_line}-${end_line}-${content.length}-${contentPreview}`)
.digest("hex")
if (!seenSegmentHashes.has(segmentHash)) {
seenSegmentHashes.add(segmentHash)
// Build parent chain and hierarchy display
const parentChain = this.buildParentChain('tree-sitter', currentNode, nodeIdentifierMap)
const hierarchyDisplay = this.buildHierarchyDisplay(parentChain, identifier, type)
results.push({
file_path: filePath,
identifier,
type,
start_line,
end_line,
content,
segmentHash,
fileHash,
chunkSource: 'tree-sitter',
parentChain,
hierarchyDisplay,
})
}
}
}
// Nodes smaller than MIN_BLOCK_CHARS are ignored
}
return this.deduplicateBlocks(results)
}
/**
* Common helper function to chunk text by lines, avoiding tiny remainders.
*/
private _chunkTextByLines(
lines: string[],
filePath: string,
fileHash: string,
chunkType: string,
seenSegmentHashes: Set<string>,
baseStartLine: number = 1, // 1-based start line of the *first* line in the `lines` array
options?: {
/**
* Optional identifier (e.g. markdown header text) to attach to all
* produced chunks. When omitted, `identifier` defaults to null.
*/
identifier?: string | null
/**
* Optional parent chain describing the logical hierarchy for the
* chunks (used primarily for markdown sections).
*/
parentChain?: ParentContainer[]
/**
* Optional pre-computed hierarchy display string.
*/
hierarchyDisplay?: string | null
/**
* Optional override for the chunkSource. When omitted we keep the
* existing defaults of 'fallback' and 'line-segment'.
*/
chunkSourceOverride?: CodeBlock["chunkSource"]
},
): CodeBlock[] {
const identifier = options?.identifier ?? null
const parentChain = options?.parentChain ?? []
const hierarchyDisplay = options?.hierarchyDisplay ?? null
const chunkSourceOverride = options?.chunkSourceOverride
const chunks: CodeBlock[] = []
let currentChunkLines: string[] = []
let currentChunkLength = 0
let chunkStartLineIndex = 0 // 0-based index within the `lines` array
const effectiveMaxChars = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
const finalizeChunk = (endLineIndex: number) => {
if (currentChunkLength >= MIN_BLOCK_CHARS && currentChunkLines.length > 0) {
const chunkContent = currentChunkLines.join("\n")
const startLine = baseStartLine + chunkStartLineIndex
const endLine = baseStartLine + endLineIndex
const contentPreview = chunkContent.slice(0, 100)
const segmentHash = createHash("sha256")
.update(`${filePath}-${startLine}-${endLine}-${chunkContent.length}-${contentPreview}`)
.digest("hex")
if (!seenSegmentHashes.has(segmentHash)) {
seenSegmentHashes.add(segmentHash)
chunks.push({
file_path: filePath,
identifier,
type: chunkType,
start_line: startLine,
end_line: endLine,
content: chunkContent,
segmentHash,
fileHash,
chunkSource: chunkSourceOverride ?? "fallback",
parentChain,
hierarchyDisplay,
})
}
}
currentChunkLines = []
currentChunkLength = 0
chunkStartLineIndex = endLineIndex + 1
}
const createSegmentBlock = (segment: string, originalLineNumber: number, startCharIndex: number) => {
const segmentPreview = segment.slice(0, 100)
const segmentHash = createHash("sha256")
.update(
`${filePath}-${originalLineNumber}-${originalLineNumber}-${startCharIndex}-${segment.length}-${segmentPreview}`,
)
.digest("hex")
if (!seenSegmentHashes.has(segmentHash)) {
seenSegmentHashes.add(segmentHash)
chunks.push({
file_path: filePath,
identifier,
type: `${chunkType}_segment`,
start_line: originalLineNumber,
end_line: originalLineNumber,
content: segment,
segmentHash,
fileHash,
chunkSource: chunkSourceOverride ?? "line-segment",
parentChain,
hierarchyDisplay,
})
}
}
for (let i = 0; i < lines.length; i++) {
const line = lines[i]
const lineLength = line.length + (i < lines.length - 1 ? 1 : 0) // +1 for newline, except last line
const originalLineNumber = baseStartLine + i
// Handle oversized lines (longer than effectiveMaxChars)
// We allow some tolerance for chunk sizing, and only split single lines
// when they exceed the tolerated max.
if (lineLength > effectiveMaxChars) {
// Finalize any existing normal chunk before processing the oversized line
if (currentChunkLines.length > 0) {
finalizeChunk(i - 1)
}
// Split the oversized line into segments
let remainingLineContent = line
let currentSegmentStartChar = 0
while (remainingLineContent.length > 0) {
const segment = remainingLineContent.substring(0, MAX_BLOCK_CHARS)
remainingLineContent = remainingLineContent.substring(MAX_BLOCK_CHARS)
createSegmentBlock(segment, originalLineNumber, currentSegmentStartChar)
currentSegmentStartChar += MAX_BLOCK_CHARS
}
continue
}
// Handle normally sized lines
if (currentChunkLength > 0 && currentChunkLength + lineLength > effectiveMaxChars) {
// Re-balancing Logic
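// (hypothetical values) If finalizing at line i-1 would leave a remainder
// shorter than MIN_CHUNK_REMAINDER_CHARS, walk the split point backwards
// until the finalized chunk stays >= MIN_BLOCK_CHARS and the remainder
// reaches >= MIN_CHUNK_REMAINDER_CHARS, avoiding a tiny trailing chunk.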
let splitIndex = i - 1
let remainderLength = 0
for (let j = i; j < lines.length; j++) {
remainderLength += lines[j].length + (j < lines.length - 1 ? 1 : 0)
}
if (
currentChunkLength >= MIN_BLOCK_CHARS &&
remainderLength < MIN_CHUNK_REMAINDER_CHARS &&
currentChunkLines.length > 1
) {
for (let k = i - 2; k >= chunkStartLineIndex; k--) {
const potentialChunkLines = lines.slice(chunkStartLineIndex, k + 1)
const potentialChunkLength = potentialChunkLines.join("\n").length + 1
const potentialNextChunkLines = lines.slice(k + 1)
const potentialNextChunkLength = potentialNextChunkLines.join("\n").length + 1
if (
potentialChunkLength >= MIN_BLOCK_CHARS &&
potentialNextChunkLength >= MIN_CHUNK_REMAINDER_CHARS
) {
splitIndex = k
break
}
}
}
finalizeChunk(splitIndex)
if (i === chunkStartLineIndex) {
// Default split: the current line starts the next chunk
currentChunkLines.push(line)
currentChunkLength += lineLength
} else {
// Re-balancing moved the split earlier; rewind so the loop re-processes
// the lines that were pulled back out of the finalized chunk (the previous
// `i >= chunkStartLineIndex` check silently dropped them)
i = chunkStartLineIndex - 1
continue
}
} else {
currentChunkLines.push(line)
currentChunkLength += lineLength
}
}
// Process the last remaining chunk
if (currentChunkLines.length > 0) {
finalizeChunk(lines.length - 1)
}
return chunks
}
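/**
* Chunks an entire file by lines when no language parser or query captures are available
*/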
private _performFallbackChunking(
filePath: string,
content: string,
fileHash: string,
seenSegmentHashes: Set<string>,
): CodeBlock[] {
const lines = content.split("\n")
return this._chunkTextByLines(lines, filePath, fileHash, "fallback_chunk", seenSegmentHashes)
}
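/**
* Chunks an oversized non-definition leaf node (e.g. string_content) by lines,
* preserving the enclosing parent chain for context
*/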
private _chunkLeafNodeByLines(
node: treeSitter.SyntaxNode,
filePath: string,
fileHash: string,
seenSegmentHashes: Set<string>,
nodeIdentifierMap: Map<treeSitter.SyntaxNode, string>
): CodeBlock[] {
if (!node.text) {
console.warn(`Node text is undefined for ${node.type} in ${filePath}`)
return []
}
const lines = node.text.split("\n")
const baseStartLine = node.startPosition.row + 1
// Build parent chain and hierarchy display to preserve context
// For non-definition nodes (like string_content), we still want to show
// which class/function they belong to
const parentChain = this.buildParentChain('tree-sitter', node, nodeIdentifierMap)
const identifier = null // Leaf nodes like string_content don't have their own identifier
const type = node.type
const hierarchyDisplay = this.buildHierarchyDisplay(parentChain, identifier, type)
return this._chunkTextByLines(
lines,
filePath,
fileHash,
type,
seenSegmentHashes,
baseStartLine,
{
identifier,
parentChain,
hierarchyDisplay,
chunkSourceOverride: 'tree-sitter'
}
)
}
/**
* Chunks a definition node (function/method) by lines while preserving metadata.
* This method is used for oversized leaf definition nodes to ensure their entire
* implementation is captured, not just docstrings or large child nodes.
*/
private _chunkDefinitionNodeByLines(
node: treeSitter.SyntaxNode,
filePath: string,
fileHash: string,
seenSegmentHashes: Set<string>,
nodeIdentifierMap: Map<treeSitter.SyntaxNode, string>
): CodeBlock[] {
if (!node.text) {
console.warn(`Node text is undefined for ${node.type} in ${filePath}`)
return []
}
const lines = node.text.split("\n")
const baseStartLine = node.startPosition.row + 1
// Extract definition metadata to preserve across all chunks
const identifier = nodeIdentifierMap.get(node) ||
node.childForFieldName("name")?.text ||
node.children?.find((c) => c.type === "identifier")?.text ||
null
const type = node.type
const parentChain = this.buildParentChain('tree-sitter', node, nodeIdentifierMap)
const hierarchyDisplay = this.buildHierarchyDisplay(parentChain, identifier, type)
// Call line chunking with metadata so all chunks share the same hierarchy
return this._chunkTextByLines(
lines,
filePath,
fileHash,
type,
seenSegmentHashes,
baseStartLine,
{
identifier,
parentChain,
hierarchyDisplay,
chunkSourceOverride: 'tree-sitter'
}
)
}
/**
* Removes blocks that are contained within other blocks to avoid duplication
*/
private deduplicateBlocks(blocks: CodeBlock[]): CodeBlock[] {
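// Prefer tree-sitter blocks over fallback and line-segment blocks: e.g. a
// fallback chunk covering lines 10-40 is dropped when a tree-sitter block
// spanning lines 1-50 in the same file textually contains it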
const sourceOrder = ['tree-sitter', 'fallback', 'line-segment']
blocks.sort((a, b) =>
sourceOrder.indexOf(a.chunkSource) - sourceOrder.indexOf(b.chunkSource)
)
const result: CodeBlock[] = []
for (const block of blocks) {
const isDuplicate = result.some(existing =>
this.isBlockContained(block, existing)
)
if (!isDuplicate) {
result.push(block)
}
}
return result
}
/**
* Unified entry point for parent-chain construction; dispatches to the
* tree-sitter or markdown implementation based on context
*/
private buildParentChain(
context: 'tree-sitter' | 'markdown',
...args: any[]
): ParentContainer[] {
if (context === 'markdown') {
return this.buildMarkdownParentChain(...args as [MarkdownHeader, MarkdownHeader[]])
} else {
return this.buildTreeSitterParentChain(...args as [treeSitter.SyntaxNode, Map<treeSitter.SyntaxNode, string>])
}
}
/**
* Tree-sitter-specific parent chain construction
*/
private buildTreeSitterParentChain(
node: treeSitter.SyntaxNode,
nodeIdentifierMap: Map<treeSitter.SyntaxNode, string>
): ParentContainer[] {
const parentChain: ParentContainer[] = []
// Container node types that we want to track in the hierarchy
const containerTypes = new Set([
'class_declaration', 'class_definition',
'interface_declaration', 'interface_definition',
'namespace_declaration', 'namespace_definition',
'module_declaration', 'module_definition',
'function_declaration', 'function_definition', 'method_definition',
'object_expression', 'object_pattern',
'object', 'pair', // JSON objects and properties
'program', 'source_file'
])
let currentNode = node.parent
while (currentNode) {
// Skip non-container nodes
if (!containerTypes.has(currentNode.type)) {
currentNode = currentNode.parent
continue
}
// Skip program/source_file as they're too generic
if (currentNode.type === 'program' || currentNode.type === 'source_file') {
currentNode = currentNode.parent
continue
}
// Try to get identifier from various sources
let identifier = nodeIdentifierMap.get(currentNode) || null
if (!identifier) {
// Try to extract identifier from the node structure
identifier = this.extractNodeIdentifier(currentNode)
}
// Only add to chain if we found a meaningful identifier
if (identifier) {
parentChain.unshift({ // Add to beginning to maintain correct order
identifier: identifier,
type: this.normalizeNodeType(currentNode.type)
})
}
currentNode = currentNode.parent
}
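// e.g. for a method node nested in `class Foo`, the resulting chain is
// [{ identifier: "Foo", type: "class" }]; program/source_file roots are skipped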
return parentChain
}
/**
* Markdown-specific parent chain construction.
* Builds a virtual parent/child hierarchy from header levels.
*/
private buildMarkdownParentChain(
currentHeader: MarkdownHeader,
headerStack: MarkdownHeader[]
): ParentContainer[] {
const parentChain: ParentContainer[] = []
// h1 headers have no parent
if (currentHeader.level <= 1) {
return parentChain
}
// Scan from the top of the stack for the nearest enclosing header. Matching
// on any smaller level (rather than exactly level - 1) tolerates documents
// that skip levels, e.g. an h3 directly under an h1.
for (let i = headerStack.length - 1; i >= 0; i--) {
const header = headerStack[i]
if (header.level < currentHeader.level) {
// Found the direct parent; add it to the chain
parentChain.push({
identifier: header.text,
// Use the compact display type to keep hierarchyDisplay short
type: this.getMarkdownDisplayType(header.level),
})
// Recursively collect the parent's own ancestors
const grandParentChain = this.buildMarkdownParentChain(header, headerStack.slice(0, i))
parentChain.unshift(...grandParentChain)
break
}
}
return parentChain
}
/**
* Provides a uniform, compact display type for markdown headers,
* e.g. h1 -> "md_h1"
*/
private getMarkdownDisplayType(level: number): string {
return `md_h${level}`
}
/**
* Extracts identifier from a tree-sitter node using various strategies
*/
private extractNodeIdentifier(node: treeSitter.SyntaxNode): string | null {
// Try field-based extraction first
const nameField = node.childForFieldName("name")
if (nameField) {
let name = nameField.text
// Remove quotes from JSON properties
if (name.startsWith('"') && name.endsWith('"')) {
name = name.slice(1, -1)
}
return name
}
// Try to find identifier child nodes
const identifierChild = node.children?.find(child =>
child.type === "identifier" ||
child.type === "type_identifier" ||
child.type === "property_identifier"
)
if (identifierChild) {
let name = identifierChild.text
// Remove quotes from JSON properties
if (name.startsWith('"') && name.endsWith('"')) {
name = name.slice(1, -1)
}
return name
}
// For JSON pairs, try to get the key
if (node.type === 'pair' && node.children && node.children.length > 0) {
const key = node.children[0]
if (key) {
let name = key.text
// Remove quotes from JSON keys
if (name.startsWith('"') && name.endsWith('"')) {
name = name.slice(1, -1)
}
return name
}
}
return null
}
/**
* Normalizes node types to more readable format
*/
private normalizeNodeType(nodeType: string): string {
const typeMap: Record<string, string> = {
'class_declaration': 'class',
'class_definition': 'class',
'interface_declaration': 'interface',
'interface_definition': 'interface',
'namespace_declaration': 'namespace',
'namespace_definition': 'namespace',
'module_declaration': 'module',
'module_definition': 'module',
'function_declaration': 'function',
'function_definition': 'function',
'method_definition': 'method',
'object_expression': 'object',
'object_pattern': 'object',
'object': 'object',
'pair': 'property'
}
return typeMap[nodeType] || nodeType
}
/**
* Builds hierarchy display string from parent chain
*/
private buildHierarchyDisplay(parentChain: ParentContainer[], currentIdentifier: string | null, currentType: string): string | null {
const parts: string[] = []
// Add parent parts
for (const parent of parentChain) {
parts.push(`${parent.type} ${parent.identifier}`)
}
// Add current node if it has an identifier
if (currentIdentifier) {
const normalizedCurrentType = this.normalizeNodeType(currentType)
parts.push(`${normalizedCurrentType} ${currentIdentifier}`)
}
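// e.g. parentChain [{ type: "class", identifier: "Foo" }] with current node
// ("bar", "method_definition") yields "class Foo > method bar"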
return parts.length > 0 ? parts.join(' > ') : null
}
/**
* Builds the hierarchyDisplay string for a markdown section
*/
private buildMarkdownHierarchyDisplay(
parentChain: ParentContainer[],
currentHeader: MarkdownHeader
): string {
const parts: string[] = []
// Add the parent chain (types here are already the compact md_hX form)
for (const parent of parentChain) {
parts.push(`${parent.type} ${parent.identifier}`)
}
// Add the current header (using the compact md_hX form)
parts.push(`${this.getMarkdownDisplayType(currentHeader.level)} ${currentHeader.text}`)
return parts.join(' > ')
}
/**
* Updates the header stack, preserving the correct level hierarchy
*/
private updateHeaderStack(headerStack: MarkdownHeader[], newHeader: MarkdownHeader): MarkdownHeader[] {
// Pop all headers at the same or a deeper level; they are superseded by the new header
while (headerStack.length > 0 && headerStack[headerStack.length - 1].level >= newHeader.level) {
headerStack.pop()
}
// Push the new header
headerStack.push(newHeader)
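// e.g. [h1 "Intro", h2 "Usage", h3 "Flags"] + new h2 "API" -> [h1 "Intro", h2 "API"]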
return headerStack
}
/**
* Checks if block1 is contained within block2
*/
private isBlockContained(block1: CodeBlock, block2: CodeBlock): boolean {
return block1.file_path === block2.file_path &&
block1.start_line >= block2.start_line &&
block1.end_line <= block2.end_line &&
block2.content.includes(block1.content)
}
/**
* Helper method to process markdown content sections with consistent chunking logic
*/
private processMarkdownSection(
lines: string[],
filePath: string,
fileHash: string,
type: string,
seenSegmentHashes: Set<string>,
startLine: number,
identifier: string | null = null,
parentChain: ParentContainer[] = [],
hierarchyDisplay: string | null = null,
): CodeBlock[] {
const content = lines.join("\n")
if (content.trim().length < MIN_BLOCK_CHARS) {
return []
}
// Check if content needs chunking (either total size or individual line size)
const needsChunking =
content.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR ||
lines.some((line) => line.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR)
if (needsChunking) {
// Apply chunking for large content or oversized lines
return this._chunkTextByLines(
lines,
filePath,
fileHash,
type,
seenSegmentHashes,
startLine,
{
identifier,
parentChain,
hierarchyDisplay,
// Ensure markdown sections keep a consistent source label even
// when they are internally chunked.
chunkSourceOverride: "markdown",
},
)
}
// Create a single block for normal-sized content with no oversized lines
const endLine = startLine + lines.length - 1
const contentPreview = content.slice(0, 100)
const segmentHash = createHash("sha256")
.update(`${filePath}-${startLine}-${endLine}-${content.length}-${contentPreview}`)
.digest("hex")
if (!seenSegmentHashes.has(segmentHash)) {
seenSegmentHashes.add(segmentHash)
return [
{
file_path: filePath,
identifier,
type,
start_line: startLine,
end_line: endLine,
content,
segmentHash,
fileHash,
chunkSource: 'markdown',
parentChain,
hierarchyDisplay,
},
]
}
return []
}
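/**
* Parses markdown content into header-scoped sections, building a virtual
* hierarchy from header levels
*/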
private parseMarkdownContent(
filePath: string,
content: string,
fileHash: string,
seenSegmentHashes: Set<string>,
): CodeBlock[] {
const lines = content.split("\n")
const markdownCaptures = parseMarkdown(content) || []
if (markdownCaptures.length === 0) {
// No headers found, process entire content
return this.processMarkdownSection(lines, filePath, fileHash, "markdown_content", seenSegmentHashes, 1)
}
const results: CodeBlock[] = []
let lastProcessedLine = 0
// Maintain a header stack to track the level hierarchy
const headerStack: MarkdownHeader[] = []
// Process content before the first header (the early return above guarantees
// at least one capture exists here)
const firstHeaderLine = markdownCaptures[0].node.startPosition.row
if (firstHeaderLine > 0) {
const preHeaderLines = lines.slice(0, firstHeaderLine)
const preHeaderBlocks = this.processMarkdownSection(
preHeaderLines,
filePath,
fileHash,
"markdown_content",
seenSegmentHashes,
1,
null, // no identifier
[], // empty parentChain
null, // no hierarchyDisplay
)
results.push(...preHeaderBlocks)
}
// Process markdown captures (headers and sections)
for (let i = 0; i < markdownCaptures.length; i += 2) {
const nameCapture = markdownCaptures[i]
// Ensure we don't go out of bounds when accessing the next capture
if (i + 1 >= markdownCaptures.length) break
const definitionCapture = markdownCaptures[i + 1]
if (!definitionCapture) continue
const startLine = definitionCapture.node.startPosition.row + 1
const endLine = definitionCapture.node.endPosition.row + 1
const sectionLines = lines.slice(startLine - 1, endLine)
// Extract header level for type classification
const headerMatch = nameCapture.name.match(/\.h(\d)$/)
const headerLevel = headerMatch ? parseInt(headerMatch[1], 10) : 1
const headerText = nameCapture.node.text
// Create the current header object
const currentHeader: MarkdownHeader = {
level: headerLevel,
text: headerText,
line: startLine
}
// Build the parentChain using the stack as it stands before this header is pushed
const parentChain = this.buildMarkdownParentChain(currentHeader, headerStack)
// Update the header stack
this.updateHeaderStack(headerStack, currentHeader)
// Build the hierarchyDisplay
const hierarchyDisplay = this.buildMarkdownHierarchyDisplay(parentChain, currentHeader)
const sectionBlocks = this.processMarkdownSection(
sectionLines,
filePath,
fileHash,
`markdown_header_h${headerLevel}`,
seenSegmentHashes,
startLine,
headerText,
parentChain,
hierarchyDisplay,
)
results.push(...sectionBlocks)
lastProcessedLine = endLine
}
// Process any remaining content after the last header section
if (lastProcessedLine < lines.length) {
const remainingLines = lines.slice(lastProcessedLine)
const remainingBlocks = this.processMarkdownSection(
remainingLines,
filePath,
fileHash,
"markdown_content",
seenSegmentHashes,
lastProcessedLine + 1,
null,
[], // trailing content has no parent section
null, // and no hierarchy display
)
results.push(...remainingBlocks)
}
return results
}
}
// Export a singleton instance for convenience
export const codeParser = new CodeParser()
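// Example usage (illustrative; the file path is hypothetical):
// const blocks = await codeParser.parseFile("src/example.ts")
// for (const block of blocks) {
//   console.log(`${block.hierarchyDisplay ?? block.type}: lines ${block.start_line}-${block.end_line}`)
// }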