Skip to main content
Glama
JsonPipeline.ts (5.25 kB)
import { JsonDocumentSplitter } from "../../splitter/JsonDocumentSplitter";
import type { DocumentSplitter } from "../../splitter/types";
import { SPLITTER_PREFERRED_CHUNK_SIZE } from "../../utils/config";
import { MimeTypeUtils } from "../../utils/mimeTypeUtils";
import type { ContentFetcher, RawContent } from "../fetcher/types";
import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types";
import type { ScraperOptions } from "../types";
import { convertToString } from "../utils/buffer";
import { BasePipeline } from "./BasePipeline";
import type { PipelineResult } from "./types";

/**
 * Pipeline that handles JSON documents.
 *
 * Content is split with a JsonDocumentSplitter only; no greedy, size-based
 * merging step is applied afterwards. That omission is deliberate: merging
 * would collapse distinct structural nodes (and their {level, path}
 * information) that are needed for precise hierarchical reassembly.
 */
export class JsonPipeline extends BasePipeline {
  private readonly middleware: ContentProcessorMiddleware[];
  private readonly splitter: DocumentSplitter;

  /**
   * @param _chunkSize Accepted for signature compatibility; it is not read by
   *   this constructor because the splitter is configured without a size limit.
   */
  constructor(_chunkSize = SPLITTER_PREFERRED_CHUNK_SIZE) {
    super();
    // The middleware stack starts out empty for JSON.
    this.middleware = [];
    // Structure-preserving splitter only (no greedy size merging).
    this.splitter = new JsonDocumentSplitter({
      preserveFormatting: true,
    });
  }

  /**
   * Reports whether this pipeline accepts the given MIME type.
   *
   * @param mimeType MIME type to test; a falsy value is rejected outright.
   * @returns `false` for a falsy input, otherwise the verdict of `MimeTypeUtils.isJson`.
   */
  canProcess(mimeType: string): boolean {
    return mimeType ? MimeTypeUtils.isJson(mimeType) : false;
  }

  /**
   * Decodes the raw content, checks that it parses as JSON, runs the
   * middleware stack and splits the resulting text into chunks.
   *
   * @param rawContent Fetched content together with its charset, source and MIME type.
   * @param options Scraper options handed through to the middleware context.
   * @param fetcher Optional fetcher handed through to the middleware context.
   * @returns The pipeline result. When the content is not parseable JSON the
   *   result carries only `textContent`, empty `links`/`errors` and the chunks
   *   of the raw text (no `title` or `contentType`).
   */
  async process(
    rawContent: RawContent,
    options: ScraperOptions,
    fetcher?: ContentFetcher,
  ): Promise<PipelineResult> {
    const text = convertToString(rawContent.content, rawContent.charset);

    let parsed: unknown;
    try {
      parsed = JSON.parse(text);
    } catch (_parseError) {
      // Not valid JSON: skip metadata and middleware, but still run the text
      // through the splitter so callers always receive chunks.
      const fallbackChunks = await this.splitter.splitText(text);
      return {
        textContent: text,
        // metadata: {
        //   isValidJson: false,
        // },
        links: [],
        errors: [],
        chunks: fallbackChunks,
      };
    }

    const { title: detectedTitle } = this.extractMetadata(parsed);

    const context: MiddlewareContext = {
      content: text,
      source: rawContent.source,
      title: detectedTitle,
      contentType: rawContent.mimeType || "application/json",
      // metadata: {
      //   ...this.extractMetadata(parsedJson),
      //   isValidJson,
      //   jsonStructure: this.analyzeJsonStructure(parsedJson),
      // },
      links: [], // JSON files typically don't contain links
      errors: [],
      options,
      fetcher,
    };

    // Run the (minimal) middleware stack; middleware may rewrite the context.
    await this.executeMiddlewareStack(this.middleware, context);

    // Split whatever content the middleware left behind using the JSON splitter.
    const chunks = await this.splitter.splitText(context.content);

    const { title, contentType, content, links, errors } = context;
    return {
      title,
      contentType,
      textContent: content,
      links,
      errors,
      chunks,
    };
  }

  /**
   * Pulls a title and a description out of the top level of parsed JSON.
   *
   * Only non-empty string values count. A key is left off the returned object
   * entirely when no candidate field qualifies, and non-object input yields
   * an empty object.
   *
   * @param parsedJson Value produced by `JSON.parse`.
   * @returns Object with optional `title` and `description`.
   */
  private extractMetadata(parsedJson: unknown): { title?: string; description?: string } {
    const found: { title?: string; description?: string } = {};
    if (parsedJson === null || typeof parsedJson !== "object") {
      return found;
    }

    const record = parsedJson as Record<string, unknown>;

    // Returns the value of the first listed key holding a non-empty string.
    const firstText = (candidates: readonly string[]): string | undefined => {
      const key = candidates.find(
        (name) => name in record && typeof record[name] === "string" && record[name],
      );
      return key === undefined ? undefined : (record[key] as string);
    };

    // Candidate fields are tried in priority order.
    const title = firstText(["title", "name", "displayName", "label"]);
    if (title !== undefined) {
      found.title = title;
    }

    const description = firstText(["description", "summary", "about", "info"]);
    if (description !== undefined) {
      found.description = description;
    }

    return found;
  }

  /**
   * Computes the maximum nesting depth of a JSON value.
   *
   * A primitive (or `null`) has the depth it was reached at; every nested
   * array or object adds one level. Not called from any code visible in this
   * class.
   *
   * @param obj Value to measure.
   * @param currentDepth Depth assigned to `obj` itself (1 for the root).
   * @returns The deepest level found at or below `obj`.
   */
  private calculateDepth(obj: unknown, currentDepth = 1): number {
    if (obj === null || typeof obj !== "object") {
      return currentDepth;
    }

    // Arrays are walked element by element, plain objects by their values.
    const children: unknown[] = Array.isArray(obj) ? obj : Object.values(obj);

    let deepest = currentDepth;
    for (const child of children) {
      if (child !== null && typeof child === "object") {
        deepest = Math.max(deepest, this.calculateDepth(child, currentDepth + 1));
      }
    }
    return deepest;
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.