@ragrabbit/mcp

by madarco
Verified
import { Settings, TextNode, TransformComponent } from "llamaindex"; import OpenAI from "openai"; import { LLM, LLMEnum } from "../settings"; import { codeBlock } from "common-tags"; import { z } from "zod"; import { zodResponseFormat } from "openai/helpers/zod.mjs"; import { logger } from "@repo/logger"; import { getEncoding } from "@langchain/core/utils/tiktoken"; const log = logger.child({ component: "Llamaindex", }); const metadataSchema = z.object({ title: z.string(), description: z.string(), keywords: z.array(z.string()), questions: z.array(z.string()), entities: z.array( z.object({ name: z.string(), type: z.string(), }) ), }); const prompt = codeBlock` Extract from the following text: - The title of the page - A short description of the page (max 100 characters) - A list of keywords (max 10) - A list of questions that can be answered by the page (max 5) - A list of entities that can be extracted from the page (max 5) {{predefinedData}} Output the result in JSON format. `; export class LlamaindexMetadataTransformer extends TransformComponent { constructor() { super(async (nodes) => { return await this.transform(nodes as TextNode[]); }); } async transform(nodes: TextNode[]): Promise<TextNode[]> { for (const node of nodes) { await this.transform_node(node); } return nodes; } async transform_node(node: TextNode): Promise<TextNode> { if (LLM === LLMEnum.openai) { const openai = new OpenAI(); const response = await openai.chat.completions.create({ model: "gpt-4o-mini", response_format: zodResponseFormat(metadataSchema, "metadata"), max_tokens: 5000, temperature: 0.2, messages: [ { role: "system", content: prompt, }, { role: "user", content: node.text, }, ], }); const message = response.choices[0].message; if (message.refusal) { log.warn( { refusal: message.refusal, message, }, "Failed to parse metadata" ); return; } const parsed = JSON.parse(message.content); const encoding = await getEncoding("cl100k_base"); const tokens = encoding.encode(node.text).length; node.metadata = { ...node.metadata, pageTitle: node.metadata.pageTitle || parsed.title, pageDescription: node.metadata.pageDescription || parsed.description, keywords: parsed.keywords || [], questions: parsed.questions || [], entities: parsed.entities || [], tokens, }; } else { throw new Error(`Unsupported LLM: ${LLM}`); } return node; } }