@ragrabbit/mcp
by madarco
Verified
import { Settings, TextNode, TransformComponent } from "llamaindex";
import OpenAI from "openai";
import { LLM, LLMEnum } from "../settings";
import { codeBlock } from "common-tags";
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod.mjs";
import { logger } from "@repo/logger";
import { env } from "../env.mjs";
import { RagMetadata } from "./metadata.type";
import { countTokens } from "./tokens";
const log = logger.child({
component: "Llamaindex",
});
const metadataSchema = z.object({
title: z.string(),
description: z.string(),
keywords: z.array(z.string()),
questions: z.array(z.string()),
entities: z.array(
z.object({
name: z.string(),
type: z.string(),
})
),
});
const prompt = codeBlock`
Extract from the following text:
- The title of the page
- A short description of the page (max 100 characters)
- A list of keywords (max 10)
- A list of questions that can be answered by the page (max 5)
- A list of entities that can be extracted from the page (max 5)
{{predefinedData}}
Output the result in JSON format.
`;
export async function extractMetadata(text: string): Promise<Partial<RagMetadata> | undefined> {
if (!env.OPENAI_API_KEY) {
throw new Error(`OPENAI_API_KEY is required to get metadata`);
}
const openai = new OpenAI();
const response = await openai.chat.completions.create({
model: "gpt-4o-mini",
response_format: zodResponseFormat(metadataSchema, "metadata"),
max_tokens: 5000,
temperature: 0.2,
messages: [
{
role: "system",
content: prompt,
},
{
role: "user",
content: text,
},
],
});
const message = response.choices[0].message;
if (message.refusal) {
log.warn(
{
refusal: message.refusal,
message,
},
"Failed to parse metadata"
);
return;
}
const parsed = JSON.parse(message.content);
return {
pageTitle: parsed.title,
pageDescription: parsed.description,
keywords: parsed.keywords || [],
questions: parsed.questions || [],
entities: parsed.entities || [],
tokens: await countTokens(text),
};
}
export class LlamaindexMetadataTransformer extends TransformComponent {
constructor() {
super(async (nodes) => {
return await this.transform(nodes as TextNode[]);
});
}
async transform(nodes: TextNode[]): Promise<TextNode[]> {
for (const node of nodes) {
await this.transform_node(node);
}
return nodes;
}
async transform_node(node: TextNode): Promise<TextNode> {
const metadata = await extractMetadata(node.text);
node.metadata = {
...node.metadata,
...metadata,
pageTitle: node.metadata?.pageTitle || metadata.pageTitle,
pageDescription: node.metadata?.pageDescription || metadata.pageDescription,
};
return node;
}
}