@ragrabbit/mcp
by madarco
Verified
import { relations } from "drizzle-orm/relations";
import { ExtraConfigColumn, index, pgEnum, unique, varchar } from "drizzle-orm/pg-core";
import { serial } from "drizzle-orm/pg-core";
import { boolean, timestamp, pgTable, text, primaryKey, integer } from "drizzle-orm/pg-core";
import { InferSelectModel, InferInsertModel, SQL, sql, ColumnDataType, ColumnBaseConfig } from "drizzle-orm";
import { organizationsTable } from "./auth";
import { uuid } from "drizzle-orm/pg-core";
import { jsonb } from "drizzle-orm/pg-core";
import { vector } from "drizzle-orm/pg-core";
import { env } from "../env.mjs";
export type PgExtraConfigJson = ExtraConfigColumn<ColumnBaseConfig<ColumnDataType, string>>;
export function extraJsonFieldDeep(column: PgExtraConfigJson, keys: Array<string | number>): SQL {
return sql.raw(`("${column.name}" ->> '{${(keys as string[]).join(",")}}')`);
}
// RAG
export enum IndexStatus {
PENDING = "PENDING",
PROCESSING = "PROCESSING",
SCRAPED = "SCRAPED",
DONE = "DONE",
PENDING_CLEAN = "PENDING_CLEAN",
SKIPPED = "SKIPPED",
ERROR = "ERROR",
OUTDATED = "OUTDATED",
}
export const indexStatusEnum = pgEnum("index_status", [
"PENDING",
"PROCESSING",
"SCRAPED",
"DONE",
"PENDING_CLEAN", // Must be after DONE to be processed after all other jobs complete
"SKIPPED",
"ERROR",
"OUTDATED",
]);
export const indexedTable = pgTable(
"indexed",
{
id: serial().primaryKey(),
organizationId: integer().notNull(),
url: varchar().notNull(),
normalizedUrl: varchar().notNull(),
canonicalUrl: varchar(),
title: varchar(),
description: text(),
scrapeOptions: jsonb()
.$type<{
stripLinks?: boolean;
stripImages?: boolean;
stripHeader?: boolean;
stripFooter?: boolean;
allowSubdomains?: boolean;
maxDepth?: number;
}>()
.default({}),
/**
* If this content should be the start of a crawling run
*/
doCrawl: boolean().default(false).notNull(),
isSitemap: boolean().default(false).notNull(),
/**
* If this content was found during a crawl, this is the original page from which the link was found
*/
foundFromIndexId: integer().references(() => indexedTable.id, { onDelete: "cascade" }),
depth: integer().default(0).notNull(),
status: indexStatusEnum().default("PENDING").notNull(),
error: text(),
hash: varchar(),
skip: boolean().default(false).notNull(),
skipReason: text(),
reindexAt: timestamp(),
indexedAt: timestamp(),
createdAt: timestamp().defaultNow().notNull(),
updatedAt: timestamp(),
},
(table) => {
return {
uniqueOrgUrl: unique().on(table.organizationId, table.normalizedUrl),
uniqueHash: unique().on(table.organizationId, table.hash),
orgFoundFromIdx: index().on(table.organizationId, table.foundFromIndexId),
idxStatus: index().on(table.organizationId, table.status),
};
}
);
export function normalizeUrl(url: string, baseUrl?: string) {
try {
// Avoid error with protocol-relative urls:
if (url.startsWith("//")) {
url = "https:" + url;
}
const urlParts = new URL(url.toLowerCase(), baseUrl);
return "//" + urlParts.hostname.replace("www.", "") + urlParts.pathname;
} catch (e) {
throw new Error(`Error normalizing url: ${url}, ${e}`);
}
}
export type Indexed = InferSelectModel<typeof indexedTable>;
export type NewIndexed = InferInsertModel<typeof indexedTable>;
export const indexedRelations = relations(indexedTable, ({ one, many }) => ({
foundFromIndex: one(indexedTable, {
fields: [indexedTable.foundFromIndexId],
references: [indexedTable.id],
}),
indexedContent: one(indexedContentTable),
}));
export const indexedContentTable = pgTable(
"indexed_content",
{
id: serial("id").primaryKey(),
indexId: integer()
.notNull()
.references(() => indexedTable.id, { onDelete: "cascade" }),
content: text().notNull(),
createdAt: timestamp().defaultNow().notNull(),
updatedAt: timestamp(),
},
(table) => ({
uniqueIndexId: unique().on(table.indexId),
})
);
export type IndexedContent = InferSelectModel<typeof indexedContentTable>;
export type NewIndexedContent = InferInsertModel<typeof indexedContentTable>;
export const indexedContentRelations = relations(indexedContentTable, ({ one }) => ({
indexed: one(indexedTable, {
fields: [indexedContentTable.indexId],
references: [indexedTable.id],
}),
}));
export const EmbeddingDimensions = {
openai: 1536,
baai: 384,
xenova: 1536,
};
export const llamaindexEmbedding = pgTable(
"indexed_content_embeddings",
{
id: uuid().defaultRandom().primaryKey().notNull(),
externalId: varchar("external_id"), // NB: This is not used by llamaindex
collection: varchar(),
document: text(),
metadata: jsonb()
.$type<{
contentId?: string;
organizationId?: number;
pageTitle?: string;
pageDescription?: string;
pageUrl?: string;
pageKeywords?: string[];
pageQuestions?: string[];
pageEntities?: { name: string; type: string }[];
tokens?: number;
}>()
.default({}),
// Duplicate contentId from metadata to allow for cascade FK deletion:
contentId: integer()
.generatedAlwaysAs(sql`(metadata ->> 'contentId')::int`)
.references(() => indexedTable.id, { onDelete: "cascade" }),
embeddings: vector({ dimensions: EmbeddingDimensions[env.EMBEDDING_MODEL] }),
},
(table) => {
return {
idxLlamaindexEmbeddingCollection: index("idx_llamaindex_embedding_collection").using(
"btree",
table.collection.asc().nullsLast()
),
metadataPageId: index("idx_llamaindex_embedding_metadata_page_id").on(
extraJsonFieldDeep(table.metadata, ["contentId"])
),
organizationId: index("idx_llamaindex_embedding_organization_id").on(
extraJsonFieldDeep(table.metadata, ["organizationId"])
),
searchIndex: index("search_index").using(
"gin",
sql`(
setweight(to_tsvector('english', ${table.metadata} ->> 'pageUrl'), 'A') ||
setweight(to_tsvector('english', ${table.metadata} ->> 'pageTitle'), 'B') ||
setweight(to_tsvector('english', ${table.metadata} ->> 'pageDescription'), 'C') ||
setweight(to_tsvector('english', ${table.document}), 'D')
)`
),
//TODO: add support for pg_trgm on dev postgres
//searchIndexTsquery: index("search_index_tsquery").using("gin", sql`document gin_trgm_ops`),
};
}
);
export type LlamaindexEmbedding = InferSelectModel<typeof llamaindexEmbedding>;
export type NewLlamaindexEmbedding = InferInsertModel<typeof llamaindexEmbedding>;
export const widgetConfigTable = pgTable("widget_config", {
id: serial("id").primaryKey().notNull(),
organizationId: integer()
.notNull()
.unique()
.references(() => organizationsTable.id, { onDelete: "cascade" }),
suggestedQueries: jsonb().$type<string[]>().default([]),
welcomeMessage: text(),
logoUrl: varchar(),
});
export type WidgetConfig = InferSelectModel<typeof widgetConfigTable>;
export type NewWidgetConfig = InferInsertModel<typeof widgetConfigTable>;