
// by madarco
import { UserError } from "@repo/core";
import db from "@repo/db";
import { eq } from "@repo/db/drizzle";
import { indexedTable, normalizeUrl } from "@repo/db/schema";
import { logger } from "@repo/logger";

/**
 * Loads the page URLs listed in a sitemap.
 *
 * Sitemap parsing is currently disabled (see the TODO below), so this always
 * resolves to an empty array and `sitemapUrl` is not read yet.
 *
 * @param sitemapUrl - URL of the sitemap to load.
 * @returns The URLs found in the sitemap; currently always `[]`.
 */
export async function loadSitemapUrls(sitemapUrl: string): Promise<string[]> {
  // TODO: this is not working in Vercel:
  //const { urls } = await Sitemap.load(sitemapUrl);
  //logger.info(`Found ${urls.length} URLs in the sitemap`);

  // The explicit return type keeps this from being inferred as `never[]`,
  // which would make every URL `never` at the call site.
  return [];
}

/**
 * Starts a crawl from a single row of `indexedTable`.
 *
 * When the row is flagged as a sitemap, the sitemap's URLs are loaded and
 * inserted as new `indexedTable` rows with status `"PENDING"`, in batches.
 * Rows that are not sitemaps are not crawled; only a warning is emitted.
 *
 * @param indexedId - Primary key of the `indexedTable` row to crawl from.
 * @throws UserError If no row has this id, or if the row's `doCrawl` flag is
 *   not set.
 */
export async function crawlDbItem(indexedId: number): Promise<void> {
  const indexed = await db.query.indexedTable.findFirst({
    where: eq(indexedTable.id, indexedId),
  });

  if (!indexed) {
    throw new UserError("Index page not found");
  }
  if (!indexed.doCrawl) {
    throw new UserError("Index page does not have crawling enabled");
  }

  logger.info("Crawling from item", { indexedId, isSitemap: indexed.isSitemap });

  if (!indexed.isSitemap) {
    // NOTE(review): the rest of this file logs through `logger`; consider
    // switching this call too once a warn-level method is confirmed to exist.
    console.warn("Crawling not supported for non-sitemap pages");
    return;
  }

  const urls = await loadSitemapUrls(indexed.url);
  logger.info("Loaded sitemap urls", { urls: urls.length });

  // Insert in fixed-size chunks so one statement never carries the whole
  // sitemap. An empty sitemap skips the loop and issues no insert at all.
  const batchSize = 100;
  for (let start = 0; start < urls.length; start += batchSize) {
    const batch = urls.slice(start, start + batchSize).map((url) => ({
      url,
      normalizedUrl: normalizeUrl(url),
      organizationId: indexed.organizationId,
      foundFromIndexId: indexed.id,
      // `as const` keeps the literal type instead of widening to `string`,
      // so the value stays assignable if the column type is a literal union.
      status: "PENDING" as const,
    }));

    // Batches are awaited one at a time to preserve the original sequential
    // insert order.
    await db.insert(indexedTable).values(batch);
  }
}