/**
 * PDF Chunker Utility
 *
 * Splits large PDFs into smaller chunks that fit within Gemini's limits:
 * - Max 50MB per file
 * - Max 1000 pages per file
 *
 * Uses pdf-lib for pure JavaScript PDF manipulation (no system dependencies).
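 *
 * Typical flow (a sketch; the file path is illustrative):
 *
 * @example
 * const analysis = await analyzePdf("/data/big-report.pdf");
 * if (analysis.needsChunking) {
 *   const { chunks } = await chunkPdf(analysis.filePath);
 *   try {
 *     // ...upload each chunk.filePath separately...
 *   } finally {
 *     await cleanupChunks(chunks);
 *   }
 * }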
 */
import { PDFDocument } from "pdf-lib";
import * as fs from "fs";
import * as path from "path";
import * as os from "os";
import { log } from "../utils/logger.js";
/**
 * Gemini file limits and the chunking targets derived from them
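 *
 * @example
 * // Cheap pre-flight check against the hard size cap (path illustrative):
 * const stats = await fs.promises.stat("/data/big-report.pdf");
 * const oversize = stats.size > GEMINI_LIMITS.maxFileSizeBytes;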
 */
export const GEMINI_LIMITS = {
  maxFileSizeBytes: 50 * 1024 * 1024, // 50MB
  maxPages: 1000,
  // Use conservative chunk sizes to stay well under the hard limits
  chunkPages: 500, // Pages per chunk
  chunkSizeBytes: 25 * 1024 * 1024, // 25MB target per chunk
};
/**
 * Result of PDF analysis
 */
export interface PdfAnalysis {
  filePath: string;
  fileName: string;
  fileSize: number;
  pageCount: number; // -1 when the page count could not be read
  needsChunking: boolean;
  estimatedChunks: number; // 1 when no chunking is needed
  reason?: string; // Why chunking is needed (or why analysis failed)
}
/**
 * A single chunk produced by splitting a PDF
 */
export interface PdfChunk {
  chunkIndex: number; // 0-based index of this chunk
  totalChunks: number;
  filePath: string; // Path to the chunk file in the temp directory
  fileName: string;
  pageStart: number; // 1-indexed, inclusive
  pageEnd: number; // 1-indexed, inclusive
  pageCount: number;
  fileSize: number;
}
/**
 * Result of a chunking operation
 */
export interface ChunkingResult {
  success: boolean;
  originalFile: string;
  chunks: PdfChunk[]; // Empty when success is false
  totalPages: number;
  error?: string;
}
/**
 * Analyze a PDF to determine whether it needs chunking.
 *
 * Checks the file size first (cheap), and only loads the PDF to count
 * pages when the size check passes.
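 *
 * @example
 * // Sketch; the path is illustrative:
 * const analysis = await analyzePdf("/data/big-report.pdf");
 * if (analysis.needsChunking) {
 *   log.info(`~${analysis.estimatedChunks} chunks needed: ${analysis.reason}`);
 * }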
 */
export async function analyzePdf(filePath: string): Promise<PdfAnalysis> {
  const stats = await fs.promises.stat(filePath);
  const fileName = path.basename(filePath);
  const fileSize = stats.size;

  // Check file size first (cheap: avoids loading the PDF into memory)
  if (fileSize > GEMINI_LIMITS.maxFileSizeBytes) {
    // Rough estimate from size alone; chunkPdf() computes the actual split
    const estimatedChunks = Math.ceil(
      fileSize / GEMINI_LIMITS.chunkSizeBytes
    );
    return {
      filePath,
      fileName,
      fileSize,
      pageCount: -1, // Unknown until we read it
      needsChunking: true,
      estimatedChunks,
      reason: `File size ${formatBytes(fileSize)} exceeds 50MB limit`,
    };
  }

  // Read the PDF to get its page count
  try {
    const pdfBytes = await fs.promises.readFile(filePath);
    const pdfDoc = await PDFDocument.load(pdfBytes, {
      ignoreEncryption: true,
    });
    const pageCount = pdfDoc.getPageCount();

    if (pageCount > GEMINI_LIMITS.maxPages) {
      const estimatedChunks = Math.ceil(pageCount / GEMINI_LIMITS.chunkPages);
      return {
        filePath,
        fileName,
        fileSize,
        pageCount,
        needsChunking: true,
        estimatedChunks,
        reason: `Page count ${pageCount} exceeds 1000 page limit`,
      };
    }

    return {
      filePath,
      fileName,
      fileSize,
      pageCount,
      needsChunking: false,
      estimatedChunks: 1,
    };
  } catch (error) {
    // If we can't read the PDF, assume it doesn't need chunking
    // and let Gemini surface the real error
    log.warning(`Could not analyze PDF ${fileName}: ${error}`);
    return {
      filePath,
      fileName,
      fileSize,
      pageCount: -1,
      needsChunking: false,
      estimatedChunks: 1,
      reason: `Could not analyze: ${error}`,
    };
  }
}
/**
 * Split a PDF into page-range chunks, written to a fresh temp directory.
 * Call cleanupChunks() once the chunks have been consumed.
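 *
 * @example
 * // Sketch; the path is illustrative:
 * const result = await chunkPdf("/data/big-report.pdf");
 * if (result.success) {
 *   for (const chunk of result.chunks) {
 *     // ...upload chunk.filePath...
 *   }
 *   await cleanupChunks(result.chunks);
 * }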
 */
export async function chunkPdf(filePath: string): Promise<ChunkingResult> {
  const fileName = path.basename(filePath, ".pdf");
  const tempDir = await fs.promises.mkdtemp(
    path.join(os.tmpdir(), "pdf-chunks-")
  );

  try {
    log.info(`Chunking PDF: ${filePath}`);

    // Read the original PDF
    const pdfBytes = await fs.promises.readFile(filePath);
    const pdfDoc = await PDFDocument.load(pdfBytes, {
      ignoreEncryption: true,
    });
    const totalPages = pdfDoc.getPageCount();
    log.info(`PDF has ${totalPages} pages, splitting into chunks...`);

    // Respect the page cap, but also shrink chunks for byte-heavy PDFs so
    // each chunk targets ~25MB; otherwise an oversized, low-page-count PDF
    // could yield chunks that still break the 50MB limit. This is an
    // approximation: pdf-lib copies shared resources (fonts, images) into
    // every chunk that uses them, so chunk sizes do not scale perfectly
    // with page count.
    const avgBytesPerPage = pdfBytes.length / totalPages;
    const pagesPerChunk = Math.max(
      1,
      Math.min(
        GEMINI_LIMITS.chunkPages,
        Math.floor(GEMINI_LIMITS.chunkSizeBytes / avgBytesPerPage)
      )
    );

    const chunks: PdfChunk[] = [];
    let currentPage = 0;
    let chunkIndex = 0;

    while (currentPage < totalPages) {
      // Calculate the page range for this chunk (0-indexed, inclusive)
      const pageStart = currentPage;
      const pageEnd = Math.min(currentPage + pagesPerChunk - 1, totalPages - 1);
      const chunkPageCount = pageEnd - pageStart + 1;

      // Copy the page range into a new PDF
      const chunkDoc = await PDFDocument.create();
      const pageIndices = Array.from(
        { length: chunkPageCount },
        (_, i) => pageStart + i
      );
      const copiedPages = await chunkDoc.copyPages(pdfDoc, pageIndices);
      for (const page of copiedPages) {
        chunkDoc.addPage(page);
      }

      // Save the chunk to a temp file
      const chunkFileName = `${fileName}_chunk_${chunkIndex + 1}.pdf`;
      const chunkFilePath = path.join(tempDir, chunkFileName);
      const chunkBytes = await chunkDoc.save();
      await fs.promises.writeFile(chunkFilePath, chunkBytes);
      const chunkStats = await fs.promises.stat(chunkFilePath);

      chunks.push({
        chunkIndex,
        totalChunks: -1, // Patched below once the count is known
        filePath: chunkFilePath,
        fileName: chunkFileName,
        pageStart: pageStart + 1, // 1-indexed for display
        pageEnd: pageEnd + 1,
        pageCount: chunkPageCount,
        fileSize: chunkStats.size,
      });

      log.info(
        `  Chunk ${chunkIndex + 1}: pages ${pageStart + 1}-${pageEnd + 1} (${formatBytes(chunkStats.size)})`
      );

      currentPage = pageEnd + 1;
      chunkIndex++;
    }

    // Backfill the total chunk count
    for (const chunk of chunks) {
      chunk.totalChunks = chunks.length;
    }

    log.info(`Split into ${chunks.length} chunks, stored in ${tempDir}`);

    return {
      success: true,
      originalFile: filePath,
      chunks,
      totalPages,
    };
  } catch (error) {
    const errorMsg = error instanceof Error ? error.message : String(error);
    log.error(`Failed to chunk PDF: ${errorMsg}`);

    // Clean up the temp directory on failure
    try {
      await fs.promises.rm(tempDir, { recursive: true });
    } catch {
      // Ignore cleanup errors
    }

    return {
      success: false,
      originalFile: filePath,
      chunks: [],
      totalPages: 0,
      error: errorMsg,
    };
  }
}
/**
 * Clean up chunk files after upload.
 *
 * Removes the shared temp directory that chunkPdf() created, so pass the
 * chunks from a single chunking run.
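 *
 * @example
 * // Pair with chunkPdf(), ideally in a finally block (path illustrative):
 * const { chunks } = await chunkPdf("/data/big-report.pdf");
 * try {
 *   // ...upload each chunk...
 * } finally {
 *   await cleanupChunks(chunks);
 * }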
 */
export async function cleanupChunks(chunks: PdfChunk[]): Promise<void> {
  if (chunks.length === 0) return;

  // All chunks share one temp directory; derive it from the first chunk
  const tempDir = path.dirname(chunks[0].filePath);
  try {
    await fs.promises.rm(tempDir, { recursive: true });
    log.info(`Cleaned up chunk temp directory: ${tempDir}`);
  } catch (error) {
    log.warning(`Failed to cleanup chunks: ${error}`);
  }
}
/**
 * Format a byte count as a human-readable string
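 *
 * @example
 * formatBytes(512);              // "512 B"
 * formatBytes(1536);             // "1.5 KB"
 * formatBytes(25 * 1024 * 1024); // "25.0 MB"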
 */
function formatBytes(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`;
  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
  if (bytes < 1024 * 1024 * 1024)
    return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
  return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
}