Skip to main content
Glama
pdf2md.ts (3.85 kB)
import { createRequire } from 'module';
import { generatePageNumbers } from '../utils.js';
import { extractImagesFromPdf, ImageInfo } from '../extract-images.js';

// The pdf2md internals are pulled in through a CommonJS-style `require` built
// from this module's URL, so everything destructured below is untyped (`any`).
const require = createRequire(import.meta.url);
const { parse } = require('@opendocsg/pdf2md/lib/util/pdf');
const { makeTransformations, transform } = require('@opendocsg/pdf2md/lib/util/transformations');

// Resolves to `any` because `parse` comes from `require`; kept as a named
// alias so the intent of `extractMetadata`'s parameter stays readable.
type ParseResult = ReturnType<typeof parse>;

/**
 * PDF metadata structure
 */
export interface PdfMetadata {
    /** Not populated by this module; left for callers that know the file size. */
    fileSize?: number;
    totalPages: number;
    title?: string;
    author?: string;
    creator?: string;
    producer?: string;
    version?: string;
    creationDate?: string;
    modificationDate?: string;
    isEncrypted?: boolean;
}

/**
 * One converted page: its Markdown text, the images extracted for it and its
 * 1-based page number within the source document.
 */
export interface PdfPageItem {
    text: string;
    images: ImageInfo[];
    pageNumber: number;
}

/**
 * Result of converting a PDF: the selected pages plus document-level metadata.
 */
export interface PdfParseResult {
    pages: PdfPageItem[];
    metadata: PdfMetadata;
}

/**
 * Builds a PdfMetadata object from the result of `parse`.
 * @param parseResult The parse result; its `pdfDocument.numPages` and the
 *   fields of its `metadata` object are copied into the returned structure.
 * @returns A PdfMetadata object containing the extracted metadata.
 */
const extractMetadata = ({ pdfDocument, metadata }: ParseResult): PdfMetadata => ({
    totalPages: pdfDocument.numPages,
    title: metadata.Title,
    author: metadata.Author,
    creator: metadata.Creator,
    producer: metadata.Producer,
    version: metadata.PDFFormatVersion,
    creationDate: metadata.CreationDate,
    modificationDate: metadata.ModDate,
    isEncrypted: metadata.IsEncrypted,
});

/**
 * A page selection expressed as an offset and a length. It is expanded into
 * explicit page numbers by `generatePageNumbers`.
 */
export type PageRange = {
    offset: number;
    length: number;
};

/** Minimal shape of the document handle that this module needs to release. */
interface ReleasablePdfDocument {
    cleanup?: (flag: boolean) => unknown;
    destroy?: () => unknown;
}

/**
 * Best-effort release of the parsed document handle. `cleanup(false)` and
 * `destroy()` are each attempted only if present, and a failure in either is
 * deliberately ignored so that it can never mask the conversion outcome.
 */
async function releasePdfDocument(pdfDocument: ReleasablePdfDocument | null | undefined): Promise<void> {
    if (!pdfDocument) {
        return;
    }
    try {
        if (typeof pdfDocument.cleanup === 'function') {
            await pdfDocument.cleanup(false);
        }
    } catch {
        // Ignored on purpose: releasing resources must not fail the call.
    }
    try {
        if (typeof pdfDocument.destroy === 'function') {
            await pdfDocument.destroy();
        }
    } catch {
        // Ignored on purpose: releasing resources must not fail the call.
    }
}

/**
 * Reads a PDF and converts it to Markdown, returning structured data.
 *
 * Pages are always returned in document order, each labelled with its own
 * 1-based page number, regardless of the order in which they were requested.
 * Requested numbers that are duplicated or fall outside the document are
 * ignored. The parsed document handle is released whether the conversion
 * succeeds or throws.
 *
 * @param pdfBuffer The PDF buffer to convert.
 * @param pageNumbers The 1-based page numbers to extract, or a PageRange that
 *   is expanded via `generatePageNumbers`. If the resulting list is empty, all
 *   pages are extracted.
 * @returns A Promise that resolves to a PdfParseResult object containing the parsed data.
 */
export async function pdf2md(pdfBuffer: Uint8Array, pageNumbers: number[] | PageRange = []): Promise<PdfParseResult> {
    const result = await parse(pdfBuffer);
    const { fonts, pages, pdfDocument } = result;

    // Everything after a successful parse runs inside the try so the document
    // handle is released even when a later step throws.
    try {
        // Calculate which pages to process
        const requestedPageNumbers: number[] = Array.isArray(pageNumbers)
            ? pageNumbers
            : generatePageNumbers(pageNumbers.offset, pageNumbers.length, pages.length);
        const requested = new Set<number>(requestedPageNumbers);
        const selectAll = requested.size === 0;

        // Select pages and record their page numbers in one pass, so that
        // pagesToProcess[i] always corresponds to pageNumberMap[i].
        const pagesToProcess: unknown[] = [];
        const pageNumberMap: number[] = [];
        pages.forEach((page: unknown, index: number) => {
            const pageNumber = index + 1;
            if (selectAll || requested.has(pageNumber)) {
                pagesToProcess.push(page);
                pageNumberMap.push(pageNumber);
            }
        });

        const transformations = makeTransformations(fonts.map);
        const parseResult = transform(pagesToProcess, transformations);

        // Extract images for the selected pages only
        const imagesByPage = await extractImagesFromPdf(pdfBuffer, pageNumberMap, { format: 'webp', quality: 85 });

        // Combine each transformed page with its page number and its images.
        // NOTE(review): relies on `transform` returning one output page per
        // input page, in the same order — confirm against the library.
        const processedPages: PdfPageItem[] = parseResult.pages.map((page: any, index: number) => {
            const pageNumber = pageNumberMap[index];
            return {
                pageNumber,
                text: page.items.join('\n') + '\n',
                images: imagesByPage[pageNumber] ?? [],
            };
        });

        return { pages: processedPages, metadata: extractMetadata(result) };
    } finally {
        await releasePdfDocument(pdfDocument);
    }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/wonderwhy-er/DesktopCommanderMCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.