pdfsearch-mcp

pdf.ts•6.84 KiB

import fs from "node:fs";

/** A contiguous piece of extracted text, optionally tagged with its source page. */
export type Chunk = {
  id: number;
  text: string;
  page?: number;
};

/** Searchable index: the chunks plus a token -> chunk-id inverted map. */
export type Index = {
  chunks: Chunk[];
  inverted: Record<string, number[]>; // token -> chunk ids
  meta?: {
    source?: string;
    createdAt?: number;
    pdfSize?: number;
    pdfMtime?: number;
    pdfHash?: string;
  };
};

// Own-property test that is safe for tokens such as "constructor", which would
// otherwise resolve to members inherited from Object.prototype.
const hasOwn = Object.prototype.hasOwnProperty;

// Explicit page markers like <<<PAGE:12>>> on a line of their own.
const PAGE_MARKER = /^<<<PAGE:(\d+)>>>$/;

// Separator placed between chunks when building a context preview.
const CONTEXT_SEPARATOR = "\n\n";

type Para = { text: string; page?: number };

/**
 * Unicode-aware tokenizer: lowercases the input and keeps letters, combining
 * marks, numbers and hyphens; every other character acts as a separator.
 *
 * Combining marks are kept so that decomposed accents and vowel signs stay
 * attached to their base letter instead of splitting a word into fragments.
 *
 * @param s Text to tokenize.
 * @returns Tokens in order of appearance (duplicates included).
 */
export function tokenize(s: string): string[] {
  return s
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s-]/gu, " ")
    .split(/\s+/)
    .filter(Boolean);
}

/**
 * Build the inverted index. Each chunk id is listed at most once per token.
 *
 * @param chunks Chunks to index.
 * @returns Map from token to the ids of the chunks containing it.
 */
export function buildInverted(chunks: Chunk[]): Record<string, number[]> {
  const inverted: Record<string, number[]> = {};
  for (const chunk of chunks) {
    // A Set keeps first-seen order while dropping repeated tokens.
    for (const token of new Set(tokenize(chunk.text))) {
      // Only trust own properties: `inverted["constructor"]` on a plain object
      // is the inherited Object function, not a posting list.
      let ids = hasOwn.call(inverted, token) ? inverted[token] : undefined;
      if (!ids) {
        ids = [];
        inverted[token] = ids;
      }
      ids.push(chunk.id);
    }
  }
  return inverted;
}

// Split raw text into whitespace-normalized paragraphs, tagging each with the
// page announced by the most recent page marker seen before its first line.
function splitParagraphs(text: string): Para[] {
  const paras: Para[] = [];
  let buf: string[] = [];
  let bufPage: number | undefined;
  let currentPage: number | undefined;

  const flush = () => {
    const joined = buf.join(" ").replace(/\s+/g, " ").trim();
    if (joined) paras.push({ text: joined, page: bufPage });
    buf = [];
    bufPage = undefined;
  };

  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    const marker = PAGE_MARKER.exec(line);
    if (marker) {
      // Page boundary: terminate any ongoing paragraph before switching pages.
      flush();
      currentPage = Number(marker[1]);
      continue;
    }
    if (!line) {
      // Blank line: paragraph break.
      flush();
      continue;
    }
    if (buf.length === 0) bufPage = currentPage;
    buf.push(line);
  }
  flush();
  return paras;
}

/**
 * Chunk strategy: parse text into paragraphs with page-number tracking, then
 * coalesce adjacent same-page paragraphs into chunks of at most `maxLen`
 * characters.
 *
 * A paragraph longer than `maxLen` is split on sentence boundaries when
 * `sentenceSplit` is true, regardless of whether text was already accumulated
 * before it. A single sentence longer than `maxLen` is kept whole, so chunks
 * can exceed `maxLen` in that case (and when `sentenceSplit` is false).
 * Chunks with empty text are never emitted.
 *
 * @param text Raw text, optionally containing `<<<PAGE:n>>>` marker lines.
 * @param opts `maxLen` (default 1000) and `sentenceSplit` (default true).
 * @returns Chunks with ids numbered from 0 in emission order.
 */
export function chunkText(
  text: string,
  opts: { maxLen?: number; sentenceSplit?: boolean } = {}
): Chunk[] {
  const maxLen = opts.maxLen ?? 1000;
  const sentenceSplit = opts.sentenceSplit ?? true;

  const chunks: Chunk[] = [];
  let acc = "";
  let accPage: number | undefined;

  const emit = (raw: string, page: number | undefined) => {
    const trimmed = raw.trim();
    if (trimmed) chunks.push({ id: chunks.length, text: trimmed, page });
  };

  const flushAcc = () => {
    emit(acc, accPage);
    acc = "";
    accPage = undefined;
  };

  for (const para of splitParagraphs(text)) {
    // If the page changed relative to the accumulated chunk, start a new chunk.
    if (acc && para.page != null && accPage != null && para.page !== accPage) {
      flushAcc();
    }

    const combined = acc ? acc + " " + para.text : para.text;
    if (combined.length <= maxLen) {
      if (!acc) accPage = para.page;
      acc = combined;
      continue;
    }

    // The paragraph does not fit behind the accumulated text: close that chunk
    // and handle the paragraph on its own.
    flushAcc();

    if (para.text.length <= maxLen) {
      acc = para.text;
      accPage = para.page;
      continue;
    }

    if (!sentenceSplit) {
      emit(para.text, para.page);
      continue;
    }

    // Oversized paragraph: pack whole sentences greedily up to maxLen.
    let cur = "";
    for (const sentence of para.text.split(/(?<=[.!?])\s+/)) {
      const joined = cur ? cur + " " + sentence : sentence;
      if (joined.length > maxLen && cur) {
        emit(cur, para.page);
        cur = sentence;
      } else {
        // Either it fits, or `cur` is empty and this lone sentence is itself
        // oversized; keep it whole rather than emitting an empty chunk.
        cur = joined;
      }
    }
    emit(cur, para.page);
  }

  flushAcc();
  return chunks;
}

/**
 * Simple bag-of-words scoring by unique query token overlap, with an optional
 * boost for chunks that contain the whole query as a substring.
 *
 * @param query Free-text query.
 * @param index Index to search.
 * @param opts `phraseBoost` (default 2; values <= 0 disable the boost) is added
 *   to chunks containing the trimmed, lowercased query when it is at least 3
 *   characters long. `phraseOnly` keeps only such chunks (same length rule).
 * @returns Matching chunks with their scores, highest score first.
 */
export function scoreChunks(
  query: string,
  index: Index,
  opts?: { phraseBoost?: number; phraseOnly?: boolean }
): { chunk: Chunk; score: number }[] {
  const scores = new Map<number, number>();
  const bump = (id: number, by: number) => {
    scores.set(id, (scores.get(id) ?? 0) + by);
  };

  for (const token of new Set(tokenize(query))) {
    // Own-property check: a query token such as "constructor" must not pick up
    // inherited Object.prototype members from a plain-object index.
    if (!hasOwn.call(index.inverted, token)) continue;
    const hits = index.inverted[token];
    if (!Array.isArray(hits)) continue;
    for (const id of hits) bump(id, 1);
  }

  const q = query.trim().toLowerCase();
  const phraseBoost = Number(opts?.phraseBoost ?? 2);
  const phraseUsable = q.length >= 3;
  const boostPhrase = phraseUsable && phraseBoost > 0;
  const phraseOnly = phraseUsable && Boolean(opts?.phraseOnly);

  // One pass over the chunks: resolve ids to chunks (ids are not assumed to
  // equal array positions) and collect the phrase matches if they are needed.
  const byId = new Map<number, Chunk>();
  const phraseIds = new Set<number>();
  for (const c of index.chunks) {
    if (!byId.has(c.id)) byId.set(c.id, c);
    if ((boostPhrase || phraseOnly) && c.text.toLowerCase().includes(q)) {
      phraseIds.add(c.id);
    }
  }

  if (boostPhrase) {
    for (const id of phraseIds) bump(id, phraseBoost);
  }

  const results: { chunk: Chunk; score: number }[] = [];
  for (const [id, score] of scores) {
    const chunk = byId.get(id);
    if (!chunk) continue;
    // Keep only chunks containing the exact phrase when requested.
    if (phraseOnly && !phraseIds.has(id)) continue;
    results.push({ chunk, score });
  }

  results.sort((a, b) => b.score - a.score);
  return results;
}

/**
 * Expand a matched chunk with neighboring context for richer previews.
 *
 * The matched chunk is always included, even if it alone exceeds the budget.
 * Neighbors are then added outward from it (next chunk first, then previous),
 * nearest first, for as long as the joined text stays within `budget`. Growth
 * in a direction stops at the first neighbor that does not fit, so the
 * returned context is contiguous around the match.
 *
 * @param index Index holding the chunks.
 * @param centerId Id of the matched chunk.
 * @param opts `before` (default 1) and `after` (default 2) cap how many
 *   neighbors are considered on each side; `budget` (default 1200) is the
 *   maximum number of characters, separators included.
 * @returns The context text joined with blank lines, and the page of the
 *   matched chunk (or of the first included chunk that has one). The text is
 *   empty when `centerId` does not identify a chunk.
 */
export function expandContext(
  index: Index,
  centerId: number,
  opts: { before?: number; after?: number; budget?: number } = {}
): { text: string; page?: number } {
  const before = Math.max(0, Math.floor(opts.before ?? 1));
  const after = Math.max(0, Math.floor(opts.after ?? 2));
  const budget = opts.budget ?? 1200; // max characters
  const chunks = index.chunks;

  // Fast path when ids equal array positions; otherwise look the id up.
  const center =
    chunks[centerId]?.id === centerId
      ? centerId
      : chunks.findIndex((c) => c.id === centerId);
  const centerChunk = chunks[center];
  if (!centerChunk) return { text: "", page: undefined };

  const included = new Set<number>([center]);
  let used = centerChunk.text.length;

  // Returns false when the chunk at `pos` does not fit, closing that direction.
  const tryAdd = (pos: number): boolean => {
    const t = chunks[pos]?.text ?? "";
    if (!t) return true; // nothing to add; keep growing past empty chunks
    const cost = (used > 0 ? CONTEXT_SEPARATOR.length : 0) + t.length;
    if (used + cost > budget) return false;
    included.add(pos);
    used += cost;
    return true;
  };

  let growForward = true;
  let growBackward = true;
  // Bounded by the chunk count so huge `before`/`after` values cannot spin.
  const reach = Math.min(Math.max(before, after), chunks.length);
  for (let d = 1; d <= reach && (growForward || growBackward); d++) {
    if (growForward && d <= after && center + d < chunks.length) {
      growForward = tryAdd(center + d);
    }
    if (growBackward && d <= before && center - d >= 0) {
      growBackward = tryAdd(center - d);
    }
  }

  const parts: string[] = [];
  let page = centerChunk.page;
  for (const pos of Array.from(included).sort((a, b) => a - b)) {
    const c = chunks[pos];
    if (!c || !c.text) continue;
    parts.push(c.text);
    if (page == null && c.page != null) page = c.page;
  }
  return { text: parts.join(CONTEXT_SEPARATOR), page };
}

/**
 * Wrap whole-word occurrences of the query tokens in square brackets.
 *
 * Word boundaries are Unicode-aware (letters, marks and numbers count as word
 * characters), matching the tokenizer, so tokens such as "café" are found.
 * Matching is case-insensitive and prefers the longest token at a position.
 *
 * @param text Text to annotate.
 * @param query Query whose tokens should be highlighted.
 * @returns `text` with each match replaced by `[match]`.
 */
export function highlight(text: string, query: string): string {
  const tokens = Array.from(new Set(tokenize(query)))
    // Ignore tokens made only of hyphens; they carry no searchable content.
    .filter((t) => /[\p{L}\p{N}]/u.test(t))
    // Longest first so "foo-bar" wins over "foo" in the alternation.
    .sort((a, b) => b.length - a.length);
  if (!tokens.length) return text;

  // Escape regex special chars in tokens
  const esc = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const wordChar = "[\\p{L}\\p{M}\\p{N}]";
  const pattern = new RegExp(
    `(?<!${wordChar})(?:${tokens.map(esc).join("|")})(?!${wordChar})`,
    "giu"
  );
  return text.replace(pattern, (m) => `[${m}]`);
}

/**
 * Utility to write a value to a file as compact JSON.
 *
 * @param file Destination path.
 * @param data Value to serialize.
 * @throws TypeError when `data` has no JSON representation (e.g. `undefined`).
 */
export async function writeJson(file: string, data: unknown): Promise<void> {
  const content: string | undefined = JSON.stringify(data);
  if (content === undefined) {
    throw new TypeError("writeJson: data is not JSON-serializable");
  }
  await fs.promises.writeFile(file, content);
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/russellbrenner/pdfsearch-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

pdf.ts•6.84 KiB