Skip to main content
Glama
fetcher.ts (4.45 kB)
import axios from "axios";
import * as cheerio from "cheerio";
import { fileTypeFromBuffer } from "file-type";
import { pdf } from "pdf-parse";
import tesseract from "node-tesseract-ocr";
import * as tmp from "tmp";
import * as fs from "fs/promises";

/** Result of fetching a document and converting it to plain text. */
export interface FetchResponse {
  /** Text extracted from the document. */
  text: string;
  /** `Content-Type` header as received, else the sniffed MIME type, else `"unknown"`. */
  contentType: string;
  /** URL the document was requested from. */
  sourceUrl: string;
  /** True when the text was produced by OCR rather than direct extraction. */
  ocrUsed: boolean;
  /** Basic string metadata about the response (`contentLength`, `contentType`). */
  metadata?: Record<string, string>;
}

/** A PDF whose embedded text is no longer than this is sent to OCR instead. */
const MIN_PDF_TEXT_LENGTH = 100;

/** A main-content candidate in HTML must be longer than this to be accepted. */
const MIN_MAIN_CONTENT_LENGTH = 200;

/**
 * Extracts text from a PDF, falling back to OCR when parsing fails or
 * yields too little text.
 *
 * @param buffer - Raw PDF bytes.
 * @param url - Source URL; used only in warning messages.
 * @returns The extracted text and whether OCR produced it.
 * @throws Error when the OCR fallback itself fails.
 */
async function extractTextFromPdf(
  buffer: Buffer,
  url: string,
): Promise<{ text: string; ocrUsed: boolean }> {
  let extractedText: string;

  // Only the parse step is guarded. Keeping the OCR call outside this try
  // block means an OCR failure propagates once, instead of being reported as
  // a "PDF parsing failed" warning and triggering a second OCR run.
  try {
    const pdfData = await pdf(buffer);
    extractedText = pdfData.text.trim();
  } catch (error) {
    console.warn(`PDF parsing failed for ${url}, attempting OCR:`, error);
    return performOcr(buffer);
  }

  // If we got substantial text, return it
  if (extractedText.length > MIN_PDF_TEXT_LENGTH) {
    return { text: extractedText, ocrUsed: false };
  }

  // Otherwise, fall back to OCR
  console.warn(`PDF at ${url} has minimal text, attempting OCR...`);
  return performOcr(buffer);
}

/**
 * Writes the buffer to a temporary `.pdf` file and runs Tesseract on it.
 *
 * @param buffer - Bytes of the document to recognise.
 * @returns The trimmed recognised text, with `ocrUsed` always true.
 * @throws Error prefixed with `OCR failed:` when writing or recognition fails.
 */
async function performOcr(buffer: Buffer): Promise<{ text: string; ocrUsed: boolean }> {
  // Create a temporary file for Tesseract
  const tmpFile = tmp.fileSync({ postfix: ".pdf" });

  try {
    await fs.writeFile(tmpFile.name, buffer);

    const config = {
      lang: "eng",
      oem: 1,
      psm: 3,
    };

    const text = await tesseract.recognize(tmpFile.name, config);
    return { text: text.trim(), ocrUsed: true };
  } catch (error) {
    throw new Error(`OCR failed: ${error instanceof Error ? error.message : String(error)}`);
  } finally {
    // Always remove the temporary file, whether OCR succeeded or not.
    tmpFile.removeCallback();
  }
}

/**
 * Extracts readable text from an HTML document.
 *
 * Non-content elements are removed first, then a list of main-content
 * selectors is tried in order. The first one whose text is long enough is
 * returned as-is (trimmed only). If none qualifies, the whole body text is
 * returned with runs of whitespace collapsed to single spaces.
 *
 * @param html - HTML source.
 * @param url - Source URL; currently unused, kept for signature stability.
 * @returns The extracted text.
 */
function extractTextFromHtml(html: string, url: string): string {
  const $ = cheerio.load(html);

  // Remove scripts, styles and page chrome (nav, header, footer)
  $("script, style, nav, header, footer").remove();

  // Try to find the main content area
  // Common patterns in legal websites
  const mainContentSelectors = [
    "article",
    "main",
    ".content",
    "#content",
    ".judgment",
    ".decision",
    ".case",
    ".legislation",
    "[role='main']",
  ];

  for (const selector of mainContentSelectors) {
    const $main = $(selector);
    if ($main.length > 0) {
      const text = $main.text().trim();
      if (text.length > MIN_MAIN_CONTENT_LENGTH) {
        return text;
      }
    }
  }

  // Fallback: Extract from body
  const bodyText = $("body").text().trim();

  // Clean up whitespace
  return bodyText.replace(/\s+/g, " ").trim();
}

/**
 * Decodes a text body using the `charset` parameter of the content type.
 *
 * UTF-8 is used when no charset is declared, when it is UTF-8, or when the
 * runtime does not recognise the declared label.
 *
 * @param buffer - Raw response bytes.
 * @param contentType - `Content-Type` header value; may be empty.
 * @returns The decoded string.
 */
function decodeText(buffer: Buffer, contentType: string): string {
  const charset = /charset\s*=\s*"?([^";\s]+)/i.exec(contentType)?.[1]?.toLowerCase();

  if (charset === undefined || charset === "utf-8" || charset === "utf8") {
    return buffer.toString("utf-8");
  }

  try {
    return new TextDecoder(charset).decode(buffer);
  } catch {
    // Unknown or unsupported encoding label: fall back to UTF-8.
    return buffer.toString("utf-8");
  }
}

/**
 * Downloads a document and converts it to plain text.
 *
 * PDF, HTML and plain-text responses are supported. The format is chosen from
 * the `Content-Type` header (matched case-insensitively) or, failing that,
 * from the type sniffed from the response bytes.
 *
 * @param url - Location of the document.
 * @returns The extracted text together with content type, source URL, OCR
 *   flag and basic metadata.
 * @throws Error `Failed to fetch document from ...` when the HTTP request fails.
 * @throws Error `Unsupported content type: ...` for any other format.
 * @throws Error `OCR failed: ...` when a PDF needs OCR and OCR fails.
 */
export async function fetchDocumentText(url: string): Promise<FetchResponse> {
  try {
    const response = await axios.get(url, {
      responseType: "arraybuffer",
      headers: {
        "User-Agent": "auslaw-mcp/0.1.0 (legal research tool)",
      },
      timeout: 30000,
      maxContentLength: 50 * 1024 * 1024, // 50MB limit
    });

    const buffer = Buffer.from(response.data);

    // Treat a missing or non-string header as "no content type".
    const rawContentType: unknown = response.headers["content-type"];
    const contentType = typeof rawContentType === "string" ? rawContentType : "";

    // Media types are case-insensitive, so match against a lower-cased copy
    // while reporting the header exactly as received.
    const normalizedType = contentType.toLowerCase();

    // Detect file type from buffer
    const detectedType = await fileTypeFromBuffer(buffer);

    let text: string;
    let ocrUsed = false;

    // Handle PDF documents
    if (
      normalizedType.includes("application/pdf") ||
      detectedType?.mime === "application/pdf"
    ) {
      const result = await extractTextFromPdf(buffer, url);
      text = result.text;
      ocrUsed = result.ocrUsed;
    }
    // Handle HTML documents
    else if (
      normalizedType.includes("text/html") ||
      detectedType?.mime === "text/html"
    ) {
      const html = decodeText(buffer, contentType);
      text = extractTextFromHtml(html, url);
    }
    // Handle plain text
    else if (normalizedType.includes("text/plain")) {
      text = decodeText(buffer, contentType);
    }
    // Unsupported format
    else {
      throw new Error(
        `Unsupported content type: ${contentType}${detectedType ? ` (detected: ${detectedType.mime})` : ""}`,
      );
    }

    // `||` rather than `??`: an empty header must fall through to the sniffed type.
    const resolvedContentType = contentType || detectedType?.mime || "unknown";

    // Extract basic metadata
    const metadata: Record<string, string> = {
      contentLength: String(buffer.length),
      contentType: resolvedContentType,
    };

    return {
      text,
      contentType: resolvedContentType,
      sourceUrl: url,
      ocrUsed,
      metadata,
    };
  } catch (error) {
    if (axios.isAxiosError(error)) {
      throw new Error(
        `Failed to fetch document from ${url}: ${error.message}`,
      );
    }
    throw error;
  }
}

Implementation Reference

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/russellbrenner/auslaw-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.