Literature Review MCP Server

pdf.ts•6.5 KiB

/**
 * 统一的 PDF 解析模块
 * 使用 pdfjs-dist，替代所有 pdfreader 实现
 */

import * as fs from 'fs';
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
import { storage } from '../storage/StorageManager.js';

/**
 * Options accepted by the PDF text extraction functions.
 */
export interface PdfExtractionOptions {
  /** When truthy, the document's info metadata is read and returned with the text. */
  includeMetadata?: boolean;
  /** Maximum number of pages to extract; 0 (or omitted) means no limit. */
  maxPages?: number;
  /** String placed between the text of consecutive pages (a blank line when omitted). */
  pageSeparator?: string;
}

/**
 * Result of a PDF text extraction.
 */
export interface PdfExtractionResult {
  /** Extracted text of the processed pages, joined with the page separator. */
  text: string;
  /** Total number of pages in the document (not reduced by a `maxPages` limit). */
  pageCount: number;
  /** Document info metadata; only populated when metadata was requested. */
  metadata?: {
    title?: string;
    author?: string;
    subject?: string;
    creator?: string;
    producer?: string;
    creationDate?: string;
  };
}

/**
 * Extracts the text of a PDF file with pdfjs-dist (core implementation).
 *
 * Pages are read in order. Within a page the `str` of every text item is
 * joined with single spaces and trimmed; pages that yield no text are skipped.
 * The pdf.js loading task is always destroyed before the function settles, so
 * repeated calls do not accumulate open documents.
 *
 * @param pdfPath Path to the PDF file.
 * @param options Extraction options (page limit, page separator, metadata).
 * @returns The extracted text, the document's total page count and, when
 *          requested, its info metadata.
 * @throws Error prefixed with `PDF 解析失败: ` when the file cannot be read or
 *         parsed, or when fewer than 100 characters of text were extracted.
 */
export async function extractPdfText(
  pdfPath: string,
  options: PdfExtractionOptions = {}
): Promise<PdfExtractionResult> {
  // Declared outside the try block so the finally clause can release it.
  let loadingTask: ReturnType<typeof pdfjsLib.getDocument> | undefined;

  try {
    // Read without blocking the event loop, then copy the Node Buffer into a
    // plain Uint8Array before handing it to pdf.js.
    const dataBuffer = await fs.promises.readFile(pdfPath);
    loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(dataBuffer) });
    const pdf = await loadingTask.promise;

    const pageCount = pdf.numPages;
    const requestedPages = options.maxPages ?? 0;
    const maxPages = requestedPages > 0
      ? Math.min(requestedPages, pageCount)
      : pageCount;
    // `??` rather than `||` so an explicit empty-string separator is honoured.
    const pageSeparator = options.pageSeparator ?? '\n\n';

    const textParts: string[] = [];
    for (let pageNumber = 1; pageNumber <= maxPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        .map((item: unknown) => (item as { str?: unknown }).str)
        // Items without a string `str` carry no text; drop them instead of
        // joining them in as empty segments.
        .filter((str: unknown): str is string => typeof str === 'string')
        .join(' ')
        .trim();

      if (pageText) {
        textParts.push(pageText);
      }
    }

    const text = textParts.join(pageSeparator);

    // Treat near-empty output as a failed extraction.
    if (text.length < 100) {
      throw new Error('PDF 文本提取失败或内容过少（少于 100 字符）');
    }

    let metadata: PdfExtractionResult['metadata'];
    if (options.includeMetadata) {
      const { info } = await pdf.getMetadata();
      const fields = (info ?? {}) as Record<string, unknown>;
      // Only string values are copied so the result matches its declared type.
      const field = (key: string): string | undefined => {
        const value = fields[key];
        return typeof value === 'string' ? value : undefined;
      };
      metadata = {
        title: field('Title'),
        author: field('Author'),
        subject: field('Subject'),
        creator: field('Creator'),
        producer: field('Producer'),
        creationDate: field('CreationDate')
      };
    }

    return {
      text,
      pageCount,
      metadata
    };

  } catch (error) {
    console.error('PDF 解析失败:', error);
    throw new Error(`PDF 解析失败: ${error instanceof Error ? error.message : String(error)}`);
  } finally {
    if (loadingTask) {
      // Best-effort cleanup: a failure to tear down must not mask the
      // extraction result or the original error.
      await loadingTask.destroy().catch(() => undefined);
    }
  }
}

/**
 * Extracts the text of a PDF and persists it through the storage manager.
 *
 * If text is already stored for `arxivId` it is returned as-is and the PDF is
 * not parsed. Otherwise the extracted text (preceded by a paper-information
 * header when `paperInfo` is given) is saved, and the paper's database record
 * is either updated with the text path or created if it does not exist yet.
 *
 * @param arxivId arXiv ID (version suffix already stripped).
 * @param pdfPath Path to the PDF file.
 * @param paperInfo Optional paper information; this function reads its
 *                  `title`, `published`, `authors` and `summary` fields.
 * @returns The stored text content (pre-existing or newly written).
 * @throws Error prefixed with `提取并保存 PDF 文本失败: ` if any step fails.
 */
export async function extractAndSavePdfText(
  arxivId: string,
  pdfPath: string,
  paperInfo?: any
): Promise<string> {
  try {
    // Reuse previously extracted text when available.
    const textPath = storage.getTextPath(arxivId);
    const existingText = storage.readText(arxivId);
    if (existingText) {
      console.log(`✓ 文本文件已存在: ${textPath}`);
      return existingText;
    }

    console.log(`📄 提取 PDF 文本: ${arxivId}`);
    const result = await extractPdfText(pdfPath);

    // One fallback title shared by the file header and the database record,
    // so the header never reads "undefined" when the title is missing.
    const title = paperInfo?.title || `arXiv:${arxivId}`;

    let outputContent = '';

    // Paper-information header (only when paper info was supplied).
    if (paperInfo) {
      outputContent += `=== 论文信息 ===\n`;
      outputContent += `标题: ${title}\n`;
      outputContent += `arXiv ID: ${arxivId}\n`;
      outputContent += `发布日期: ${paperInfo.published || 'N/A'}\n`;

      if (paperInfo.authors && paperInfo.authors.length > 0) {
        // Authors may be objects with a `name` field or plain values.
        const authorNames = paperInfo.authors.map((author: any) =>
          author.name || author
        ).join(', ');
        outputContent += `作者: ${authorNames}\n`;
      }

      if (paperInfo.summary) {
        outputContent += `摘要: ${paperInfo.summary}\n`;
      }

      outputContent += `\n=== PDF 解析文本 ===\n\n`;
    }

    outputContent += result.text;

    // Persist the text file.
    const savedTextPath = storage.saveText(arxivId, outputContent);
    console.log(`✓ 文本已保存: ${savedTextPath}`);

    // Record the text path in the database, creating a basic paper record
    // when none exists yet.
    const paper = storage.db.getPaperByArxivId(arxivId);
    if (paper) {
      storage.db.updatePaper(arxivId, { text_path: savedTextPath });
      console.log(`✓ 数据库已更新`);
    } else {
      storage.db.insertOrUpdatePaper({
        arxiv_id: arxivId,
        title,
        abstract: paperInfo?.summary,
        publication_date: paperInfo?.published,
        text_path: savedTextPath,
        source: 'arxiv'
      });
      console.log(`✓ 论文记录已创建`);
    }

    return outputContent;

  } catch (error) {
    console.error('提取并保存 PDF 文本失败:', error);
    throw new Error(`提取并保存 PDF 文本失败: ${error instanceof Error ? error.message : String(error)}`);
  }
}

/**
 * Convenience wrapper around `extractPdfText` that uses the default options
 * and returns only the extracted text.
 *
 * @param pdfPath Path to the PDF file.
 * @returns The extracted text.
 */
export async function extractPdfTextSimple(pdfPath: string): Promise<string> {
  const { text } = await extractPdfText(pdfPath);
  return text;
}

/**
 * Checks that a path points to an existing regular file that the current
 * process is allowed to read.
 *
 * Only the file system entry is inspected; the file's contents are not
 * validated as a PDF.
 *
 * @param pdfPath Path to the PDF file.
 * @returns `true` when the path is a readable regular file, `false` otherwise
 *          (including when the path is missing or cannot be inspected).
 */
export function isPdfReadable(pdfPath: string): boolean {
  try {
    // statSync throws for a missing path, so no separate existence check is needed.
    if (!fs.statSync(pdfPath).isFile()) {
      return false;
    }
    // Existence alone does not imply readability; verify read permission.
    fs.accessSync(pdfPath, fs.constants.R_OK);
    return true;
  } catch {
    return false;
  }
}

/**
 * Returns the number of pages in a PDF without extracting any text.
 *
 * The pdf.js loading task is destroyed before the function settles so the
 * opened document does not stay alive after the count has been read.
 *
 * @param pdfPath Path to the PDF file.
 * @returns The document's page count.
 * @throws Error prefixed with `获取 PDF 页数失败: ` when the file cannot be
 *         read or parsed.
 */
export async function getPdfPageCount(pdfPath: string): Promise<number> {
  // Declared outside the try block so the finally clause can release it.
  let loadingTask: ReturnType<typeof pdfjsLib.getDocument> | undefined;

  try {
    // Read without blocking the event loop, then copy the Node Buffer into a
    // plain Uint8Array before handing it to pdf.js.
    const dataBuffer = await fs.promises.readFile(pdfPath);
    loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(dataBuffer) });
    const pdf = await loadingTask.promise;
    return pdf.numPages;
  } catch (error) {
    throw new Error(`获取 PDF 页数失败: ${error instanceof Error ? error.message : String(error)}`);
  } finally {
    if (loadingTask) {
      // Best-effort cleanup: a teardown failure must not mask the result.
      await loadingTask.destroy().catch(() => undefined);
    }
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ydzat/literature-review-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

pdf.ts•6.5 KiB

/**
 * 统一的 PDF 解析模块
 * 使用 pdfjs-dist，替代所有 pdfreader 实现
 */

import * as fs from 'fs';
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
import { storage } from '../storage/StorageManager.js';

/**
 * Options accepted by the PDF text extraction functions.
 */
export interface PdfExtractionOptions {
  /** When truthy, the document's info metadata is read and returned with the text. */
  includeMetadata?: boolean;
  /** Maximum number of pages to extract; 0 (or omitted) means no limit. */
  maxPages?: number;
  /** String placed between the text of consecutive pages (a blank line when omitted). */
  pageSeparator?: string;
}

/**
 * Result of a PDF text extraction.
 */
export interface PdfExtractionResult {
  /** Extracted text of the processed pages, joined with the page separator. */
  text: string;
  /** Total number of pages in the document (not reduced by a `maxPages` limit). */
  pageCount: number;
  /** Document info metadata; only populated when metadata was requested. */
  metadata?: {
    title?: string;
    author?: string;
    subject?: string;
    creator?: string;
    producer?: string;
    creationDate?: string;
  };
}

/**
 * Extracts the text of a PDF file with pdfjs-dist (core implementation).
 *
 * Pages are read in order. Within a page the `str` of every text item is
 * joined with single spaces and trimmed; pages that yield no text are skipped.
 * The pdf.js loading task is always destroyed before the function settles, so
 * repeated calls do not accumulate open documents.
 *
 * @param pdfPath Path to the PDF file.
 * @param options Extraction options (page limit, page separator, metadata).
 * @returns The extracted text, the document's total page count and, when
 *          requested, its info metadata.
 * @throws Error prefixed with `PDF 解析失败: ` when the file cannot be read or
 *         parsed, or when fewer than 100 characters of text were extracted.
 */
export async function extractPdfText(
  pdfPath: string,
  options: PdfExtractionOptions = {}
): Promise<PdfExtractionResult> {
  // Declared outside the try block so the finally clause can release it.
  let loadingTask: ReturnType<typeof pdfjsLib.getDocument> | undefined;

  try {
    // Read without blocking the event loop, then copy the Node Buffer into a
    // plain Uint8Array before handing it to pdf.js.
    const dataBuffer = await fs.promises.readFile(pdfPath);
    loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(dataBuffer) });
    const pdf = await loadingTask.promise;

    const pageCount = pdf.numPages;
    const requestedPages = options.maxPages ?? 0;
    const maxPages = requestedPages > 0
      ? Math.min(requestedPages, pageCount)
      : pageCount;
    // `??` rather than `||` so an explicit empty-string separator is honoured.
    const pageSeparator = options.pageSeparator ?? '\n\n';

    const textParts: string[] = [];
    for (let pageNumber = 1; pageNumber <= maxPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        .map((item: unknown) => (item as { str?: unknown }).str)
        // Items without a string `str` carry no text; drop them instead of
        // joining them in as empty segments.
        .filter((str: unknown): str is string => typeof str === 'string')
        .join(' ')
        .trim();

      if (pageText) {
        textParts.push(pageText);
      }
    }

    const text = textParts.join(pageSeparator);

    // Treat near-empty output as a failed extraction.
    if (text.length < 100) {
      throw new Error('PDF 文本提取失败或内容过少（少于 100 字符）');
    }

    let metadata: PdfExtractionResult['metadata'];
    if (options.includeMetadata) {
      const { info } = await pdf.getMetadata();
      const fields = (info ?? {}) as Record<string, unknown>;
      // Only string values are copied so the result matches its declared type.
      const field = (key: string): string | undefined => {
        const value = fields[key];
        return typeof value === 'string' ? value : undefined;
      };
      metadata = {
        title: field('Title'),
        author: field('Author'),
        subject: field('Subject'),
        creator: field('Creator'),
        producer: field('Producer'),
        creationDate: field('CreationDate')
      };
    }

    return {
      text,
      pageCount,
      metadata
    };

  } catch (error) {
    console.error('PDF 解析失败:', error);
    throw new Error(`PDF 解析失败: ${error instanceof Error ? error.message : String(error)}`);
  } finally {
    if (loadingTask) {
      // Best-effort cleanup: a failure to tear down must not mask the
      // extraction result or the original error.
      await loadingTask.destroy().catch(() => undefined);
    }
  }
}

/**
 * Extracts the text of a PDF and persists it through the storage manager.
 *
 * If text is already stored for `arxivId` it is returned as-is and the PDF is
 * not parsed. Otherwise the extracted text (preceded by a paper-information
 * header when `paperInfo` is given) is saved, and the paper's database record
 * is either updated with the text path or created if it does not exist yet.
 *
 * @param arxivId arXiv ID (version suffix already stripped).
 * @param pdfPath Path to the PDF file.
 * @param paperInfo Optional paper information; this function reads its
 *                  `title`, `published`, `authors` and `summary` fields.
 * @returns The stored text content (pre-existing or newly written).
 * @throws Error prefixed with `提取并保存 PDF 文本失败: ` if any step fails.
 */
export async function extractAndSavePdfText(
  arxivId: string,
  pdfPath: string,
  paperInfo?: any
): Promise<string> {
  try {
    // Reuse previously extracted text when available.
    const textPath = storage.getTextPath(arxivId);
    const existingText = storage.readText(arxivId);
    if (existingText) {
      console.log(`✓ 文本文件已存在: ${textPath}`);
      return existingText;
    }

    console.log(`📄 提取 PDF 文本: ${arxivId}`);
    const result = await extractPdfText(pdfPath);

    // One fallback title shared by the file header and the database record,
    // so the header never reads "undefined" when the title is missing.
    const title = paperInfo?.title || `arXiv:${arxivId}`;

    let outputContent = '';

    // Paper-information header (only when paper info was supplied).
    if (paperInfo) {
      outputContent += `=== 论文信息 ===\n`;
      outputContent += `标题: ${title}\n`;
      outputContent += `arXiv ID: ${arxivId}\n`;
      outputContent += `发布日期: ${paperInfo.published || 'N/A'}\n`;

      if (paperInfo.authors && paperInfo.authors.length > 0) {
        // Authors may be objects with a `name` field or plain values.
        const authorNames = paperInfo.authors.map((author: any) =>
          author.name || author
        ).join(', ');
        outputContent += `作者: ${authorNames}\n`;
      }

      if (paperInfo.summary) {
        outputContent += `摘要: ${paperInfo.summary}\n`;
      }

      outputContent += `\n=== PDF 解析文本 ===\n\n`;
    }

    outputContent += result.text;

    // Persist the text file.
    const savedTextPath = storage.saveText(arxivId, outputContent);
    console.log(`✓ 文本已保存: ${savedTextPath}`);

    // Record the text path in the database, creating a basic paper record
    // when none exists yet.
    const paper = storage.db.getPaperByArxivId(arxivId);
    if (paper) {
      storage.db.updatePaper(arxivId, { text_path: savedTextPath });
      console.log(`✓ 数据库已更新`);
    } else {
      storage.db.insertOrUpdatePaper({
        arxiv_id: arxivId,
        title,
        abstract: paperInfo?.summary,
        publication_date: paperInfo?.published,
        text_path: savedTextPath,
        source: 'arxiv'
      });
      console.log(`✓ 论文记录已创建`);
    }

    return outputContent;

  } catch (error) {
    console.error('提取并保存 PDF 文本失败:', error);
    throw new Error(`提取并保存 PDF 文本失败: ${error instanceof Error ? error.message : String(error)}`);
  }
}

/**
 * Convenience wrapper around `extractPdfText` that uses the default options
 * and returns only the extracted text.
 *
 * @param pdfPath Path to the PDF file.
 * @returns The extracted text.
 */
export async function extractPdfTextSimple(pdfPath: string): Promise<string> {
  const { text } = await extractPdfText(pdfPath);
  return text;
}

/**
 * Checks that a path points to an existing regular file that the current
 * process is allowed to read.
 *
 * Only the file system entry is inspected; the file's contents are not
 * validated as a PDF.
 *
 * @param pdfPath Path to the PDF file.
 * @returns `true` when the path is a readable regular file, `false` otherwise
 *          (including when the path is missing or cannot be inspected).
 */
export function isPdfReadable(pdfPath: string): boolean {
  try {
    // statSync throws for a missing path, so no separate existence check is needed.
    if (!fs.statSync(pdfPath).isFile()) {
      return false;
    }
    // Existence alone does not imply readability; verify read permission.
    fs.accessSync(pdfPath, fs.constants.R_OK);
    return true;
  } catch {
    return false;
  }
}

/**
 * Returns the number of pages in a PDF without extracting any text.
 *
 * The pdf.js loading task is destroyed before the function settles so the
 * opened document does not stay alive after the count has been read.
 *
 * @param pdfPath Path to the PDF file.
 * @returns The document's page count.
 * @throws Error prefixed with `获取 PDF 页数失败: ` when the file cannot be
 *         read or parsed.
 */
export async function getPdfPageCount(pdfPath: string): Promise<number> {
  // Declared outside the try block so the finally clause can release it.
  let loadingTask: ReturnType<typeof pdfjsLib.getDocument> | undefined;

  try {
    // Read without blocking the event loop, then copy the Node Buffer into a
    // plain Uint8Array before handing it to pdf.js.
    const dataBuffer = await fs.promises.readFile(pdfPath);
    loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(dataBuffer) });
    const pdf = await loadingTask.promise;
    return pdf.numPages;
  } catch (error) {
    throw new Error(`获取 PDF 页数失败: ${error instanceof Error ? error.message : String(error)}`);
  } finally {
    if (loadingTask) {
      // Best-effort cleanup: a teardown failure must not mask the result.
      await loadingTask.destroy().catch(() => undefined);
    }
  }
}