Skip to main content
Glama
server.py (11.3 kB)
"""Academic PDF Reader MCP Server with enhanced capabilities.

Registers tools, prompts and a resource on a FastMCP server.  Every handler
delegates the actual PDF work to the project's processor classes
(``PDFProcessor``, ``AcademicTextProcessor``, ``SectionDetector``,
``CitationParser``) and only formats their results as text for the client.
"""

import json
import os
from typing import Any, Dict, Optional
import uuid

from mcp.server.fastmcp import FastMCP
import mcp.types as types

from .core.pdf_processor import PDFProcessor
from .academic.text_processor import AcademicTextProcessor
from .academic.section_detector import SectionDetector
from .academic.citation_parser import CitationParser

# PDF file storage: maps a generated file ID to the record built by load_pdf.
pdf_files: Dict[str, Dict[str, Any]] = {}

# Create FastMCP server
# NOTE(review): FastMCP appears to surface handler docstrings to clients as
# tool/prompt descriptions, so the original summary lines are kept verbatim.
mcp_server = FastMCP("academic-pdf-reader")


def _preview(text: str, limit: int) -> str:
    """Return at most ``limit`` characters of ``text``.

    An ellipsis is appended only when the text was actually truncated, so
    short texts are not reported as if something had been cut off.
    """
    if len(text) > limit:
        return f"{text[:limit]}..."
    return text


# Basic PDF Tools
@mcp_server.tool()
async def load_pdf(file_path: str, name: Optional[str] = None) -> str:
    """Load a PDF file for processing

    Args:
        file_path: Path of the PDF on the server's file system.
        name: Optional display name; defaults to the file's base name.

    Returns:
        A text summary with the display name, page count and the generated
        file ID under which the file was recorded in ``pdf_files``.

    Raises:
        ValueError: If ``file_path`` is empty or does not exist.
    """
    if not file_path or not os.path.exists(file_path):
        raise ValueError(f"File not found: {file_path}")

    display_name = name or os.path.basename(file_path)
    metadata = await PDFProcessor.get_metadata(file_path)

    file_id = str(uuid.uuid4())
    pdf_files[file_id] = {
        "path": file_path,
        "name": display_name,
        "page_count": metadata["page_count"],
        "metadata": metadata,
    }

    return (
        f"Loaded PDF: {display_name}\n"
        f"Pages: {metadata['page_count']}\n"
        f"File ID: {file_id}"
    )


@mcp_server.tool()
async def get_metadata(file_path: str) -> str:
    """Get PDF metadata and document information

    Args:
        file_path: Path of the PDF to inspect.

    Returns:
        The metadata returned by ``PDFProcessor.get_metadata`` rendered as
        indented JSON under a "PDF Metadata:" heading.
    """
    metadata = await PDFProcessor.get_metadata(file_path)
    return f"PDF Metadata:\n{json.dumps(metadata, indent=2)}"


@mcp_server.tool()
async def extract_images(file_path: str, page: Optional[int] = None) -> str:
    """Extract images from PDF

    Args:
        file_path: Path of the PDF to scan.
        page: Optional page to restrict the extraction to; passed through
            unchanged to ``PDFProcessor.extract_images``.

    Returns:
        A text report with the image count and, when images were found, one
        line per image giving its page and pixel dimensions.
    """
    images = await PDFProcessor.extract_images(file_path, page)

    parts = [f"Found {len(images)} images"]
    if page is not None:
        parts.append(f" on page {page}")

    if images:
        parts.append("\n\nImage details:\n")
        parts.extend(
            f"- Page {img['page']}: {img['width']}x{img['height']} pixels\n"
            for img in images
        )

    return "".join(parts)


@mcp_server.tool()
async def render_page(file_path: str, page: int, dpi: int = 150) -> str:
    """Render a PDF page as an image

    Args:
        file_path: Path of the PDF.
        page: Page to render.
        dpi: Rendering resolution passed to ``PDFProcessor.render_page``.

    Returns:
        A short confirmation containing the length of the rendered payload;
        the image data itself is not included in the response.
    """
    img_b64 = await PDFProcessor.render_page(file_path, page, dpi)
    return f"Rendered page {page} at {dpi} DPI\nImage size: {len(img_b64)} characters"


# Academic Enhancement Tools
@mcp_server.tool()
async def extract_academic_text(file_path: str, page: Optional[int] = None) -> str:
    """Extract text with proper academic reading order and formatting

    Args:
        file_path: Path of the PDF.
        page: Optional single page to process; when omitted the whole
            document is processed.

    Returns:
        For a single page: the processed text plus up to three detected math
        formulas.  For the whole document: the first 2000 characters of the
        processed text, with an ellipsis if it was truncated.
    """
    if page is None:
        result = await AcademicTextProcessor.extract_academic_text(file_path)
        return f"Full document processed:\n\n{_preview(result['full_text'], 2000)}"

    result = await AcademicTextProcessor._process_single_page(file_path, page)
    parts = [f"Page {page} processed text:\n{result['processed_text']}"]

    formulas = result["math_formulas"]
    if formulas:
        parts.append(f"\n\nMath formulas found: {len(formulas)}")
        # Only the first three formulas are listed to keep the reply short.
        parts.extend(
            f"\n Formula {index}: {formula}"
            for index, formula in enumerate(formulas[:3], start=1)
        )

    return "".join(parts)


@mcp_server.tool()
async def detect_sections(file_path: str) -> str:
    """Detect and extract academic paper sections

    Args:
        file_path: Path of the PDF.

    Returns:
        One entry per detected section with its upper-cased name, word count
        and a 200-character content preview, or a fixed message when no
        sections were detected.
    """
    sections_data = await SectionDetector.detect_sections(file_path)
    sections = sections_data["sections"]

    if not sections:
        return "No academic sections detected in this PDF."

    parts = [f"Detected {len(sections)} academic sections:\n\n"]
    for section_name, section_data in sections.items():
        parts.append(
            f"**{section_name.upper()}** ({section_data['word_count']} words)\n"
        )
        parts.append(f"{_preview(section_data['content'], 200)}\n\n")

    return "".join(parts)


@mcp_server.tool()
async def extract_abstract(file_path: str) -> str:
    """Extract the abstract from an academic paper

    Args:
        file_path: Path of the PDF.

    Returns:
        The abstract with its word count and, when the detector reports one,
        the extraction method; or a fixed message when no abstract is found.
    """
    abstract_data = await SectionDetector.extract_abstract(file_path)

    if not abstract_data["found"]:
        return "No abstract found in this PDF."

    parts = [
        f"Abstract ({abstract_data['word_count']} words):\n\n",
        abstract_data["abstract"],
    ]
    if "method" in abstract_data:
        parts.append(f"\n\n[Extracted using {abstract_data['method']} method]")

    return "".join(parts)


@mcp_server.tool()
async def extract_key_sections(file_path: str) -> str:
    """Extract key academic sections optimized for agent understanding

    Args:
        file_path: Path of the PDF.

    Returns:
        Each key section as an upper-cased heading followed by its content
        and a ``---`` separator, or a fixed message when none were found.
    """
    key_sections = await SectionDetector.extract_key_sections(file_path)

    if not key_sections:
        return "No key academic sections found."

    parts = ["Key sections extracted for analysis:\n\n"]
    parts.extend(
        f"**{section_name.upper()}**\n{content}\n\n---\n\n"
        for section_name, content in key_sections.items()
    )
    return "".join(parts)


@mcp_server.tool()
async def extract_citations(file_path: str) -> str:
    """Extract citations and references from the academic paper

    Args:
        file_path: Path of the PDF.

    Returns:
        Citation/reference counts and the detected citation style, followed
        by up to five sample in-text citations and up to three references
        (each reference previewed to 100 characters).
    """
    citation_data = await CitationParser.extract_citations(file_path)

    parts = [
        "Citation Analysis:\n",
        f"- In-text citations: {citation_data['citation_count']}\n",
        f"- Reference list entries: {citation_data['reference_count']}\n",
        f"- Citation style: {citation_data['citation_style']}\n\n",
    ]

    in_text = citation_data["in_text_citations"]
    if in_text:
        parts.append("Sample in-text citations:\n")
        parts.extend(
            f" {citation['citation']} - {citation['type']}\n"
            for citation in in_text[:5]
        )

    references = citation_data["references"]
    if references:
        parts.append("\nFirst few references:\n")
        # _preview adds the ellipsis only when the reference text was cut.
        parts.extend(
            f" [{ref['reference_number']}] {_preview(ref['raw_text'], 100)}\n"
            for ref in references[:3]
        )

    return "".join(parts)


@mcp_server.tool()
async def chunk_content(file_path: str, chunk_size: int = 1000) -> str:
    """Break PDF content into agent-friendly chunks

    Args:
        file_path: Path of the PDF.
        chunk_size: Chunk size passed to
            ``AcademicTextProcessor.chunk_academic_content``.

    Returns:
        The total chunk count and a preview (page range, word count, first
        200 characters) of the first five chunks, plus a note on how many
        further chunks were omitted.
    """
    chunks = await AcademicTextProcessor.chunk_academic_content(file_path, chunk_size)
    shown = 5  # Show first 5 chunks

    parts = [f"Content chunked into {len(chunks)} segments:\n\n"]
    for number, chunk in enumerate(chunks[:shown], start=1):
        parts.append(
            f"**Chunk {number}** (Pages {chunk['page_start']}-{chunk['page_end']}, "
            f"{chunk['word_count']} words)\n"
        )
        parts.append(f"{_preview(chunk['text'], 200)}\n\n")

    if len(chunks) > shown:
        parts.append(f"... and {len(chunks) - shown} more chunks\n")

    return "".join(parts)


@mcp_server.tool()
async def analyze_document_structure(file_path: str) -> str:
    """Analyze the overall structure and characteristics of the academic document

    Args:
        file_path: Path of the PDF.

    Returns:
        A report combining the section summary, citation summary and page
        count: estimated document type, sections present, citation totals
        and, when a minimum reference year is known, the reference year span.
    """
    section_summary = await SectionDetector.get_section_summary(file_path)
    citation_summary = await CitationParser.get_citation_summary(file_path)
    metadata = await PDFProcessor.get_metadata(file_path)

    parts = [
        "Document Structure Analysis:\n\n",
        f"**Document Type**: {section_summary['estimated_structure']}\n",
        f"**Total Pages**: {metadata['page_count']}\n",
        f"**Academic Sections Found**: {section_summary['total_sections']}\n\n",
        "**Section Coverage**:\n",
    ]

    # The summary's truthy "has_<section>" keys mark the sections that exist.
    for key, present in section_summary.items():
        if key.startswith("has_") and present:
            section_name = key.replace("has_", "").replace("_", " ").title()
            parts.append(f" ✓ {section_name}\n")

    parts.append("\n**Citation Profile**:\n")
    parts.append(f" - Total citations: {citation_summary['total_citations']}\n")
    parts.append(f" - Reference count: {citation_summary['total_references']}\n")
    parts.append(f" - Citation style: {citation_summary['citation_style']}\n")

    years = citation_summary["reference_years"]
    if years["min_year"]:
        parts.append(f" - Reference span: {years['min_year']}-{years['max_year']}\n")
        parts.append(f" - Recent refs (2015+): {years['recent_references']}\n")

    return "".join(parts)


# Academic Prompts
@mcp_server.prompt()
async def summarize_academic_paper(file_path: str, focus: str = "general") -> types.PromptMessage:
    """Create an intelligent summary of an academic paper

    Args:
        file_path: Path of the PDF.
        focus: One of "general", "methodology", "results" or "implications";
            any other value falls back to the "general" instruction.

    Returns:
        A user-role prompt message containing the instruction, basic document
        information and the text of every key section.
    """
    key_sections = await SectionDetector.extract_key_sections(file_path)
    metadata = await PDFProcessor.get_metadata(file_path)
    citation_summary = await CitationParser.get_citation_summary(file_path)

    focus_instructions = {
        "general": "Provide a comprehensive overview suitable for researchers",
        "methodology": "Focus on research methods, data collection, and analysis approaches",
        "results": "Emphasize findings, results, and statistical outcomes",
        "implications": "Highlight conclusions, implications, and future research directions",
    }
    instruction = focus_instructions.get(focus, focus_instructions["general"])

    # NOTE(review): the line layout of this template was reconstructed from a
    # whitespace-collapsed copy of the file - confirm against the repository.
    header = f"""Please provide an academic summary of this research paper focusing on {focus}.

{instruction}

Document Information:
- Title: {metadata.get('title', 'N/A')}
- Author: {metadata.get('author', 'N/A')}
- Pages: {metadata.get('page_count', 'N/A')}
- Citations: {citation_summary['total_citations']} in-text, {citation_summary['total_references']} references

Key Sections Available:
"""

    section_blocks = [
        f"\n**{section_name.upper()}:**\n{section_content}\n"
        for section_name, section_content in key_sections.items()
    ]
    content = header + "".join(section_blocks)

    return types.PromptMessage(
        role="user",
        content=types.TextContent(type="text", text=content),
    )


@mcp_server.prompt()
async def analyze_research_methodology(file_path: str) -> types.PromptMessage:
    """Analyze the research methodology of an academic paper

    Args:
        file_path: Path of the PDF.

    Returns:
        A user-role prompt message asking for a methodology analysis.  It
        embeds the detected "methods" section, or a fallback instruction
        when that section is missing or empty.
    """
    sections = await SectionDetector.detect_sections(file_path)
    detected = sections["sections"]

    methods_content = detected["methods"]["content"] if "methods" in detected else ""
    methods_text = methods_content or (
        "Methods section not clearly identified - please analyze the full "
        "document for methodological information."
    )

    # NOTE(review): the line layout of this template was reconstructed from a
    # whitespace-collapsed copy of the file - confirm against the repository.
    content = f"""Please analyze the research methodology of this academic paper.

Focus on:
1. Research design and approach
2. Data collection methods
3. Sample size and characteristics
4. Statistical analysis methods
5. Limitations and validity considerations

Methods Section:
{methods_text}
"""

    return types.PromptMessage(
        role="user",
        content=types.TextContent(type="text", text=content),
    )


# Resources
@mcp_server.resource("file://{path}")
async def read_pdf_resource(path: str) -> str:
    """Read PDF with academic structure awareness

    Args:
        path: Path captured from the ``file://{path}`` resource URI.

    Returns:
        Indented JSON with the PDF metadata, its key sections and a
        ``document_type`` of "academic_paper" when key sections were found,
        otherwise "general_pdf".

    Raises:
        ValueError: If extraction or serialisation fails; the original
            exception is chained as the cause.
    """
    try:
        key_sections = await SectionDetector.extract_key_sections(path)
        metadata = await PDFProcessor.get_metadata(path)

        return json.dumps(
            {
                "metadata": metadata,
                "key_sections": key_sections,
                "document_type": "academic_paper" if key_sections else "general_pdf",
            },
            indent=2,
        )
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"Error reading PDF: {str(e)}") from e


def main():
    """Main entry point for Academic PDF Reader MCP server"""
    mcp_server.run(transport="sse")


if __name__ == "__main__":
    main()

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/averagejoeslab/pdf-reader-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.