"""
Academic section detection for research papers
"""
import re
from typing import Dict, Any, Optional
from ..core.pdf_processor import PDFProcessor
class SectionDetector:
"""Detects and extracts academic paper sections"""
    # Common academic section heading patterns.
    # Matching is case-insensitive (see _match_section_pattern), so a single
    # lower-case variant covers "Abstract", "ABSTRACT", "abstract", etc.
    SECTION_PATTERNS = {
        "abstract": [
            r"^abstract\s*$",
            r"^\d+\.\s*abstract",
        ],
        "introduction": [
            r"^introduction\s*$",
            r"^\d+\.\s*introduction",
        ],
        "methods": [
            r"^methods?\s*$",
            r"^methodology\s*$",
            r"^\d+\.\s*methods?",
            r"^\d+\.\s*methodology",
        ],
        "results": [
            r"^results?\s*$",
            r"^findings\s*$",
            r"^\d+\.\s*results?",
            r"^\d+\.\s*findings",
        ],
        "discussion": [
            r"^discussion\s*$",
            r"^\d+\.\s*discussion",
        ],
        "conclusion": [
            r"^conclusions?\s*$",
            r"^\d+\.\s*conclusions?",
        ],
        "references": [
            r"^references\s*$",
            r"^bibliography\s*$",
            r"^\d+\.\s*references",
        ]
    }
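
    # Example headings these patterns are intended to catch (typical cases):
    # "ABSTRACT", "1. Introduction", "3. Methodology", "Findings", "References".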
@staticmethod
    async def detect_sections(pdf_path: str) -> Dict[str, Any]:
        """Detect academic sections in the PDF, returning their content and line ranges"""
        text = await PDFProcessor.extract_raw_text(pdf_path)
        lines = text.split('\n')

        sections = {}
        current_section = None
        section_content = []
        section_start = 0       # line index where the current section's content begins
        last_content_line = 0   # line index of the most recent non-blank content line

        for i, line in enumerate(lines):
            line = line.strip()
            if not line:
                continue

            # Check whether the line is a section heading
            detected_section = SectionDetector._match_section_pattern(line)
            if detected_section:
                # Save the previous section before starting a new one
                if current_section and section_content:
                    sections[current_section] = {
                        "content": '\n'.join(section_content).strip(),
                        "line_start": section_start,
                        "line_end": last_content_line,
                        "word_count": len(' '.join(section_content).split())
                    }
                # Start a new section; its content begins after the heading line
                current_section = detected_section
                section_content = []
                section_start = i + 1
            elif current_section:
                # Accumulate body text for the current section
                section_content.append(line)
                last_content_line = i

        # Save the last open section
        if current_section and section_content:
            sections[current_section] = {
                "content": '\n'.join(section_content).strip(),
                "line_start": section_start,
                "line_end": last_content_line,
                "word_count": len(' '.join(section_content).split())
            }
return {
"sections": sections,
"sections_found": list(sections.keys()),
"total_sections": len(sections)
}
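
    # Illustrative shape of the detect_sections() return value (hypothetical
    # values, not taken from a real document):
    # {
    #     "sections": {
    #         "abstract": {"content": "...", "line_start": 12, "line_end": 20, "word_count": 180},
    #         "introduction": {"content": "...", "line_start": 24, "line_end": 90, "word_count": 750},
    #     },
    #     "sections_found": ["abstract", "introduction"],
    #     "total_sections": 2
    # }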
@staticmethod
def _match_section_pattern(line: str) -> Optional[str]:
"""Check if line matches any section pattern"""
for section_name, patterns in SectionDetector.SECTION_PATTERNS.items():
for pattern in patterns:
if re.match(pattern, line, re.IGNORECASE):
return section_name
return None
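
    # e.g. _match_section_pattern("2. RESULTS") -> "results"
    #      _match_section_pattern("In this section we ...") -> None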
@staticmethod
async def extract_abstract(pdf_path: str) -> Dict[str, Any]:
"""Extract abstract specifically"""
sections = await SectionDetector.detect_sections(pdf_path)
if "abstract" in sections["sections"]:
abstract_data = sections["sections"]["abstract"]
return {
"abstract": abstract_data["content"],
"word_count": abstract_data["word_count"],
"found": True
}
        # Fallback: look for an abstract-like paragraph near the start of the document
        text = await PDFProcessor.extract_raw_text(pdf_path)
        paragraphs = text.split('\n\n')[:5]  # only consider the first 5 paragraphs

        for para in paragraphs:
            word_count = len(para.split())
            # Typical abstract length combined with research vocabulary
            if 50 < word_count < 300 and any(
                word in para.lower() for word in ['study', 'research', 'analysis', 'investigation']
            ):
                return {
                    "abstract": para.strip(),
                    "word_count": word_count,
                    "found": True,
                    "method": "heuristic"
                }

        return {"abstract": "", "word_count": 0, "found": False}
@staticmethod
async def extract_key_sections(pdf_path: str) -> Dict[str, str]:
"""Extract key academic sections for agent understanding"""
sections_data = await SectionDetector.detect_sections(pdf_path)
sections = sections_data["sections"]
key_sections = {}
# Priority sections for agents
priority_sections = ["abstract", "introduction", "methods", "results", "conclusion"]
for section in priority_sections:
if section in sections:
                content = sections[section]["content"]
                words = content.split()
                # Truncate very long sections for agent consumption
                if len(words) > 500:
                    content = ' '.join(words[:500]) + "... [truncated]"
key_sections[section] = content
return key_sections
@staticmethod
async def get_section_summary(pdf_path: str) -> Dict[str, Any]:
"""Get a summary of detected sections"""
sections_data = await SectionDetector.detect_sections(pdf_path)
sections = sections_data["sections"]
summary = {
"has_abstract": "abstract" in sections,
"has_introduction": "introduction" in sections,
"has_methods": "methods" in sections,
"has_results": "results" in sections,
"has_discussion": "discussion" in sections,
"has_conclusion": "conclusion" in sections,
"has_references": "references" in sections,
"total_sections": len(sections),
"estimated_structure": "academic_paper" if len(sections) >= 4 else "other_document"
}
        # Calculate per-section word counts and their share of the detected text
        total_words = sum(s["word_count"] for s in sections.values())
        section_stats = {}
        for section_name, section_data in sections.items():
            section_stats[section_name] = {
                "word_count": section_data["word_count"],
                "percentage": round(section_data["word_count"] / total_words * 100, 1) if total_words else 0.0
            }
summary["section_statistics"] = section_stats
return summary
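

# Minimal usage sketch (assumptions: the module is run so that the relative
# import above resolves, e.g. via `python -m`, and "paper.pdf" is a hypothetical
# path to a local research paper).
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        summary = await SectionDetector.get_section_summary("paper.pdf")
        print(f"Structure: {summary['estimated_structure']}, sections: {summary['total_sections']}")

        key_sections = await SectionDetector.extract_key_sections("paper.pdf")
        for name, content in key_sections.items():
            print(f"{name}: {content[:80]}...")

    asyncio.run(_demo())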