PDF Reader MCP Server

Overview Schema Related Servers Score Discussions

citation_parser.py•8.5 KiB

""" Citation and reference parsing for academic papers """ import re from typing import List, Dict, Any, Optional from ..core.pdf_processor import PDFProcessor class CitationParser: """Parses citations and references from academic papers""" # Citation patterns IN_TEXT_PATTERNS = [ r'\(([A-Z][a-z]+ et al\.?, \d{4}[a-z]?)\)', # (Smith et al., 2020) r'\(([A-Z][a-z]+ & [A-Z][a-z]+, \d{4}[a-z]?)\)', # (Smith & Jones, 2020) r'\(([A-Z][a-z]+, \d{4}[a-z]?)\)', # (Smith, 2020) r'\[(\d+)\]', # [1] r'\[(\d+)-(\d+)\]', # [1-3] r'\[(\d+,\s*\d+(?:,\s*\d+)*)\]', # [1, 2, 3] ] # Reference patterns REFERENCE_PATTERNS = [ # Author, A. (Year). Title. Journal, Volume(Issue), pages. r'^([A-Z][a-z]+(?:,\s[A-Z]\.)*(?:\s&\s[A-Z][a-z]+(?:,\s[A-Z]\.)*)*)\.\s*\((\d{4}[a-z]?)\)\.\s*(.+?)\.\s*([^,]+),\s*(\d+)(?:\((\d+)\))?,\s*(\d+-\d+)\.', # Author, A., & Author, B. (Year). Title. Journal, Volume, pages. r'^([A-Z][a-z]+(?:,\s[A-Z]\.)*(?:\s&\s[A-Z][a-z]+(?:,\s[A-Z]\.)*)*)\.\s*\((\d{4}[a-z]?)\)\.\s*(.+?)\.\s*([^,]+),\s*(\d+),\s*(\d+-\d+)\.', # Numbered references: [1] Author, A. (Year). Title... 
r'^\[(\d+)\]\s+([A-Z][a-z]+(?:,\s[A-Z]\.)*(?:(?:,\s)?\s?&\s[A-Z][a-z]+(?:,\s[A-Z]\.)*)*)\.\s*\((\d{4}[a-z]?)\)\.\s*(.+)', ] @staticmethod async def extract_citations(pdf_path: str) -> Dict[str, Any]: """Extract all citations from the PDF""" text = await PDFProcessor.extract_raw_text(pdf_path) in_text_citations = CitationParser._find_in_text_citations(text) references = await CitationParser._extract_references(pdf_path) return { "in_text_citations": in_text_citations, "references": references, "citation_count": len(in_text_citations), "reference_count": len(references), "citation_style": CitationParser._detect_citation_style(in_text_citations) } @staticmethod def _find_in_text_citations(text: str) -> List[Dict[str, Any]]: """Find in-text citations""" citations = [] for pattern in CitationParser.IN_TEXT_PATTERNS: matches = re.finditer(pattern, text) for match in matches: citation_text = match.group(0) position = match.start() # Get context (50 chars before and after) context_start = max(0, position - 50) context_end = min(len(text), position + len(citation_text) + 50) context = text[context_start:context_end] citations.append({ "citation": citation_text, "position": position, "context": context, "type": CitationParser._classify_citation_type(citation_text) }) # Remove duplicates and sort by position unique_citations = [] seen = set() for citation in sorted(citations, key=lambda x: x["position"]): if citation["citation"] not in seen: unique_citations.append(citation) seen.add(citation["citation"]) return unique_citations @staticmethod async def _extract_references(pdf_path: str) -> List[Dict[str, Any]]: """Extract reference list""" from .section_detector import SectionDetector sections = await SectionDetector.detect_sections(pdf_path) if "references" not in sections["sections"]: return [] ref_text = sections["sections"]["references"]["content"] references = [] # Split by lines and clean lines = [line.strip() for line in ref_text.split('\n') if line.strip()] 
current_ref = "" ref_number = 1 for line in lines: # Check if line starts a new reference if (re.match(r'^\[\d+\]', line) or re.match(r'^\d+\.', line) or re.match(r'^[A-Z][a-z]+,', line)): # Save previous reference if current_ref: parsed_ref = CitationParser._parse_reference(current_ref, ref_number - 1) if parsed_ref: references.append(parsed_ref) current_ref = line ref_number += 1 else: # Continue previous reference current_ref += " " + line # Save last reference if current_ref: parsed_ref = CitationParser._parse_reference(current_ref, ref_number - 1) if parsed_ref: references.append(parsed_ref) return references @staticmethod def _parse_reference(ref_text: str, ref_number: int) -> Optional[Dict[str, Any]]: """Parse a single reference""" ref_text = ref_text.strip() if len(ref_text) < 20: # Too short to be valid reference return None # Extract basic components using simple patterns parsed = { "reference_number": ref_number, "raw_text": ref_text, "authors": [], "year": "", "title": "", "journal": "", "volume": "", "pages": "" } # Extract year year_match = re.search(r'\((\d{4}[a-z]?)\)', ref_text) if year_match: parsed["year"] = year_match.group(1) # Extract DOI doi_match = re.search(r'doi[:\s]*(10\.\d+/[^\s]+)', ref_text, re.IGNORECASE) if doi_match: parsed["doi"] = doi_match.group(1) # Extract URL url_match = re.search(r'https?://[^\s]+', ref_text) if url_match: parsed["url"] = url_match.group(0) # Simple author extraction (first part before year) if year_match: author_part = ref_text[:year_match.start()].strip() # Remove reference numbers author_part = re.sub(r'^\[\d+\]\s*', '', author_part) author_part = re.sub(r'^\d+\.\s*', '', author_part) parsed["authors_raw"] = author_part return parsed @staticmethod def _classify_citation_type(citation: str) -> str: """Classify the type of citation""" if re.match(r'\[\d+\]', citation): return "numbered" elif re.match(r'\([A-Z]', citation): return "author_year" else: return "other" @staticmethod def 
_detect_citation_style(citations: List[Dict[str, Any]]) -> str: """Detect the predominant citation style""" if not citations: return "unknown" types = [c["type"] for c in citations] if types.count("numbered") > types.count("author_year"): return "numbered" elif types.count("author_year") > types.count("numbered"): return "apa_harvard" else: return "mixed" @staticmethod async def get_citation_summary(pdf_path: str) -> Dict[str, Any]: """Get summary of citations for agent understanding""" citation_data = await CitationParser.extract_citations(pdf_path) return { "total_citations": citation_data["citation_count"], "total_references": citation_data["reference_count"], "citation_style": citation_data["citation_style"], "has_bibliography": citation_data["reference_count"] > 0, "heavily_cited": citation_data["citation_count"] > 20, "reference_years": CitationParser._extract_reference_years(citation_data["references"]) } @staticmethod def _extract_reference_years(references: List[Dict[str, Any]]) -> Dict[str, Any]: """Extract publication years from references""" years = [] for ref in references: if ref["year"]: try: year = int(ref["year"][:4]) # Remove letter suffixes years.append(year) except ValueError: continue if not years: return {"min_year": None, "max_year": None, "year_range": 0} return { "min_year": min(years), "max_year": max(years), "year_range": max(years) - min(years), "recent_references": sum(1 for y in years if y >= 2015) }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/averagejoeslab/pdf-reader-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

citation_parser.py•8.5 KiB