LPDP MCP Server

mcp-training
src
document

pdf_loader.py•4.08 KiB

"""PDF Loader using PyMuPDF"""

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List

import fitz  # PyMuPDF


@dataclass
class Document:
    """Represents a document chunk with content and metadata.

    Attributes:
        content: Text of the chunk.
        metadata: Free-form information about the chunk. ``PDFLoader.load``
            fills in the keys ``source``, ``page_number``, ``section`` and
            ``total_pages``.
    """

    content: str
    metadata: Dict[str, Any]


class PDFLoader:
    """Load and extract text from PDF documents."""

    # Common section titles in the LPDP document. Matching is
    # case-insensitive and the first entry (in this order) found near the
    # top of a page wins.
    _SECTION_KEYWORDS = (
        "Dana Pendaftaran",
        "Dana SPP",
        "Dana Tunjangan Buku",
        "Dana Bantuan Penelitian",
        "Dana Bantuan Seminar",
        "Dana Bantuan Publikasi",
        "Dana Transportasi",
        "Dana Aplikasi Visa",
        "Dana Asuransi Kesehatan",
        "Dana Hidup Bulanan",
        "Dana Kedatangan",
        "Dana Tunjangan Keluarga",
        "Insentif Kelulusan",
        "Dana Keadaan Darurat",
        "Dana Pelatihan",
        "Dana Lomba Internasional",
        "Dana Pendamping Disabilitas",
    )

    # Only this many leading characters of a page are scanned for a title.
    _SECTION_SCAN_CHARS = 500

    def __init__(self, file_path: str | Path):
        """Initialize the PDF loader.

        Args:
            file_path: Path to the PDF file.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If ``file_path`` does not have a ``.pdf`` suffix
                (compared case-insensitively).
        """
        self.file_path = Path(file_path)

        if not self.file_path.exists():
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        if self.file_path.suffix.lower() != ".pdf":
            raise ValueError(f"File must be a PDF: {file_path}")

    def load(self) -> List[Document]:
        """Load the PDF and extract text from each page.

        Pages whose extracted text is empty or whitespace-only are skipped,
        so the returned list may be shorter than the page count.

        Returns:
            One ``Document`` per non-empty page, in page order. ``content``
            is the cleaned page text; ``metadata`` holds the file name
            (``source``), the 1-based ``page_number``, the detected
            ``section`` title (empty string if none) and ``total_pages``.
        """
        documents: List[Document] = []

        with fitz.open(self.file_path) as doc:
            # The page count is constant while the file is open.
            total_pages = len(doc)

            for page_number, page in enumerate(doc, start=1):
                raw_text = page.get_text()

                # Skip empty pages
                if not raw_text.strip():
                    continue

                text = self._clean_text(raw_text)

                documents.append(
                    Document(
                        content=text,
                        metadata={
                            "source": self.file_path.name,
                            "page_number": page_number,
                            "section": self._extract_section(text),
                            "total_pages": total_pages,
                        },
                    )
                )

        return documents

    def _clean_text(self, text: str) -> str:
        """Clean extracted text.

        Strips leading/trailing whitespace from every line and drops lines
        that are empty after stripping.

        Args:
            text: Raw extracted text.

        Returns:
            The remaining lines joined with a single newline.
        """
        stripped_lines = (line.strip() for line in text.split("\n"))
        return "\n".join(line for line in stripped_lines if line)

    def _extract_section(self, text: str) -> str:
        """Extract the section title from page text.

        Only the first ``_SECTION_SCAN_CHARS`` characters are searched, and
        the comparison is case-insensitive.

        Args:
            text: Page text content.

        Returns:
            The first entry of ``_SECTION_KEYWORDS`` (in declaration order)
            found in the scanned prefix, or an empty string if none matches.
        """
        head = text[: self._SECTION_SCAN_CHARS].lower()

        for keyword in self._SECTION_KEYWORDS:
            if keyword.lower() in head:
                return keyword

        return ""

    def get_full_text(self) -> str:
        """Get the full text from all pages.

        Returns:
            The content of every loaded page, separated by blank lines.
        """
        return "\n\n".join(doc.content for doc in self.load())


def load_pdf(file_path: str | Path) -> List[Document]:
    """Convenience function to load a PDF file.

    Args:
        file_path: Path to the PDF file.

    Returns:
        List of ``Document`` objects, one per non-empty page.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If ``file_path`` does not have a ``.pdf`` suffix.
    """
    return PDFLoader(file_path).load()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/adityaldy/mcp-training'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

pdf_loader.py•4.08 KiB