Mnemos

by Tanush1912

Overview Schema Related Servers Score Discussions

mnemos-mcp
src
ingestion

parser.py•6.69 kB

"""Document parser for multiple formats including PDF, Markdown, HTML, RST, DOCX, and text files.""" import os from dataclasses import dataclass from pathlib import Path from typing import Literal import fitz try: from bs4 import BeautifulSoup HAS_BS4 = True except ImportError: HAS_BS4 = False try: import docx2txt HAS_DOCX = True except ImportError: HAS_DOCX = False try: from docutils.core import publish_parts HAS_RST = True except ImportError: HAS_RST = False FileType = Literal["pdf", "md", "txt", "html", "rst", "docx", "url"] @dataclass class ParsedDocument: """Parsed document with content and metadata.""" content: str file_type: FileType file_name: str file_path: str file_size: int page_count: int | None = None pages: list[dict] | None = None url: str | None = None class DocumentParser: """Parser for various document formats.""" SUPPORTED_EXTENSIONS = { ".pdf", ".md", ".markdown", ".txt", ".text", ".html", ".htm", ".rst", ".docx", } def parse(self, file_path: str | Path) -> ParsedDocument: """ Parse a document file and extract its content. Args: file_path: Path to the document file Returns: ParsedDocument with extracted content and metadata Raises: ValueError: If file type is not supported FileNotFoundError: If file does not exist """ path = Path(file_path) if not path.exists(): raise FileNotFoundError(f"File not found: {file_path}") extension = path.suffix.lower() if extension not in self.SUPPORTED_EXTENSIONS: raise ValueError( f"Unsupported file type: {extension}. 
" f"Supported: {self.SUPPORTED_EXTENSIONS}" ) file_size = path.stat().st_size file_name = path.name if extension == ".pdf": return self._parse_pdf(path, file_name, file_size) elif extension in {".md", ".markdown"}: return self._parse_markdown(path, file_name, file_size) elif extension in {".html", ".htm"}: return self._parse_html(path, file_name, file_size) elif extension == ".rst": return self._parse_rst(path, file_name, file_size) elif extension == ".docx": return self._parse_docx(path, file_name, file_size) else: return self._parse_text(path, file_name, file_size) def _parse_pdf(self, path: Path, file_name: str, file_size: int) -> ParsedDocument: """Parse PDF file using PyMuPDF.""" pages = [] all_content = [] with fitz.open(path) as doc: for page_num, page in enumerate(doc, start=1): text = page.get_text("text") if text.strip(): pages.append({"page_num": page_num, "content": text}) all_content.append(text) return ParsedDocument( content="\n\n".join(all_content), file_type="pdf", file_name=file_name, file_path=str(path.absolute()), file_size=file_size, page_count=len(pages), pages=pages, ) def _parse_markdown( self, path: Path, file_name: str, file_size: int ) -> ParsedDocument: """Parse Markdown file.""" content = path.read_text(encoding="utf-8") return ParsedDocument( content=content, file_type="md", file_name=file_name, file_path=str(path.absolute()), file_size=file_size, ) def _parse_html(self, path: Path, file_name: str, file_size: int) -> ParsedDocument: """Parse HTML file, extracting readable text content.""" if not HAS_BS4: raise ImportError( "beautifulsoup4 is required for HTML parsing. 
Install with: pip install beautifulsoup4 lxml" ) html_content = path.read_text(encoding="utf-8") content = self._extract_text_from_html(html_content) return ParsedDocument( content=content, file_type="html", file_name=file_name, file_path=str(path.absolute()), file_size=file_size, ) def _extract_text_from_html(self, html: str) -> str: """Extract readable text from HTML content.""" soup = BeautifulSoup(html, "lxml") for element in soup(["script", "style", "nav", "footer", "header"]): element.decompose() text = soup.get_text(separator="\n") lines = (line.strip() for line in text.splitlines()) chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) text = "\n".join(chunk for chunk in chunks if chunk) return text def _parse_rst(self, path: Path, file_name: str, file_size: int) -> ParsedDocument: """Parse reStructuredText file.""" if not HAS_RST: raise ImportError( "docutils is required for RST parsing. Install with: pip install docutils" ) rst_content = path.read_text(encoding="utf-8") try: parts = publish_parts(rst_content, writer_name="html") html_body = parts["html_body"] content = self._extract_text_from_html(html_body) except Exception: content = rst_content return ParsedDocument( content=content, file_type="rst", file_name=file_name, file_path=str(path.absolute()), file_size=file_size, ) def _parse_docx(self, path: Path, file_name: str, file_size: int) -> ParsedDocument: """Parse Word document (.docx).""" if not HAS_DOCX: raise ImportError( "docx2txt is required for DOCX parsing. 
Install with: pip install docx2txt" ) content = docx2txt.process(str(path)) return ParsedDocument( content=content, file_type="docx", file_name=file_name, file_path=str(path.absolute()), file_size=file_size, ) def _parse_text(self, path: Path, file_name: str, file_size: int) -> ParsedDocument: """Parse plain text file.""" content = path.read_text(encoding="utf-8") return ParsedDocument( content=content, file_type="txt", file_name=file_name, file_path=str(path.absolute()), file_size=file_size, )

Loading blob content...

Latest Blog Posts

Don't Use Large Strings as Cache Keys
By punkpeye on January 11, 2026.
markdown
node-js
cache
What are Claude Skills?
By punkpeye on January 10, 2026.
mcp
skills
How to Test MCP Streamable HTTP Endpoints Using cURL
By punkpeye on January 2, 2026.
tutorial
bash

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Tanush1912/mnemos-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

parser.py•6.69 kB