MCP Spark Documentation Server

Overview Schema Related Servers Score Discussions

mcp-spark-documentation
src
mcp_spark_documentation

parser.py•3.89 KiB

"""Parser for Spark documentation markdown files."""

import logging
import re
from pathlib import Path

import frontmatter  # type: ignore[import-untyped]

from mcp_spark_documentation.models import Document, DocumentMetadata

_logger = logging.getLogger(__name__)


class DocumentParser:
    """Parses markdown files with YAML frontmatter."""

    SPARK_DOCS_BASE_URL = "https://spark.apache.org/docs/latest"

    # Source-file suffixes that are dropped before ".html" is appended.
    _MARKDOWN_SUFFIXES = (".md", ".markdown")

    # Cleanup patterns used by ``_clean_content``, compiled once per class.
    _LIQUID_TAG_RE = re.compile(r"\{%.*?%\}")
    _LIQUID_OUTPUT_RE = re.compile(r"\{\{.*?\}\}")
    # DOTALL lets a single comment span several lines.
    _HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
    _HTML_TAG_RE = re.compile(r"<[^>]+>")

    def parse_file(self, file_path: Path, base_path: Path) -> Document | None:
        """Parse a markdown file and extract metadata and content.

        Parsing is best-effort: any failure (unreadable file, malformed
        frontmatter, ``file_path`` not located under ``base_path``, ...) is
        logged and reported to the caller as ``None`` instead of raising.

        Args:
            file_path: Path to the markdown file.
            base_path: Base path of the documentation directory.

        Returns:
            Document instance or None if parsing fails.
        """
        try:
            post = frontmatter.load(file_path)
            metadata = self._extract_metadata(post.metadata, file_path)
            # Raises ValueError when file_path is outside base_path.
            relative_path = file_path.relative_to(base_path)

            return Document(
                path=str(relative_path),
                title=metadata.title,
                description=metadata.description,
                section=self._extract_section(relative_path),
                content=self._clean_content(post.content),
                url=self._compute_url(relative_path),
            )
        except Exception:
            # Keep the "None on failure" contract, but leave a trace so a
            # skipped document can be diagnosed.
            _logger.warning("Failed to parse %s", file_path, exc_info=True)
            return None

    def _extract_metadata(self, metadata: dict[str, object], file_path: Path) -> DocumentMetadata:
        """Extract structured metadata from frontmatter.

        Only string values are accepted; a missing or non-string
        ``description`` or ``license`` becomes ``None``, and a missing or
        non-string ``title`` falls back to a title derived from the filename.

        Args:
            metadata: Dictionary of frontmatter fields.
            file_path: Path to the file for fallback title extraction.

        Returns:
            DocumentMetadata instance.
        """
        title = metadata.get("title")
        if not isinstance(title, str):
            # Fallback to filename if no title in frontmatter,
            # e.g. "sql-getting_started" -> "Sql Getting Started".
            title = file_path.stem.replace("-", " ").replace("_", " ").title()

        description = metadata.get("description")
        if not isinstance(description, str):
            description = None

        license_info = metadata.get("license")
        if not isinstance(license_info, str):
            license_info = None

        return DocumentMetadata(
            title=title,
            description=description,
            license=license_info,
        )

    def _extract_section(self, relative_path: Path) -> str:
        """Extract the top-level section from the path.

        Args:
            relative_path: Path relative to docs directory.

        Returns:
            Section name (first directory component or 'root').
        """
        parts = relative_path.parts
        # A single component means the file sits directly in the docs root.
        return parts[0] if len(parts) > 1 else "root"

    def _compute_url(self, relative_path: Path) -> str:
        """Compute the spark.apache.org documentation URL.

        Args:
            relative_path: Path relative to docs directory.

        Returns:
            Full URL to the documentation page.
        """
        # Strip only a trailing markdown suffix; a ".md" that appears inside
        # a directory or file name must be left untouched.
        if relative_path.suffix in self._MARKDOWN_SUFFIXES:
            relative_path = relative_path.with_suffix("")

        # as_posix() yields forward slashes on every platform, as URLs need.
        path_str = relative_path.as_posix()

        # Convert to HTML extension
        if not path_str.endswith(".html"):
            path_str += ".html"

        return f"{self.SPARK_DOCS_BASE_URL}/{path_str}"

    def _clean_content(self, content: str) -> str:
        """Clean markdown content for indexing.

        Removes Jekyll-specific syntax and other markup artifacts: Liquid
        ``{% ... %}`` tags, Liquid ``{{ ... }}`` output expressions, HTML
        comments and HTML tags. Surrounding whitespace is stripped.

        Args:
            content: Raw markdown content.

        Returns:
            Cleaned content suitable for indexing.
        """
        # Remove Jekyll liquid tags
        content = self._LIQUID_TAG_RE.sub("", content)
        content = self._LIQUID_OUTPUT_RE.sub("", content)

        # Remove HTML comments. This must run before tag removal: the tag
        # pattern stops at the first ">" and would leave part of a comment
        # behind whenever the comment text itself contains ">".
        content = self._HTML_COMMENT_RE.sub("", content)

        # Remove HTML tags
        content = self._HTML_TAG_RE.sub("", content)

        return content.strip()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/martoc/mcp-spark-documentation'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

parser.py•3.89 KiB