import re
from pathlib import Path
from typing import List, Optional, Dict
from .models import ParsedDocument
from .repository_manager import get_repo_path
import logging
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
def get_pydantic_docs_path() -> Path:
    """Return the ``docs`` sub-directory of the path reported by ``get_repo_path()``."""
    repo_root = get_repo_path()
    return repo_root / "docs"
def extract_title_from_content(content: str, file_path: Path) -> str:
    """
    Extracts the title from Markdown content.

    Returns the text of the first H1 header (a line of the form ``# Title``,
    optionally indented) that is not inside a fenced code block. If no such
    header exists, the filename stem is used as a fallback, with ``-`` and
    ``_`` replaced by spaces and the result title-cased.

    Args:
        content: Full Markdown text of the document.
        file_path: Path of the document; only its stem is used, for the fallback.

    Returns:
        The extracted or derived title.
    """
    # Fence string (``` or ~~~ run) of the code block we are currently inside,
    # or "" when outside any fenced block.
    open_fence = ""
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if open_fence:
            # A closing fence is a line made only of the fence character and at
            # least as long as the opening fence. Everything else inside the
            # block (e.g. "# shell comment") must not be read as a heading.
            if line.startswith(open_fence) and set(line) == {open_fence[0]}:
                open_fence = ""
            continue
        fence_match = re.match(r"`{3,}|~{3,}", line)
        if fence_match:
            open_fence = fence_match.group(0)
            continue
        # Matching per line keeps the whitespace after '#' from spanning a
        # newline, so an empty "#" line never swallows the following line.
        heading_match = re.match(r"#\s+(.+)", line)
        if heading_match:
            return heading_match.group(1).strip()
    return file_path.stem.replace("-", " ").replace("_", " ").title()
def create_document_id(relative_path_str: str) -> str:
    """
    Derives a document ID from a relative path string.

    A trailing ``.md`` (any letter case) is dropped, ``/`` becomes ``-``, every
    character other than ASCII letters, digits and ``-`` is removed, the result
    is lower-cased, runs of ``-`` are collapsed and leading/trailing ``-`` are
    trimmed. An empty result is reported as ``"root"``.

    Example: 'usage/models.md' -> 'usage-models'
    """
    has_md_suffix = relative_path_str.lower().endswith(".md")
    without_suffix = relative_path_str[:-3] if has_md_suffix else relative_path_str

    dashed = without_suffix.replace("/", "-")
    slug = re.sub(r"[^a-zA-Z0-9-]", "", dashed).lower()
    slug = re.sub(r"-+", "-", slug).strip("-")
    return slug or "root"
def parse_markdown_file(
    file_path: Path, docs_base_dir: Path
) -> Optional[ParsedDocument]:
    """
    Builds a ParsedDocument from one Markdown file.

    The file is read as UTF-8; its path relative to ``docs_base_dir`` (in POSIX
    form) becomes the document path and is also the input for the document ID.

    Args:
        file_path: Path to the Markdown file to read.
        docs_base_dir: Documentation root that ``file_path`` is made relative to.

    Returns:
        The ParsedDocument, or None when ``file_path`` is not a file or any
        exception is raised while reading/building it (the error is logged).
    """
    try:
        if not file_path.is_file():
            logger.warning(f"Path is not a file, skipping: {file_path}")
            return None

        content = file_path.read_text(encoding="utf-8")

        # relative_to() raises if file_path is not located under docs_base_dir;
        # that case is handled by the except clause below.
        relative_path_str = file_path.relative_to(docs_base_dir).as_posix()

        return ParsedDocument(
            id=create_document_id(relative_path_str),
            path=relative_path_str,
            title=extract_title_from_content(content, file_path),
            content=content,
        )
    except Exception as e:
        logger.error(f"Error parsing Markdown file {file_path}: {e}", exc_info=True)
        return None
def find_all_markdown_files(start_dir: Path) -> List[Path]:
    """
    Recursively finds all Markdown (.md) files in the given directory.

    Args:
        start_dir: Directory to search.

    Returns:
        The matching regular files in sorted path order, or an empty list
        (with a logged warning) when ``start_dir`` is not a directory.
    """
    if not start_dir.is_dir():
        logger.warning(f"Cannot find Markdown files: {start_dir} is not a directory.")
        return []
    # rglob yields entries in a filesystem-dependent order; sorting makes the
    # result (and anything order-sensitive built from it) reproducible.
    # The is_file() check drops directories whose name happens to end in ".md".
    return sorted(
        candidate for candidate in start_dir.rglob("*.md") if candidate.is_file()
    )
def parse_all_documents() -> Dict[str, ParsedDocument]:
    """
    Parses every Markdown file under the Pydantic documentation directory.

    Files that fail to parse are counted and skipped. When two files produce
    the same document ID a warning is logged and the later one replaces the
    earlier one.

    Returns:
        A dictionary mapping document IDs to ParsedDocument objects; empty when
        the documentation directory is missing or is not a directory.
    """
    docs_root = get_pydantic_docs_path()
    parsed_docs: Dict[str, ParsedDocument] = {}

    if not (docs_root.exists() and docs_root.is_dir()):
        logger.error(
            f"Pydantic documentation directory not found or is not a directory: {docs_root}"
        )
        return parsed_docs

    logger.info(f"Starting parsing of Markdown files from: {docs_root}")

    successes = 0
    failures = 0
    for source_file in find_all_markdown_files(docs_root):
        parsed = parse_markdown_file(source_file, docs_root)
        if not parsed:
            failures += 1
            continue

        if parsed.id in parsed_docs:
            logger.warning(
                f"Duplicate document ID '{parsed.id}' generated for paths: "
                f"'{parsed_docs[parsed.id].path}' and '{parsed.path}'. Overwriting."
            )
        parsed_docs[parsed.id] = parsed
        # Counts every successful parse, including ones that overwrite a duplicate ID.
        successes += 1

    logger.info(
        f"Finished parsing. Successfully parsed: {successes} documents. Failed to parse: {failures} files."
    )
    return parsed_docs