"""Database service for vector storage and document management"""
import logging
import os
import re
import shutil
from typing import Any, Dict, List, Optional, Set
import yaml
from langchain_chroma import Chroma # type: ignore
from langchain_core.documents import Document # type: ignore
from langchain_core.embeddings import Embeddings # type: ignore
from langchain_huggingface import HuggingFaceEmbeddings # type: ignore
from langchain_ollama import OllamaEmbeddings # type: ignore
from langchain_text_splitters import RecursiveCharacterTextSplitter # type: ignore
from ..constants import CHUNK_OVERLAP, CHUNK_SIZE
from .metadata_tracker import FileMetadataTracker
logger = logging.getLogger(__name__)
def extract_obsidian_links(content: str) -> List[str]:
"""Extract Obsidian wikilinks [[Note]] or [[Note|Alias]] from content"""
links = re.findall(r"\[\[(.*?)\]\]", content)
# Clean links (remove alias like [[Note|Alias]] -> Note)
cleaned_links = [link.split("|")[0].strip() for link in links]
# Remove duplicates while preserving order
seen = set()
unique_links = []
for link in cleaned_links:
if link and link not in seen:
seen.add(link)
unique_links.append(link)
return unique_links
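# Illustrative example (not executed): for the text
#   "See [[Note A]] and [[Note A|alias]], then [[Note B]]"
# extract_obsidian_links returns ["Note A", "Note B"]: the alias is stripped
# and the duplicate dropped while the original order is preserved.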
def parse_frontmatter(content: str) -> Dict[str, Any]:
"""Extract YAML frontmatter from content using PyYAML"""
match = re.match(r"^---\s*\n(.*?)\n---\s*\n", content, re.DOTALL)
if not match:
return {}
try:
yaml_content = match.group(1)
metadata = yaml.safe_load(yaml_content)
if isinstance(metadata, dict):
            # Join list values and cast everything else to strings so values
            # stay scalar (Chroma metadata only accepts str/int/float/bool)
processed = {}
for k, v in metadata.items():
if isinstance(v, list):
processed[k] = ",".join(str(i) for i in v)
else:
processed[k] = str(v)
return processed
except Exception as e:
logger.warning("Error parsing frontmatter", extra={"error": str(e)})
return {}
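# Illustrative example (not executed): a note beginning with
#   ---
#   tags:
#     - python
#     - rag
#   status: draft
#   ---
# yields {"tags": "python,rag", "status": "draft"}; list values are joined
# with commas and everything else is cast to a string.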
def get_embeddings(
provider: str = "ollama",
model: str = "embeddinggemma",
ollama_base_url: str = "http://localhost:11434",
) -> Embeddings:
"""Get configured embeddings model based on provider setting."""
if provider == "ollama":
logger.info("Loading Ollama embeddings", extra={"model": model})
return OllamaEmbeddings(model=model, base_url=ollama_base_url)
# Fallback to HuggingFace
logger.info(
"Initializing HuggingFace embeddings: "
"sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)
return HuggingFaceEmbeddings(
model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)
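# Illustrative example (not executed): get_embeddings("ollama", "embeddinggemma")
# returns an OllamaEmbeddings client pointed at http://localhost:11434; any
# other provider value falls back to the multilingual HuggingFace model above.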
def get_text_splitter(
chunk_size: int = CHUNK_SIZE, chunk_overlap: int = CHUNK_OVERLAP
) -> RecursiveCharacterTextSplitter:
"""Get configured text splitter"""
return RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
length_function=len,
separators=["#", "##", "###", "####", "\n\n", "\n", " ", ""],
)
def extract_image_captions(content: str) -> List[str]:
"""
Extract captions from Obsidian image links.
    Supports both ![[image.png|caption]] wikilinks and standard Markdown
    ![caption](image.png) syntax.
"""
captions = []
# 1. Wikilinks: ![[image.ext|caption]]
matches_wiki = re.findall(r"!\[\[(.*?)\|(.*?)\]\]", content)
captions.extend([caption.strip() for _, caption in matches_wiki if caption.strip()])
    # 2. Standard Markdown: ![caption](path/to/image.png)
    # [^\]] / [^)] keep the match inside a single link so a preceding wikilink
    # on the same line is not swallowed; empty captions are dropped
    matches_md = re.findall(r"!\[([^\]]*)\]\([^)]*\)", content)
    captions.extend([caption.strip() for caption in matches_md if caption.strip()])
return captions
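# Illustrative example (not executed): for the text
#   "![[diagram.png|System overview]]\n![Login flow](img/login.png)"
# extract_image_captions returns ["System overview", "Login flow"]; image
# links without a caption contribute nothing.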
def load_documents_from_paths(filepaths: Set[str]) -> List[Document]:
"""Load documents from specific file paths with link and caption extraction"""
documents = []
for filepath in filepaths:
try:
with open(filepath, "r", encoding="utf-8") as f:
content = f.read()
# Extract Obsidian links
links = extract_obsidian_links(content)
# Extract image captions
captions = extract_image_captions(content)
# Extract frontmatter metadata
fm_metadata = parse_frontmatter(content)
metadata = {
"source": filepath,
"links": ",".join(links) if links else "",
}
if captions:
metadata["image_captions"] = ",".join(captions)
                # Append the captions to the note text, clearly labelled, so
                # the image context is embedded together with the surrounding prose
content += "\n\nImage Context:\n" + "\n".join(captions)
# Merge with frontmatter metadata
metadata.update(fm_metadata)
doc = Document(
page_content=content,
metadata=metadata,
)
documents.append(doc)
except Exception as e:
logger.warning(
"Could not load file", extra={"filepath": filepath, "error": str(e)}
)
return documents
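# Illustrative example (not executed, path is a placeholder):
#   load_documents_from_paths({"/vault/Projects/Roadmap.md"})
# returns one Document per readable file, with "source", "links", optional
# "image_captions" and any frontmatter fields in its metadata.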
def load_all_obsidian_documents(obsidian_path: str) -> List[Document]:
"""Load all documents from Obsidian vault using recursive walk"""
logger.info("Loading Obsidian documents (.md) recursively")
    # Filename patterns to skip (Excalidraw drawings, canvas files, untitled notes)
excluded_patterns = [
".excalidraw.md",
".canvas",
"untitled",
]
documents = []
for root, _, files in os.walk(obsidian_path):
for file in files:
if file.endswith(".md"):
filepath = os.path.join(root, file)
# Skip excluded patterns
if any(pattern in file.lower() for pattern in excluded_patterns):
continue
try:
with open(filepath, "r", encoding="utf-8") as f:
content = f.read()
if content.strip():
links = extract_obsidian_links(content)
captions = extract_image_captions(content)
fm_metadata = parse_frontmatter(content)
doc_metadata = {
"source": filepath,
"links": ",".join(links) if links else "",
}
if captions:
doc_metadata["image_captions"] = ",".join(captions)
content += "\n\nImage Context:\n" + "\n".join(captions)
doc_metadata.update(fm_metadata)
doc = Document(
page_content=content,
metadata=doc_metadata,
)
documents.append(doc)
except Exception as e:
logger.error(
"Error loading file",
extra={"filepath": filepath, "error": str(e)},
)
return documents
def load_or_create_db(
obsidian_path: str,
db_path: str,
**kwargs: Any,
) -> tuple[Optional[Chroma], Dict[str, Any]]:
"""Load or create vector database with incremental indexing support.
Returns:
Tuple of (db, stats_dict) where stats contains:
- docs_processed: total documents in the operation
- docs_new: new documents added
- docs_modified: documents updated
- docs_deleted: documents removed
- is_incremental: whether update was incremental
"""
stats: Dict[str, Any] = {
"docs_processed": 0,
"docs_new": 0,
"docs_modified": 0,
"docs_deleted": 0,
"is_incremental": False,
}
metadata_file = kwargs.get("metadata_file", "")
embeddings_provider = kwargs.get("embeddings_provider", "ollama")
embeddings_model = kwargs.get("embeddings_model", "embeddinggemma")
force_rebuild = kwargs.get("force_rebuild", False)
logger.info("Starting vector database load or creation")
embeddings = get_embeddings(provider=embeddings_provider, model=embeddings_model)
tracker = FileMetadataTracker(metadata_file)
# Check if we should do incremental update
if os.path.exists(db_path) and not force_rebuild:
if tracker.should_rebuild(obsidian_path):
force_rebuild = True
else:
new_files, modified_files, deleted_files = tracker.detect_changes(
obsidian_path
)
if not new_files and not modified_files and not deleted_files:
db = Chroma(persist_directory=db_path, embedding_function=embeddings)
return db, stats
# Do incremental update
stats["is_incremental"] = True
stats["docs_new"] = len(new_files)
stats["docs_modified"] = len(modified_files)
stats["docs_deleted"] = len(deleted_files)
stats["docs_processed"] = len(new_files) + len(modified_files)
db = Chroma(persist_directory=db_path, embedding_function=embeddings)
for f in deleted_files | modified_files:
db.delete(where={"source": f})
docs = load_documents_from_paths(new_files | modified_files)
if docs:
splitter = get_text_splitter()
texts = splitter.split_documents(docs)
db.add_documents(texts)
tracker.update_metadata(obsidian_path)
return db, stats
# Full rebuild
documents = load_all_obsidian_documents(obsidian_path)
if not documents:
return None, stats
stats["docs_processed"] = len(documents)
stats["is_incremental"] = False
splitter = get_text_splitter()
texts = splitter.split_documents(documents)
if os.path.exists(db_path):
shutil.rmtree(db_path)
db = Chroma.from_documents(
texts,
embeddings,
persist_directory=db_path,
collection_metadata={"hnsw:space": "cosine"},
)
tracker.update_metadata(obsidian_path)
return db, stats
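# Illustrative wiring (paths and the query are placeholders, not part of this
# repository):
#
#   db, stats = load_or_create_db(
#       obsidian_path="/path/to/vault",
#       db_path="/path/to/chroma_db",
#       metadata_file="/path/to/index_metadata.json",
#       embeddings_provider="ollama",
#       embeddings_model="embeddinggemma",
#   )
#   if db is not None:
#       hits = db.similarity_search("project roadmap", k=4)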