find_related
Find notes thematically related to a source note by comparing averaged chunk embeddings via pgvector, useful for sparsely linked notes.
Instructions
Semantically similar notes based on the source note's chunk embeddings, averaged then queried via pgvector.
Independent of the link graph — useful when the source is sparsely linked
or when looking for thematic neighbors. For link-based exploration use
get_neighborhood. For arbitrary topic queries use semantic_search.
Args: path: Vault-relative path to the source note. limit: Maximum results (default 10, hard cap 50).
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| path | Yes | Vault-relative path to the source note. | |
| limit | No | Maximum results (hard cap 50). | 10 |
Output Schema
| Name | Required | Description | Default |
|---|---|---|---|
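| result | Yes | Formatted list of related notes, or a short status message when the note is not found, not yet embedded, or has no related notes. | |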
Implementation Reference
- src/mcp_server/server.py:289-302 (registration) MCP tool registration for 'find_related' — exposes the tool to the MCP server as an @mcp.tool() named 'find_related' and delegates to the implementation.
# Registration shim: publishes `find_related` on the MCP server and hands the
# call straight to `find_related_impl`, which owns all of the logic.
# NOTE(review): the docstring below appears to double as the tool description
# shown to MCP clients — confirm before rewording it.
@mcp.tool()
async def find_related(path: str, limit: int = 10) -> str:
    """Semantically similar notes based on the source note's chunk embeddings, averaged then queried via pgvector.

    Independent of the link graph — useful when the source is sparsely linked
    or when looking for thematic neighbors. For link-based exploration use
    `get_neighborhood`. For arbitrary topic queries use `semantic_search`.

    Args:
        path: Vault-relative path to the source note.
        limit: Maximum results (default 10, hard cap 50).
    """
    # `limit` is forwarded by keyword; the implementation applies the cap.
    related_report = await find_related_impl(path, limit=limit)
    return related_report


# - src/mcp_server/tools.py:516-597 (handler) Core implementation of
#   'find_related' — retrieves the source note's chunk embeddings, averages
#   them, then queries pgvector using cosine_distance to find semantically
#   similar notes (excluding the source), deduplicating by note ID and
#   returning the top matches.
@_tracked("find_related", ["path", "limit"])
async def find_related_impl(path: str, limit: int = 10) -> str:
    """Report the notes whose chunks are closest to the source note's mean embedding.

    The source note's chunk embeddings are averaged into a single query
    vector. Chunks of other notes are fetched in ascending cosine distance to
    that vector, collapsed to one entry per note (its best-scoring chunk),
    and the highest-scoring notes are rendered as a text list.

    Args:
        path: Vault-relative path of the source note.
        limit: Number of notes wanted; clamped to the range 1..50.

    Returns:
        The formatted list of related notes, or a one-line message when the
        note does not exist, has no embeddings yet, or has no candidates.
    """
    import numpy as np
    from sqlalchemy import select

    from src.models.db import NoteEmbedding, NoteMetadata

    top_n = max(1, min(limit, 50))
    user_id = current_user_id.get()

    async with async_session() as session:
        # Look up the source note, restricted to the current user if one is set.
        source_query = select(NoteMetadata).where(NoteMetadata.file_path == path)
        if user_id is not None:
            source_query = source_query.where(NoteMetadata.user_id == user_id)
        source_result = await session.execute(source_query)
        source = source_result.scalar_one_or_none()
        if source is None:
            return f"Note not found: {path}"

        vector_query = select(NoteEmbedding.embedding).where(
            NoteEmbedding.note_id == source.id
        )
        vector_result = await session.execute(vector_query)
        source_vectors = vector_result.scalars().all()
        if not source_vectors:
            return (
                f"`{path}` has not been embedded yet — "
                "the indexer is still catching up. Try again in a few minutes."
            )

        # One query vector for the whole note: the element-wise mean of its chunks.
        centroid = np.mean(
            [np.asarray(vec, dtype=float) for vec in source_vectors], axis=0
        )
        centroid_as_list = centroid.tolist()

        # Same HNSW tuning as semantic_search — see embeddings.py for context.
        await session.execute(text("SET LOCAL hnsw.ef_search = 80"))
        await session.execute(text("SET LOCAL random_page_cost = 1.1"))

        # Several chunks may belong to the same note, so over-fetch (5x) to
        # leave enough distinct notes after de-duplication.
        distance_to_centroid = NoteEmbedding.embedding.cosine_distance(centroid_as_list)
        candidate_query = (
            select(
                NoteEmbedding.note_id,
                NoteEmbedding.chunk_text,
                NoteEmbedding.embedding,
                NoteMetadata.file_path,
                NoteMetadata.title,
                NoteMetadata.tags,
            )
            .join(NoteMetadata, NoteEmbedding.note_id == NoteMetadata.id)
            .where(NoteEmbedding.note_id != source.id)
            .order_by(distance_to_centroid)
            .limit(top_n * 5)
        )
        if user_id is not None:
            candidate_query = candidate_query.where(NoteMetadata.user_id == user_id)
        candidate_result = await session.execute(candidate_query)
        candidates = candidate_result.all()

    if not candidates:
        return f"No related notes for `{path}`"

    # Keep, per note, only the chunk with the highest cosine similarity.
    # A zero norm is replaced by 1.0 so the division below cannot blow up.
    centroid_norm = float(np.linalg.norm(centroid)) or 1.0
    best_by_note: dict[int, dict] = {}
    for row in candidates:
        vector = np.asarray(row.embedding, dtype=float)
        denominator = (np.linalg.norm(vector) or 1.0) * centroid_norm
        score = float(np.dot(vector, centroid) / denominator)

        incumbent = best_by_note.get(row.note_id)
        if incumbent is not None and not score > incumbent["similarity"]:
            continue
        best_by_note[row.note_id] = {
            "path": row.file_path,
            "title": row.title,
            "tags": row.tags,
            "similarity": score,
            "chunk": row.chunk_text,
        }

    ranked = list(best_by_note.values())
    ranked.sort(key=lambda entry: entry["similarity"], reverse=True)
    ranked = ranked[:top_n]

    report = [f"Top {len(ranked)} related notes for `{path}`:\n"]
    for entry in ranked:
        if entry["tags"]:
            tag_suffix = f" [{', '.join(entry['tags'])}]"
        else:
            tag_suffix = ""
        # Flatten newlines and cap the excerpt at 200 characters.
        excerpt = entry["chunk"].replace("\n", " ")[:200]
        report.extend(
            (
                f"- **{entry['title']}** (`{entry['path']}`){tag_suffix} — sim: {entry['similarity']:.3f}",
                f" > {excerpt}…",
            )
        )
    return "\n".join(report)


# - src/mcp_server/tools.py:73-90 (helper) The @_tracked decorator wraps tool
#   implementations with timing and usage logging. Applied to
#   find_related_impl with tool_name='find_related' and
#   param_keys=['path', 'limit'].
def _tracked(tool_name: str, param_keys: list[str]):
    """Build a decorator that times an async tool call and records it via ``_log_usage``.

    Args:
        tool_name: Name under which the call is logged.
        param_keys: Parameter names, in the wrapped function's positional
            order, whose values should be captured for the log entry.

    Returns:
        A decorator for coroutine functions. The wrapped coroutine returns
        the original result unchanged once the usage record has been awaited.
        If the wrapped call raises, the exception propagates and no record
        is written.
    """

    def decorator(fn):
        @wraps(fn)
        async def wrapper(*args, **kwargs):
            started = time.monotonic()
            outcome = await fn(*args, **kwargs)
            elapsed_ms = int((time.monotonic() - started) * 1000)

            # Positional arguments pair up with param_keys by index; any
            # remaining keys are picked up from kwargs when supplied.
            # Parameters left at their defaults are not recorded.
            captured = dict(zip(param_keys, args))
            captured.update(
                (key, kwargs[key])
                for key in param_keys[len(args):]
                if key in kwargs
            )

            await _log_usage(
                tool_name,
                _truncate_params(captured),
                elapsed_ms,
                len(str(outcome)),
            )
            return outcome

        return wrapper

    return decorator