search_similar
Find semantically similar documents by comparing embeddings. Input a reference document to retrieve related files with similarity scores.
Instructions
Find documents similar to a given document.
Uses the document's embedding to find semantically similar documents.
Args:
filepath: Path to the reference document
max_results: Number of similar documents to return (default: 5)
Returns:
JSON string with list of similar documents and similarity scores
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| filepath | Yes | Path to the reference document | |
| max_results | No | Number of similar documents to return | 5 |
Implementation Reference
- mcp_server/server.py:1104-1171 (handler) The `search_similar` method inside the `KnowledgeOrchestrator` class performs embedding-based similarity search to find similar documents.
def search_similar(self, filepath: str, max_results: int = 5) -> List[Dict[str, Any]]:
    """Find documents similar to a given document using embedding similarity.

    The reference document is located in ``self._indexed_docs`` by comparing
    resolved paths. The embedding of one of its stored chunks is then used
    to query ``self.collection`` for nearest neighbours. Chunks belonging to
    the reference document are skipped and hits are de-duplicated by
    ``source``, so each other document appears at most once, in the order
    the collection returned them.

    Args:
        filepath: Path to the reference document.
        max_results: Maximum number of similar documents to return.

    Returns:
        A list of dicts with the keys ``source``, ``filename``, ``category``,
        ``similarity`` (``1 - distance`` clamped at 0 and rounded to four
        places) and ``preview`` (first 200 characters of the matching
        chunk). The list is empty when the document is not indexed, has no
        stored embedding, or a collection call raises.
    """
    filepath_resolved = str(Path(filepath).resolve())

    doc_id = None
    for did, info in self._indexed_docs.items():
        if str(Path(info.get("source", "")).resolve()) == filepath_resolved:
            doc_id = did
            break
    if not doc_id:
        return []

    try:
        results = self.collection.get(
            where={"doc_id": doc_id},
            include=["embeddings"],
            limit=1,
        )
        embeddings = results.get("embeddings")
        # Use explicit None/len checks rather than truthiness: the store may
        # hand back an array-like (e.g. a NumPy array) whose bool() raises,
        # which would otherwise be swallowed below and look like "no match".
        if not results["ids"] or embeddings is None or len(embeddings) == 0:
            return []
        query_embedding = embeddings[0]
    except Exception:
        # Deliberately best-effort: any lookup failure means "no results".
        return []

    try:
        similar = self.collection.query(
            query_embeddings=[query_embedding],
            # Over-fetch so that enough hits remain after dropping the
            # reference document's own chunks and duplicate sources.
            n_results=max_results + 20,
            include=["documents", "metadatas", "distances"],
        )
    except Exception:
        return []

    if not similar["ids"] or not similar["ids"][0]:
        return []

    metadatas = similar["metadatas"][0]
    documents = similar["documents"][0]
    distances = similar["distances"][0] if similar["distances"] else None

    seen_sources = set()
    output = []
    for i, raw_meta in enumerate(metadatas):
        # A chunk stored without metadata comes back as None; treat as empty.
        meta = raw_meta or {}
        source = meta.get("source", "")
        if meta.get("doc_id") == doc_id:
            continue
        if source in seen_sources:
            continue
        seen_sources.add(source)

        distance = distances[i] if distances is not None else 0
        # Cast to a built-in float so the score stays JSON-serialisable even
        # if the store returns a NumPy scalar.
        # NOTE(review): 1 - distance assumes a distance in [0, 1] (e.g. cosine)
        # -- confirm the collection's distance metric.
        similarity = max(0.0, 1.0 - float(distance))

        output.append({
            "source": source,
            "filename": meta.get("filename", ""),
            "category": meta.get("category", ""),
            "similarity": round(similarity, 4),
            "preview": (documents[i] or "")[:200],
        })
        if len(output) >= max_results:
            break

    return output
# - mcp_server/server.py:1530-1561 (registration) The `search_similar` tool is registered here using the `@mcp.tool()` decorator, which interfaces with the `KnowledgeOrchestrator.search_similar` method.
@mcp.tool()
def search_similar(filepath: str, max_results: int = 5) -> str:
    """
    Find documents similar to a given document.

    Uses the document's embedding to find semantically similar documents.

    Args:
        filepath: Path to the reference document
        max_results: Number of similar documents to return (default: 5)

    Returns:
        JSON string with list of similar documents and similarity scores
    """
    # Guard clause: a reference path is mandatory.
    if not filepath:
        return json.dumps({"status": "error", "message": "Filepath required"})

    # A falsy max_results falls back to 5; the value is then clamped to 1..20.
    requested = max_results or 5
    max_results = max(1, min(requested, 20))

    matches = get_orchestrator().search_similar(filepath, max_results=max_results)

    if matches:
        payload = {
            "status": "success",
            "reference": filepath,
            "count": len(matches),
            "similar_documents": matches,
        }
        return json.dumps(payload, indent=2, ensure_ascii=False)

    # An empty result covers both "nothing similar" and "document not indexed".
    return json.dumps({"status": "no_results", "message": "No similar documents found or document not indexed"})