LangSmith MCP Server

Official

Overview Schema Related Servers Score Discussions

register_tools.py•63.4 KiB

"""Registration module for LangSmith MCP tools.""" import json from typing import Any, Dict, Optional from fastmcp import FastMCP from fastmcp.server import Context from langsmith_mcp_server.common.helpers import get_client_from_context from langsmith_mcp_server.services.tools.datasets import ( list_datasets_tool, list_examples_tool, read_dataset_tool, read_example_tool, ) from langsmith_mcp_server.services.tools.experiments import ( list_experiments_tool, ) from langsmith_mcp_server.services.tools.prompts import ( get_prompt_tool, list_prompts_tool, ) from langsmith_mcp_server.services.tools.traces import ( fetch_runs_tool, list_projects_tool, ) def register_tools(mcp: FastMCP) -> None: """ Register all LangSmith tool-related functionality with the MCP server. This function configures and registers various tools for interacting with LangSmith, including prompt management, conversation history, traces, and analytics. Args: mcp: The MCP server instance to register tools with """ @mcp.tool() def list_prompts( is_public: str = "false", limit: int = 20, ctx: Context = None ) -> Dict[str, Any]: """ Fetch prompts from LangSmith with optional filtering. Args: is_public (str): Filter by prompt visibility - "true" for public prompts, "false" for private prompts (default: "false") limit (int): Maximum number of prompts to return (default: 20) Returns: Dict[str, Any]: Dictionary containing the prompts and metadata """ try: client = get_client_from_context(ctx) is_public_bool = is_public.lower() == "true" return list_prompts_tool(client, is_public_bool, limit) except Exception as e: return {"error": str(e)} @mcp.tool() def get_prompt_by_name(prompt_name: str, ctx: Context = None) -> Dict[str, Any]: """ Get a specific prompt by its exact name. Args: prompt_name (str): The exact name of the prompt to retrieve ctx: FastMCP context (automatically provided) Returns: Dict[str, Any]: Dictionary containing the prompt details and template, or an error message if the prompt cannot be found """ try: client = get_client_from_context(ctx) return get_prompt_tool(client, prompt_name=prompt_name) except Exception as e: return {"error": str(e)} @mcp.tool() def push_prompt(ctx: Context = None) -> str: """ Call this tool when you need to understand how to create and push prompts to LangSmith. """ documentation = """ Documentation tool for understanding how to create and push prompts to LangSmith. This tool provides comprehensive documentation on creating ChatPromptTemplate and StructuredPrompt objects and pushing them to LangSmith using the LangSmith Client. --- 🧩 PURPOSE ---------- This is a **documentation-only tool** that explains how to: - Create prompts using LangChain's prompt templates - Push prompts to LangSmith for version control and management - Handle prompt creation vs. version updates --- 📦 REQUIRED DEPENDENCIES ------------------------ To use the functionality described in this documentation, you need: - `langsmith` - The LangSmith Python client - `langchain-core` - Core LangChain functionality for prompt templates - `langchain` (optional) - Required only if using `from langchain.messages` imports Install with: ```bash pip install langsmith langchain-core # Optional, for message classes: pip install langchain ``` --- 🔧 HOW TO PUSH PROMPTS ----------------------- Use the LangSmith Client's `push_prompt()` method: ```python from langsmith import Client client = Client() url = client.push_prompt( prompt_identifier="my-prompt-name", object=prompt, # Your prompt object description="Optional description", tags=["tag1", "tag2"], # Optional tags is_public=False, # Optional visibility (True/False) ) ``` **Behavior:** - If the prompt name **doesn't exist**: Creates a new prompt in LangSmith - If the prompt name **exists** and it's a **new version**: Creates a new commit/version - If the prompt name **exists** and it's the **same version**: No new commit is created --- 📝 CREATING CHATPROMPTTEMPLATE PROMPTS -------------------------------------- 1️⃣ **Basic ChatPromptTemplate** ```python from langchain_core.prompts import ChatPromptTemplate prompt = ChatPromptTemplate.from_messages([ ("system", "You are a helpful AI assistant. Your name is {assistant_name}."), ("human", "{user_input}"), ]) client.push_prompt("my-chat-prompt", object=prompt) ``` 2️⃣ **Using Message Classes** ```python from langchain_core.prompts import ChatPromptTemplate from langchain.messages import SystemMessage, HumanMessage, AIMessage prompt = ChatPromptTemplate.from_messages([ SystemMessage(content="You are a coding assistant."), HumanMessage(content="Write a Python function to {task}"), AIMessage(content="I'll help you write that function."), ("human", "Make it {style}"), ]) client.push_prompt("my-message-classes-prompt", object=prompt) ``` 3️⃣ **With MessagesPlaceholder for Conversation History** ```python from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder prompt = ChatPromptTemplate.from_messages([ ("system", "You are a helpful assistant."), MessagesPlaceholder(variable_name="conversation", optional=True), ("human", "{user_input}"), ]) client.push_prompt("my-conversation-prompt", object=prompt) ``` 4️⃣ **Complex Prompt with Multiple Placeholders** ```python from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder from langchain.messages import HumanMessage prompt = ChatPromptTemplate.from_messages([ ("system", "You are {assistant_name}, a {role} assistant."), MessagesPlaceholder(variable_name="chat_history", optional=True), ("human", "Current question: {question}"), ("ai", "Let me think about that..."), MessagesPlaceholder(variable_name="tool_results", optional=True), HumanMessage(content="Based on the above, what's your final answer?"), ]) client.push_prompt("my-complex-prompt", object=prompt) ``` --- 🎯 CREATING STRUCTUREDPROMPT PROMPTS ------------------------------------ StructuredPrompt allows you to define output schemas for structured outputs. 1️⃣ **With Dictionary Schema (with title and description)** ```python from langchain_core.prompts.structured import StructuredPrompt schema = { "title": "SentimentAnalysis", "description": "Analyzes the sentiment of text with confidence and reasoning", "type": "object", "properties": { "sentiment": { "type": "string", "enum": ["positive", "negative", "neutral"], "description": "The sentiment of the text" }, "confidence": { "type": "number", "description": "Confidence score between 0 and 1" }, "reasoning": { "type": "string", "description": "Brief reasoning for the sentiment" } }, "required": ["sentiment", "confidence", "reasoning"], "strict": True } prompt = StructuredPrompt( [ ("system", "You are a sentiment analysis expert."), ("human", "Analyze the sentiment of: {text}"), ], schema_=schema, ) client.push_prompt("my-structured-prompt", object=prompt) ``` 2️⃣ **With Pydantic Model (Convert to Dict Schema)** ```python from langchain_core.prompts.structured import StructuredPrompt from pydantic import BaseModel, Field class UserInfo(BaseModel): '''User information extracted from text.''' name: str = Field(description="The user's name") age: int = Field(description="The user's age") email: str = Field(description="The user's email address") # Convert Pydantic model to dict schema schema_dict = UserInfo.model_json_schema() # Add title and description at top level if not present if "title" not in schema_dict: schema_dict["title"] = UserInfo.__name__ if "description" not in schema_dict: schema_dict["description"] = UserInfo.__doc__ or f"Schema for {UserInfo.__name__}" prompt = StructuredPrompt( [ ("system", "You are a helpful assistant that extracts user information."), ("human", "Extract information from: {text}"), ], schema_=schema_dict, ) client.push_prompt("my-pydantic-prompt", object=prompt) ``` --- 🧠 HELPER FUNCTION PATTERN --------------------------- You can create a reusable helper function: ```python def push_prompt_to_langsmith( prompt, prompt_identifier: str, description: str = None, tags: list = None, is_public: bool = None, ) -> str: ''' Push a prompt to LangSmith with optional metadata. Args: prompt: The prompt object (ChatPromptTemplate, StructuredPrompt, etc.) prompt_identifier: The name/identifier for the prompt description: Optional description of the prompt tags: Optional list of tags is_public: Optional visibility setting (True/False) Returns: The URL of the pushed prompt ''' kwargs = {"object": prompt} if description: kwargs["description"] = description if tags: kwargs["tags"] = tags if is_public is not None: kwargs["is_public"] = is_public url = client.push_prompt(prompt_identifier, **kwargs) return url ``` --- 📤 RETURNS ---------- None This tool is documentation-only and returns None. The documentation is in the docstring. --- 🧠 NOTES FOR AGENTS -------------------- - This tool is **documentation-only** - it does not execute any code - Use this tool to understand how to create and push prompts programmatically - The `push_prompt()` method automatically handles versioning: - New prompt name → creates new prompt - Existing prompt name with changes → creates new version/commit - Existing prompt name with no changes → no new commit - Always ensure you have the required dependencies installed before using these patterns - Prompt identifiers should be unique and descriptive - Use tags and descriptions to organize and document your prompts --- 🔐 ENVIRONMENT VARIABLES ------------------------- Before using the LangSmith Client, make sure to set up your environment variables: **Required:** ```bash export LANGSMITH_API_KEY="lsv2_pt_..." ``` **Optional:** ```bash # Only needed if using a custom endpoint (defaults to cloud if not set) export LANGSMITH_ENDPOINT="https://api.smith.langchain.com" # Only needed if you want to specify a workspace export LANGSMITH_WORKSPACE_ID="35e66a3b-2973-4830-83e1-352c43a660ed" ``` You can also use a `.env` file with `python-dotenv`: ```python from dotenv import load_dotenv load_dotenv() # Loads variables from .env file from langsmith import Client client = Client() # Will automatically use environment variables ``` """ return documentation # Register conversation tools # @mcp.tool() # def get_thread_history(thread_id: str, project_name: str, ctx: Context = None) -> Dict[str, Any]: # """ # Retrieve the message history for a specific conversation thread. # Args: # thread_id (str): The unique ID of the thread to fetch history for # project_name (str): The name of the project containing the thread # (format: "owner/project" or just "project") # Returns: # Dict[str, Any]: Dictionary containing the thread history, # or an error message if the thread cannot be found # """ # try: # client = get_client_from_context(ctx) # return get_thread_history_tool(client, thread_id, project_name) # except Exception as e: # return {"error": str(e)} # Register analytics tools # @mcp.tool() # def get_project_runs_stats(project_name: str = None, trace_id: str = None, ctx: Context = None) -> Dict[str, Any]: # """ # Get statistics about runs in a LangSmith project. # Args: # project_name (str): The name of the project to analyze # (format: "owner/project" or just "project") # trace_id (str): The specific ID of the trace to fetch (preferred parameter) # Returns: # Dict[str, Any]: Dictionary containing the requested project run statistics # or an error message if statistics cannot be retrieved # """ # try: # client = get_client_from_context(ctx) # return get_project_runs_stats_tool(client, project_name, trace_id) # except Exception as e: # return {"error": str(e)} # # Register trace tools # @mcp.tool() # def fetch_trace(project_name: str = None, trace_id: str = None, ctx: Context = None) -> Dict[str, Any]: # """ # Fetch trace content for debugging and analyzing LangSmith runs. # Note: Only one parameter (project_name or trace_id) is required. # If both are provided, trace_id is preferred. # String "null" inputs are handled as None values. # Args: # project_name (str, optional): The name of the project to fetch the latest trace from # trace_id (str, optional): The specific ID of the trace to fetch (preferred parameter) # Returns: # Dict[str, Any]: Dictionary containing the trace data and metadata, # or an error message if the trace cannot be found # """ # try: # client = get_client_from_context(ctx) # return fetch_trace_tool(client, project_name, trace_id) # except Exception as e: # return {"error": str(e)} @mcp.tool() def fetch_runs( project_name: str, trace_id: str = None, run_type: str = None, error: str = None, is_root: str = None, filter: str = None, trace_filter: str = None, tree_filter: str = None, order_by: str = "-start_time", limit: int = 50, reference_example_id: str = None, format_type: str = "pretty", ctx: Context = None, ) -> Dict[str, Any]: """ Fetch LangSmith runs (traces, tools, chains, etc.) from one or more projects using flexible filters, query language expressions, and trace-level constraints. --- 🧩 PURPOSE ---------- This is a **general-purpose LangSmith run fetcher** designed for analytics, trace export, and automated exploration. It wraps `client.list_runs()` with complete support for: - Multiple project names or IDs - The **Filter Query Language (FQL)** for precise queries - Hierarchical filtering across trace trees - Sorting and result limiting It returns **raw `dict` objects** suitable for further analysis or export. --- ⚙️ PARAMETERS ------------- project_name : str The project name to fetch runs from. For multiple projects, use JSON array string (e.g., '["project1", "project2"]'). trace_id : str, optional Return only runs that belong to a specific trace tree. It is a UUID string, e.g. "123e4567-e89b-12d3-a456-426614174000". run_type : str, optional Filter runs by type (e.g. "llm", "chain", "tool", "retriever"). error : str, optional Filter by error status: "true" for errored runs, "false" for successful runs. is_root : str, optional Filter root traces: "true" for only top-level traces, "false" to exclude roots. If not provided, returns all runs. filter : str, optional A **Filter Query Language (FQL)** expression that filters runs by fields, metadata, tags, feedback, latency, or time. ─── Common field names ─── - `id`, `name`, `run_type` - `start_time`, `end_time` - `latency` - `total_tokens` - `error` - `tags` - `feedback_key`, `feedback_score` - `metadata_key`, `metadata_value` - `execution_order` ─── Supported comparators ─── - `eq`, `neq` → equal / not equal - `gt`, `gte`, `lt`, `lte` → numeric or time comparisons - `has` → tag or metadata contains value - `search` → substring or full-text match - `and`, `or`, `not` → logical operators ─── Examples ─── ```python 'gt(latency, "5s")' # took longer than 5 seconds 'neq(error, null)' # errored runs 'has(tags, "beta")' # runs tagged "beta" 'and(eq(name,"ChatOpenAI"), eq(run_type,"llm"))' # named & typed runs 'search("image classification")' # full-text search ``` trace_filter : str, optional Filter applied **to the root run** in each trace tree. Lets you select child runs based on root attributes or feedback. Example: ```python 'and(eq(feedback_key,"user_score"), eq(feedback_score,1))' ``` → return runs whose root trace has a user_score of 1. tree_filter : str, optional Filter applied **to any run** in the trace tree (including siblings or children). Example: ```python 'eq(name,"ExpandQuery")' ``` → return runs if *any* run in their trace had that name. order_by : str, default "-start_time" Sort field; prefix with "-" for descending order. limit : int, default 50 Maximum number of runs to return. reference_example_id : str, optional Filter runs by reference example ID. Returns only runs associated with the specified dataset example ID. format_type : str, default "pretty" Output format for extracted messages. Options: - `"pretty"` (default): Human-readable formatted text focusing on human/AI/tool message exchanges - `"json"`: Pretty-printed JSON format - `"raw"`: Compact single-line JSON format When format_type is set, the tool extracts messages from runs and formats them, making it ideal for conversational AI agents that care about message exchanges rather than full trace details. The response returns only the formatted output: - `formatted`: Formatted string representation of messages (when format_type is provided) When format_type is not set, the response returns: - `runs`: Full run data --- 📤 RETURNS ---------- Dict[str, Any] Dictionary containing: - If format_type is set: `{"formatted": str}` - formatted string representation of messages - If format_type is not set: `{"runs": List[Dict]}` - list of LangSmith run dictionaries --- 🧪 EXAMPLES ------------ 1️⃣ **Get latest 10 root runs** ```python runs = fetch_runs("alpha-project", is_root="true", limit=10) ``` 2️⃣ **Get all tool runs that errored** ```python runs = fetch_runs("alpha-project", run_type="tool", error="true") ``` 3️⃣ **Get all runs that took >5s and have tag "experimental"** ```python runs = fetch_runs("alpha-project", filter='and(gt(latency,"5s"), has(tags,"experimental"))') ``` 4️⃣ **Get all runs in a specific conversation thread** ```python thread_id = "abc-123" fql = f'and(in(metadata_key, ["session_id","conversation_id","thread_id"]), eq(metadata_value, "{thread_id}"))' runs = fetch_runs("alpha-project", is_root="true", filter=fql) ``` 5️⃣ **List all runs called "extractor" whose root trace has feedback user_score=1** ```python runs = fetch_runs( "alpha-project", filter='eq(name,"extractor")', trace_filter='and(eq(feedback_key,"user_score"), eq(feedback_score,1))' ) ``` 6️⃣ **List all runs that started after a timestamp and either errored or got low feedback** ```python fql = 'and(gt(start_time,"2023-07-15T12:34:56Z"), or(neq(error,null), and(eq(feedback_key,"Correctness"), eq(feedback_score,0.0))))' runs = fetch_runs("alpha-project", filter=fql) ``` 7️⃣ **Get formatted messages for conversational AI (default: pretty format)** ```python # Returns formatted messages focusing on human/AI/tool exchanges result = fetch_runs("alpha-project", limit=10, format_type="pretty") # result["formatted"] contains human-readable formatted messages # result["messages"] contains the raw message list # result["runs"] contains full run data ``` 8️⃣ **Get messages in JSON format** ```python result = fetch_runs("alpha-project", limit=10, format_type="json") # result["messages"] contains messages as JSON array # result["formatted"] contains pretty-printed JSON string ``` --- 🧠 NOTES FOR AGENTS -------------------- - Use this to **query LangSmith data sources dynamically**. - Compose FQL strings programmatically based on your intent. - Combine `filter`, `trace_filter`, and `tree_filter` for hierarchical logic. - Always verify that `project_name` matches an existing LangSmith project. - Returned `dict` objects have fields like: - `id`, `name`, `run_type`, `inputs`, `outputs`, `error`, `start_time`, `end_time`, `latency`, `metadata`, `feedback`, etc. - If the trace is big, save it to a file (if you have this ability) and analyze it locally. - **For conversational AI agents**: Use `format_type="pretty"` (default) to get human-readable message exchanges focusing on human/AI/tool messages rather than full trace details. """ # noqa: W293 try: client = get_client_from_context(ctx) # Parse project_name - can be a single string or JSON array parsed_project_name = project_name if project_name and project_name.startswith("["): try: parsed_project_name = json.loads(project_name) except json.JSONDecodeError: pass # Use as-is if not valid JSON # Parse boolean strings parsed_error = None if error is not None: parsed_error = ( error.lower() == "true" if error.lower() in ("true", "false") else None ) parsed_is_root = None if is_root is not None: if is_root.lower() == "true": parsed_is_root = True elif is_root.lower() == "false": parsed_is_root = False # Validate format_type valid_formats = ["raw", "json", "pretty"] if format_type not in valid_formats: return { "error": f"Invalid format_type: {format_type}. Must be one of {valid_formats}" } return fetch_runs_tool( client, project_name=parsed_project_name, trace_id=trace_id, run_type=run_type, error=parsed_error, is_root=parsed_is_root, filter=filter, trace_filter=trace_filter, tree_filter=tree_filter, order_by=order_by, limit=limit, reference_example_id=reference_example_id, format_type=format_type, ctx=ctx, ) except Exception as e: return {"error": str(e)} # Register project tools @mcp.tool() def list_projects( limit: int = 5, project_name: str = None, more_info: str = "false", reference_dataset_id: str = None, reference_dataset_name: str = None, ctx: Context = None, ) -> Dict[str, Any]: """ List LangSmith projects with optional filtering and detail level control. Fetches projects from LangSmith, optionally filtering by name and controlling the level of detail returned. Can return either simplified project information or full project details. In case a dataset id or name is provided, you don't need to provide a project name. --- 🧩 PURPOSE ---------- This function provides a convenient way to list and explore LangSmith projects. It supports: - Filtering projects by name (partial match) - Limiting the number of results - Choosing between simplified or full project information - Automatically extracting deployment IDs from nested project data --- ⚙️ PARAMETERS ------------- limit : int, default 5 Maximum number of projects to return (as string, e.g., "5"). This can be adjusted by agents or users based on their needs. project_name : str, optional Filter projects by name using partial matching. If provided, only projects whose names contain this string will be returned. Example: `project_name="Chat"` will match "Chat-LangChain", "ChatBot", etc. more_info : str, default "false" Controls the level of detail returned: - `"false"` (default): Returns simplified project information with only essential fields: `name`, `project_id`, and `agent_deployment_id` (if available) - `"true"`: Returns full project details as returned by the LangSmith API reference_dataset_id : str, optional The ID of the reference dataset to filter projects by. Either this OR `reference_dataset_name` must be provided (but not both). reference_dataset_name : str, optional The name of the reference dataset to filter projects by. Either this OR `reference_dataset_id` must be provided (but not both). --- 📤 RETURNS ---------- List[dict] A list of project dictionaries. The structure depends on `more_info`: **When `more_info=False` (simplified):** ```python [ { "name": "Chat-LangChain", "project_id": "787d5165-f110-43ff-a3fb-66ea1a70c971", "agent_deployment_id": "deployment-123" # Only if available }, ... ] ``` **When `more_info=True` (full details):** Returns complete project objects with all fields from the LangSmith API, including metadata, settings, statistics, and nested structures. --- 🧪 EXAMPLES ------------ 1️⃣ **List first 5 projects (simplified)** ```python projects = list_projects(limit="5") ``` 2️⃣ **Search for projects with "Chat" in the name** ```python projects = list_projects(project_name="Chat", limit="10") ``` 3️⃣ **Get full project details** ```python projects = list_projects(limit="3", more_info="true") ``` 4️⃣ **Find a specific project with full details** ```python projects = list_projects(project_name="MyProject", more_info="true", limit="1") ``` --- 🧠 NOTES FOR AGENTS -------------------- - Use `more_info="false"` for quick project discovery and listing - Use `more_info="true"` when you need detailed project information - The `agent_deployment_id` field is automatically extracted from nested project data when available, making it easy to identify agent deployments - Projects are filtered to exclude reference projects by default - The function uses `name_contains` for filtering, so partial matches work """ # noqa: W293 try: client = get_client_from_context(ctx) parsed_more_info = more_info.lower() == "true" if reference_dataset_id is not None and reference_dataset_name is not None: parsed_more_info = True return list_projects_tool( client, limit=limit, project_name=project_name, more_info=parsed_more_info, reference_dataset_id=reference_dataset_id, reference_dataset_name=reference_dataset_name, ) except Exception as e: return {"error": str(e)} @mcp.tool() def list_experiments( reference_dataset_id: Optional[str] = None, reference_dataset_name: Optional[str] = None, limit: int = 5, project_name: Optional[str] = None, ctx: Context = None, ) -> Dict[str, Any]: """ List LangSmith experiment projects (reference projects) with mandatory dataset filtering. Fetches experiment projects from LangSmith that are associated with a specific dataset. These are projects used for model evaluation and comparison. Requires either a dataset ID or dataset name to filter experiments. --- 🧩 PURPOSE ---------- This function provides a convenient way to list and explore LangSmith experiment projects. It supports: - Filtering experiments by reference dataset (mandatory) - Filtering projects by name (partial match) - Limiting the number of results - Automatically extracting deployment IDs from nested project data - Returns simplified project information with key metrics (latency, cost, feedback stats) --- ⚙️ PARAMETERS ------------- reference_dataset_id : str, optional The ID of the reference dataset to filter experiments by. Either this OR `reference_dataset_name` must be provided (but not both). reference_dataset_name : str, optional The name of the reference dataset to filter experiments by. Either this OR `reference_dataset_id` must be provided (but not both). limit : int, default 5 Maximum number of experiments to return. This can be adjusted by agents or users based on their needs. project_name : str, optional Filter projects by name using partial matching. If provided, only projects whose names contain this string will be returned. Example: `project_name="Chat"` will match "Chat-LangChain", "ChatBot", etc. --- 📤 RETURNS ---------- Dict[str, Any] A dictionary containing an "experiments" key with a list of simplified experiment project dictionaries: ```python { "experiments": [ { "name": "Experiment-Chat-LangChain", "experiment_id": "787d5165-f110-43ff-a3fb-66ea1a70c971", "feedback_stats": {...}, # Feedback statistics if available "latency_p50_seconds": 1.626, # 50th percentile latency in seconds "latency_p99_seconds": 2.390, # 99th percentile latency in seconds "total_cost": 0.00013005, # Total cost in dollars "prompt_cost": 0.00002085, # Prompt cost in dollars "completion_cost": 0.0001092, # Completion cost in dollars "agent_deployment_id": "deployment-123" # Only if available }, ... ] } ``` --- 🧪 EXAMPLES ------------ 1️⃣ **List experiments for a dataset by ID** ```python experiments = list_experiments(reference_dataset_id="f5ca13c6-96ad-48ba-a432-ebb6bf94528f") ``` 2️⃣ **List experiments for a dataset by name** ```python experiments = list_experiments(reference_dataset_name="my-dataset", limit=10) ``` 3️⃣ **Find experiments with specific name pattern** ```python experiments = list_experiments( reference_dataset_id="f5ca13c6-96ad-48ba-a432-ebb6bf94528f", project_name="Chat", limit=1 ) ``` --- 🧠 NOTES FOR AGENTS -------------------- - Returns simplified experiment information with key metrics (latency, cost, feedback stats) - The `agent_deployment_id` field is automatically extracted from nested project data when available, making it easy to identify agent deployments - Experiments are filtered to include only reference projects (associated with datasets) - The function uses `name_contains` for filtering, so partial matches work - You must provide either `reference_dataset_id` OR `reference_dataset_name`, but not both - Experiment projects are used for model evaluation and comparison across different runs """ # noqa: W293 try: client = get_client_from_context(ctx) return list_experiments_tool( client, reference_dataset_id=reference_dataset_id, reference_dataset_name=reference_dataset_name, limit=limit, project_name=project_name, ) except Exception as e: return {"error": str(e)} # Register dataset tools @mcp.tool() def list_datasets( dataset_ids: Optional[str] = None, data_type: Optional[str] = None, dataset_name: Optional[str] = None, dataset_name_contains: Optional[str] = None, metadata: Optional[str] = None, limit: int = 20, ctx: Context = None, ) -> Dict[str, Any]: """ Fetch LangSmith datasets. Note: If no arguments are provided, all datasets will be returned. Args: dataset_ids (Optional[str]): Dataset IDs to filter by as JSON array string (e.g., '["id1", "id2"]') or single ID data_type (Optional[str]): Filter by dataset data type (e.g., 'chat', 'kv') dataset_name (Optional[str]): Filter by exact dataset name dataset_name_contains (Optional[str]): Filter by substring in dataset name metadata (Optional[str]): Filter by metadata as JSON object string (e.g., '{"key": "value"}') limit (int): Max number of datasets to return (default: 20) ctx: FastMCP context (automatically provided) Returns: Dict[str, Any]: Dictionary containing the datasets and metadata, or an error message if the datasets cannot be retrieved """ # noqa: W293 try: client = get_client_from_context(ctx) # Parse list strings (JSON arrays) parsed_dataset_ids = None if dataset_ids is not None: try: parsed_dataset_ids = ( json.loads(dataset_ids) if dataset_ids.startswith("[") else [dataset_ids] ) except (json.JSONDecodeError, AttributeError): parsed_dataset_ids = [dataset_ids] if dataset_ids else None # Parse metadata (JSON object) parsed_metadata = None if metadata is not None: try: parsed_metadata = json.loads(metadata) if metadata.startswith("{") else None except (json.JSONDecodeError, AttributeError): parsed_metadata = None return list_datasets_tool( client, dataset_ids=parsed_dataset_ids, data_type=data_type, dataset_name=dataset_name, dataset_name_contains=dataset_name_contains, metadata=parsed_metadata, limit=limit, ) except Exception as e: return {"error": str(e)} @mcp.tool() def list_examples( dataset_id: Optional[str] = None, dataset_name: Optional[str] = None, example_ids: Optional[str] = None, filter: Optional[str] = None, metadata: Optional[str] = None, splits: Optional[str] = None, inline_s3_urls: Optional[str] = None, include_attachments: Optional[str] = None, as_of: Optional[str] = None, limit: int = 10, offset: Optional[str] = None, ctx: Context = None, ) -> Dict[str, Any]: """ Fetch examples from a LangSmith dataset with advanced filtering options. Note: Either dataset_id, dataset_name, or example_ids must be provided. If multiple are provided, they are used in order of precedence: example_ids, dataset_id, dataset_name. Args: dataset_id (Optional[str]): Dataset ID to retrieve examples from dataset_name (Optional[str]): Dataset name to retrieve examples from example_ids (Optional[str]): Specific example IDs as JSON array string (e.g., '["id1", "id2"]') or single ID limit (int): Maximum number of examples to return (default: 10) offset (int): Number of examples to skip (default: 0) filter (Optional[str]): Filter string using LangSmith query syntax (e.g., 'has(metadata, {"key": "value"})') metadata (Optional[str]): Metadata to filter by as JSON object string (e.g., '{"key": "value"}') splits (Optional[str]): Dataset splits as JSON array string (e.g., '["train", "test"]') or single split inline_s3_urls (Optional[str]): Whether to inline S3 URLs: "true" or "false" (default: SDK default if not specified) include_attachments (Optional[str]): Whether to include attachments: "true" or "false" (default: SDK default if not specified) as_of (Optional[str]): Dataset version tag OR ISO timestamp to retrieve examples as of that version/time ctx: FastMCP context (automatically provided) Returns: Dict[str, Any]: Dictionary containing the examples and metadata, or an error message if the examples cannot be retrieved """ # noqa: W293 try: client = get_client_from_context(ctx) # Parse list strings (JSON arrays) parsed_example_ids = None if example_ids is not None: try: parsed_example_ids = ( json.loads(example_ids) if example_ids.startswith("[") else [example_ids] ) except (json.JSONDecodeError, AttributeError): parsed_example_ids = [example_ids] if example_ids else None parsed_splits = None if splits is not None: try: parsed_splits = json.loads(splits) if splits.startswith("[") else [splits] except (json.JSONDecodeError, AttributeError): parsed_splits = [splits] if splits else None # Parse metadata (JSON object) parsed_metadata = None if metadata is not None: try: parsed_metadata = json.loads(metadata) if metadata.startswith("{") else None except (json.JSONDecodeError, AttributeError): parsed_metadata = None # Parse boolean strings parsed_inline_s3_urls = None if inline_s3_urls is not None: parsed_inline_s3_urls = inline_s3_urls.lower() == "true" parsed_include_attachments = None if include_attachments is not None: parsed_include_attachments = include_attachments.lower() == "true" # Parse integer strings parsed_limit = int(limit) if limit else None parsed_offset = int(offset) if offset else None return list_examples_tool( client, dataset_id=dataset_id, dataset_name=dataset_name, example_ids=parsed_example_ids, filter=filter, metadata=parsed_metadata, splits=parsed_splits, inline_s3_urls=parsed_inline_s3_urls, include_attachments=parsed_include_attachments, as_of=as_of, limit=parsed_limit, offset=parsed_offset, ) except Exception as e: return {"error": str(e)} @mcp.tool() def read_dataset( dataset_id: Optional[str] = None, dataset_name: Optional[str] = None, ctx: Context = None, ) -> Dict[str, Any]: """ Read a specific dataset from LangSmith. Note: Either dataset_id or dataset_name must be provided to identify the dataset. If both are provided, dataset_id takes precedence. Args: dataset_id (Optional[str]): Dataset ID to retrieve dataset_name (Optional[str]): Dataset name to retrieve ctx: FastMCP context (automatically provided) Returns: Dict[str, Any]: Dictionary containing the dataset details, or an error message if the dataset cannot be retrieved Example in case you need to create a separate python script to read a dataset: ```python from langsmith import Client client = Client() dataset = client.read_dataset(dataset_name="My Dataset") # Or by ID: # dataset = client.read_dataset(dataset_id="dataset-id-here") ``` """ try: client = get_client_from_context(ctx) return read_dataset_tool( client, dataset_id=dataset_id, dataset_name=dataset_name, ) except Exception as e: return {"error": str(e)} @mcp.tool() def read_example( example_id: str, as_of: Optional[str] = None, ctx: Context = None, ) -> Dict[str, Any]: """ Read a specific example from LangSmith. Args: example_id (str): Example ID to retrieve as_of (Optional[str]): Dataset version tag OR ISO timestamp to retrieve the example as of that version/time ctx: FastMCP context (automatically provided) Returns: Dict[str, Any]: Dictionary containing the example details, or an error message if the example cannot be retrieved Example in case you need to create a separate python script to read an example: ```python from langsmith import Client client = Client() example = client.read_example(example_id="example-id-here") # Or with version: # example = client.read_example(example_id="example-id-here", as_of="v1.0") ``` """ try: client = get_client_from_context(ctx) return read_example_tool( client, example_id=example_id, as_of=as_of, ) except Exception as e: return {"error": str(e)} @mcp.tool() def create_dataset(ctx: Context = None) -> str: """ Call this tool when you need to understand how to create datasets in LangSmith. """ documentation = """ Documentation tool for understanding how to create datasets in LangSmith. This tool provides comprehensive documentation on creating datasets programmatically using the LangSmith Python SDK, including creating datasets from lists, traces, CSV files, and pandas DataFrames. --- 🧩 PURPOSE ---------- This is a **documentation-only tool** that explains how to: - Create datasets from lists of examples - Create datasets from traces/runs - Create datasets from CSV files - Create datasets from pandas DataFrames - Add examples to datasets using bulk operations --- 📦 REQUIRED DEPENDENCIES ------------------------ To use the functionality described in this documentation, you need: - `langsmith` - The LangSmith Python client - `pandas` (optional) - Required only for DataFrame operations Install with: ```bash pip install langsmith # Optional, for DataFrame operations: pip install pandas ``` --- 🔧 CREATING DATASETS -------------------- 1️⃣ **Create Dataset from List of Values** The most flexible way to create a dataset is by creating examples from a list of inputs and optional outputs. You can add arbitrary metadata to each example. ```python from langsmith import Client client = Client() examples = [ { "inputs": {"question": "What is the largest mammal?"}, "outputs": {"answer": "The blue whale"}, "metadata": {"source": "Wikipedia"}, }, { "inputs": {"question": "What do mammals and birds have in common?"}, "outputs": {"answer": "They are both warm-blooded"}, "metadata": {"source": "Wikipedia"}, }, { "inputs": {"question": "What are reptiles known for?"}, "outputs": {"answer": "Having scales"}, "metadata": {"source": "Wikipedia"}, }, ] dataset_name = "Elementary Animal Questions" # Create the dataset dataset = client.create_dataset( dataset_name=dataset_name, description="Questions and answers about animal phylogenetics.", ) # Bulk create examples (more efficient than creating one at a time) client.create_examples( dataset_id=dataset.id, examples=examples ) ``` **Note:** For many examples, use `create_examples()` for bulk creation. For a single example, use `create_example()`. 2️⃣ **Create Dataset from Traces** You can create datasets from the runs (spans) of your traces by filtering runs and converting them to examples. ```python from langsmith import Client client = Client() dataset_name = "Example Dataset" # Filter runs to add to the dataset runs = client.list_runs( project_name="my_project", is_root=True, error=False, ) # Create the dataset dataset = client.create_dataset( dataset_name=dataset_name, description="An example dataset" ) # Prepare inputs and outputs for bulk creation examples = [ {"inputs": run.inputs, "outputs": run.outputs} for run in runs ] # Use the bulk create_examples method client.create_examples( dataset_id=dataset.id, examples=examples ) ``` 3️⃣ **Create Dataset from CSV File** You can create a dataset by uploading a CSV file. Ensure your CSV has columns that represent your input and output keys. ```python from langsmith import Client client = Client() csv_file = 'path/to/your/csvfile.csv' input_keys = ['column1', 'column2'] # Replace with your input column names output_keys = ['output1', 'output2'] # Replace with your output column names dataset = client.upload_csv( csv_file=csv_file, input_keys=input_keys, output_keys=output_keys, name="My CSV Dataset", description="Dataset created from a CSV file", data_type="kv" # "kv" or "chat" ) ``` 4️⃣ **Create Dataset from Pandas DataFrame (Python only)** The Python client offers a convenience method to upload a dataset from a pandas DataFrame. ```python from langsmith import Client import pandas as pd client = Client() # Load your data df = pd.read_parquet('path/to/your/myfile.parquet') # Or: df = pd.read_csv('path/to/your/myfile.csv') input_keys = ['column1', 'column2'] # Replace with your input column names output_keys = ['output1', 'output2'] # Replace with your output column names dataset = client.upload_dataframe( df=df, input_keys=input_keys, output_keys=output_keys, name="My Parquet Dataset", description="Dataset created from a parquet file", data_type="kv" # The default, can also be "chat" ) ``` --- 📝 DATASET STRUCTURE -------------------- Each example in a dataset should have: - `inputs` (dict): The input data for the example - `outputs` (dict, optional): The expected output data - `metadata` (dict, optional): Arbitrary metadata (e.g., source, notes, tags) --- 📤 RETURNS ---------- None This tool is documentation-only and returns None. The documentation is in the docstring. --- 🧠 NOTES FOR AGENTS -------------------- - This tool is **documentation-only** - it does not execute any code - Use `create_examples()` for bulk operations (more efficient) - Use `create_example()` for single example creation - Datasets can be of type "kv" (key-value) or "chat" (conversational) - Metadata is stored as a dictionary and can contain any key-value pairs - Always ensure you have the required dependencies installed before using these patterns - The dataset name should be unique and descriptive """ return documentation @mcp.tool() def update_examples(ctx: Context = None) -> str: """ Call this tool when you need to understand how to update dataset examples in LangSmith. """ documentation = """ Documentation tool for understanding how to update dataset examples in LangSmith. This tool provides comprehensive documentation on updating examples programmatically using the LangSmith Python SDK, including single example updates and bulk updates. --- 🧩 PURPOSE ---------- This is a **documentation-only tool** that explains how to: - Update a single example in a dataset - Bulk update multiple examples in a single request - Update inputs, outputs, metadata, and splits --- 📦 REQUIRED DEPENDENCIES ------------------------ To use the functionality described in this documentation, you need: - `langsmith` - The LangSmith Python client Install with: ```bash pip install langsmith ``` --- 🔧 UPDATING EXAMPLES -------------------- 1️⃣ **Update Single Example** You can update a single example using the `update_example()` method. You can update inputs, outputs, metadata, and split assignments. ```python from langsmith import Client client = Client() # Update a single example client.update_example( example_id=example.id, # The example ID to update inputs={"input": "updated input"}, outputs={"output": "updated output"}, metadata={"foo": "bar", "source": "updated"}, split="train" # Can be a string or list of strings ) ``` **Parameters:** - `example_id` (str, required): The ID of the example to update - `inputs` (dict, optional): Updated input data - `outputs` (dict, optional): Updated output data - `metadata` (dict, optional): Updated metadata dictionary - `split` (str or list, optional): Updated split assignment(s) 2️⃣ **Bulk Update Examples** You can update multiple examples in a single request using the `update_examples()` method. This is more efficient than updating examples one at a time. ```python from langsmith import Client client = Client() # Update multiple examples at once client.update_examples( example_ids=[example.id, example_2.id], inputs=[ {"input": "updated input 1"}, {"input": "updated input 2"} ], outputs=[ {"output": "updated output 1"}, {"output": "updated output 2"} ], metadata=[ {"foo": "baz", "source": "source1"}, {"foo": "qux", "source": "source2"} ], splits=[ ["training", "foo"], # Splits can be arrays "training" # Or standalone strings ] ) ``` **Parameters:** - `example_ids` (list[str], required): List of example IDs to update - `inputs` (list[dict], optional): List of updated input dictionaries - `outputs` (list[dict], optional): List of updated output dictionaries - `metadata` (list[dict], optional): List of updated metadata dictionaries - `splits` (list[str or list], optional): List of split assignments (can be strings or lists) **Important Notes:** - All list parameters must have the same length as `example_ids` - Each list index corresponds to the example at the same index in `example_ids` - Splits can be either a single string or a list of strings (for multiple splits) - You can update any combination of fields - you don't need to provide all parameters 3️⃣ **Partial Updates** You can update only specific fields without providing all parameters: ```python # Update only metadata for a single example client.update_example( example_id=example.id, metadata={"updated_at": "2024-01-01", "status": "reviewed"} ) # Update only outputs for multiple examples client.update_examples( example_ids=[example.id, example_2.id], outputs=[ {"output": "new output 1"}, {"output": "new output 2"} ] ) ``` --- 📝 SPLIT ASSIGNMENTS --------------------- Examples can be assigned to one or more splits (e.g., "train", "test", "validation"): ```python # Single split client.update_example(example_id=example.id, split="train") # Multiple splits client.update_example(example_id=example.id, split=["train", "validation"]) # In bulk updates, each example can have different split assignments client.update_examples( example_ids=[example.id, example_2.id], splits=["train", ["train", "test"]] ) ``` --- 📤 RETURNS ---------- None This tool is documentation-only and returns None. The documentation is in the docstring. --- 🧠 NOTES FOR AGENTS -------------------- - This tool is **documentation-only** - it does not execute any code - Use `update_examples()` for bulk operations (more efficient than single updates) - Use `update_example()` for single example updates - All list parameters in bulk updates must match the length of `example_ids` - You can update any combination of fields - partial updates are supported - Splits can be strings or lists of strings (for multiple split assignments) - Always ensure you have the required dependencies installed before using these patterns - Example IDs can be obtained from `list_examples()` or `read_example()` methods """ return documentation @mcp.tool() def run_experiment(ctx: Context = None) -> str: """ Call this tool when you need to understand how to run experiments and evaluations in LangSmith. """ documentation = """ Documentation tool for understanding how to run experiments and evaluations in LangSmith. This tool provides comprehensive documentation on running evaluations using LangSmith's evaluate SDK, creating custom evaluators, and using the openevals library for pre-built evaluators like LLM-as-judge and trajectory evaluators. --- 🧩 PURPOSE ---------- This is a **documentation-only tool** that explains how to: - Run experiments using LangSmith's `evaluate()` method - Create custom evaluator functions in Python - Use openevals library for pre-built evaluators - Set up LLM-as-judge evaluators - Use trajectory evaluators for multi-turn conversations --- 📦 REQUIRED DEPENDENCIES ------------------------ To use the functionality described in this documentation, you need: - `langsmith` - The LangSmith Python client (>=0.3.13 for evaluate) - `openevals` (optional) - For pre-built evaluators like LLM-as-judge Install with: ```bash pip install langsmith # Optional, for pre-built evaluators: pip install openevals ``` --- 🔧 RUNNING EXPERIMENTS WITH LANGSMITH ------------------------------------- 1️⃣ **Basic Evaluation Setup** The `evaluate()` method runs your application on a dataset and scores outputs using evaluators. ```python from langsmith import Client, traceable, wrappers from openai import OpenAI # Step 1: Define your application oai_client = wrappers.wrap_openai(OpenAI()) @traceable def toxicity_classifier(inputs: dict) -> dict: instructions = ( "Please review the user query below and determine if it contains any form of " "toxic behavior, such as insults, threats, or highly negative comments. " "Respond with 'Toxic' if it does and 'Not toxic' if it doesn't." ) messages = [ {"role": "system", "content": instructions}, {"role": "user", "content": inputs["text"]}, ] result = oai_client.chat.completions.create( messages=messages, model="gpt-4o-mini", temperature=0 ) return {"class": result.choices[0].message.content} # Step 2: Create or select a dataset ls_client = Client() dataset = ls_client.create_dataset(dataset_name="Toxic Queries") examples = [ {"inputs": {"text": "Shut up, idiot"}, "outputs": {"label": "Toxic"}}, {"inputs": {"text": "You're a wonderful person"}, "outputs": {"label": "Not toxic"}}, ] ls_client.create_examples(dataset_id=dataset.id, examples=examples) # Step 3: Define an evaluator def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool: return outputs["class"] == reference_outputs["label"] # Step 4: Run the evaluation results = ls_client.evaluate( toxicity_classifier, data=dataset.name, evaluators=[correct], experiment_prefix="gpt-4o-mini, baseline", description="Testing the baseline system.", max_concurrency=4, # Optional: parallelize evaluation ) ``` **Key Parameters:** - `target`: Your application function (takes inputs dict, returns outputs dict) - `data`: Dataset name or UUID, or an iterator of examples - `evaluators`: List of evaluator functions - `experiment_prefix`: Optional name prefix for the experiment - `description`: Optional experiment description - `max_concurrency`: Optional number of parallel workers 2️⃣ **Creating Custom Evaluators** Evaluators are functions that score your application's outputs. They receive: - `inputs`: The example inputs - `outputs`: Your application's actual outputs - `reference_outputs`: Expected outputs (if available) ```python # Simple boolean evaluator def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool: return outputs["class"] == reference_outputs["label"] # Evaluator with score and comment def correctness_with_feedback( inputs: dict, outputs: dict, reference_outputs: dict ) -> dict: is_correct = outputs["class"] == reference_outputs["label"] comment = "Match" if is_correct else "Mismatch" return { "key": "correctness", "score": is_correct, "comment": comment } # Evaluator that returns a float score def similarity_score( inputs: dict, outputs: dict, reference_outputs: dict ) -> float: # Calculate some similarity metric (0.0 to 1.0) return 0.85 ``` **Evaluator Return Types:** - `bool`: Binary score (True/False) - `float`: Numeric score (0.0 to 1.0) - `dict`: Full result with `key`, `score`, and optional `comment` --- 🎯 USING OPENEVALS FOR PRE-BUILT EVALUATORS ------------------------------------------- The `openevals` library provides pre-built evaluators that you can use out of the box. 1️⃣ **LLM-as-Judge Evaluators** Use an LLM to judge your application's outputs. This is useful when you need subjective evaluation or don't have reference outputs. ```python from openevals.llm import create_llm_as_judge from openevals.prompts import CORRECTNESS_PROMPT, CONCISENESS_PROMPT # Correctness evaluator (requires reference outputs) correctness_evaluator = create_llm_as_judge( prompt=CORRECTNESS_PROMPT, feedback_key="correctness", model="openai:o3-mini", ) # Conciseness evaluator (no reference needed) conciseness_evaluator = create_llm_as_judge( prompt=CONCISENESS_PROMPT, feedback_key="conciseness", model="openai:o3-mini", ) # Use with LangSmith evaluate def wrapped_correctness_evaluator(inputs, outputs, reference_outputs): return correctness_evaluator( inputs=inputs, outputs=outputs, reference_outputs=reference_outputs ) results = ls_client.evaluate( toxicity_classifier, data=dataset.name, evaluators=[wrapped_correctness_evaluator], ) ``` **Pre-built Prompts Available:** - `CORRECTNESS_PROMPT`: Evaluates correctness against reference outputs - `CONCISENESS_PROMPT`: Evaluates how concise the output is - `HALLUCINATION_PROMPT`: Checks for hallucinations (requires context) - `RAG_HELPFULNESS_PROMPT`: For RAG applications - `RAG_GROUNDEDNESS_PROMPT`: Checks if output is grounded in context - `RAG_RETRIEVAL_RELEVANCE_PROMPT`: Evaluates retrieval quality 2️⃣ **Custom LLM-as-Judge with Custom Prompts** You can create custom LLM-as-judge evaluators with your own prompts: ```python from openevals.llm import create_llm_as_judge CUSTOM_PROMPT = ''' You are an expert evaluator. Rate the output quality on a scale of 0-1. <input> {inputs} </input> <output> {outputs} </output> <reference> {reference_outputs} </reference> ''' custom_evaluator = create_llm_as_judge( prompt=CUSTOM_PROMPT, feedback_key="quality", model="openai:o3-mini", continuous=True, # Returns float (0.0-1.0) instead of boolean ) ``` 3️⃣ **Trajectory Evaluators for Multi-turn Conversations** Trajectory evaluators evaluate entire conversation threads, useful for chat applications and agents: ```python from openevals.llm import create_llm_as_judge # Create a trajectory evaluator that looks at the full conversation trajectory_evaluator = create_llm_as_judge( model="openai:o3-mini", prompt="Based on the below conversation, was the user satisfied?\\n{outputs}", feedback_key="satisfaction", ) # When using with evaluate, the outputs will be the full conversation trajectory def wrapped_trajectory_evaluator(inputs, outputs, reference_outputs): # outputs here will be a list of messages representing the conversation return trajectory_evaluator(outputs=outputs) ``` 4️⃣ **Other Pre-built Evaluators** OpenEvals also provides evaluators for: - **Exact Match**: Compare outputs exactly - **Embedding Similarity**: Compare using embeddings - **Levenshtein Distance**: String similarity - **Code Evaluators**: Type checking, execution (requires additional setup) ```python from openevals.exact import exact_match # Simple exact match evaluator def exact_match_evaluator(inputs, outputs, reference_outputs): return exact_match(outputs=outputs, reference_outputs=reference_outputs) ``` --- 📝 EVALUATOR WRAPPER PATTERN ----------------------------- When using openevals evaluators with LangSmith's `evaluate()`, you may need to wrap them to match the expected signature: ```python def wrap_openevals_evaluator(openevals_evaluator): def wrapped(inputs, outputs, reference_outputs): # openevals evaluators may have different parameter names result = openevals_evaluator( inputs=inputs, outputs=outputs, reference_outputs=reference_outputs ) return result return wrapped # Usage wrapped_evaluator = wrap_openevals_evaluator(correctness_evaluator) results = ls_client.evaluate( app_function, data=dataset.name, evaluators=[wrapped_evaluator], ) ``` --- 📤 RETURNS ---------- None This tool is documentation-only and returns None. The documentation is in the docstring. --- 🧠 NOTES FOR AGENTS -------------------- - This tool is **documentation-only** - it does not execute any code - Use `evaluate()` for synchronous evaluation, `aevaluate()` for async (better for large jobs) - Set `max_concurrency` to parallelize evaluation across multiple workers - Custom evaluators can return bool, float, or dict with `key`, `score`, `comment` - OpenEvals evaluators are pre-built and tested - use them when possible - LLM-as-judge evaluators are flexible but cost money (API calls to judge model) - Trajectory evaluators are useful for multi-turn conversations and agent evaluation - Always ensure you have the required dependencies installed before using these patterns - For agent-specific evaluations, consider using the `agentevals` package - Evaluation results are stored as feedback in LangSmith and can be viewed in the UI """ return documentation

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/langchain-ai/langsmith-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

register_tools.py•63.4 KiB