extract_from_url
Extract structured data from web content using AI. Specify a URL and extraction instructions to retrieve organized information from articles, documents, or text-based web pages.
Instructions
Extract structured information from text content at a URL.
Downloads text from the specified URL and extracts structured information using Large Language Models. Ideal for processing web articles, documents, or any text content accessible via HTTP/HTTPS.
Args:
- url: URL to download text from (must start with http:// or https://)
- prompt_description: Clear instructions for what to extract
- examples: List of example extractions to guide the model
- model_id: LLM model to use (default: "gemini-2.5-flash")
- max_char_buffer: Max characters per chunk (default: 1000)
- temperature: Sampling temperature 0.0-1.0 (default: 0.5)
- extraction_passes: Number of extraction passes for better recall (default: 1)
- max_workers: Max parallel workers (default: 10)
Returns: Dictionary containing extracted entities with source locations and metadata
Raises: ToolError: If URL is invalid, download fails, or extraction fails
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | URL to download text from (must start with http:// or https://) | |
| prompt_description | Yes | Clear instructions for what to extract | |
| examples | Yes | List of example extractions to guide the model | |
| model_id | No | LLM model to use | gemini-2.5-flash |
| max_char_buffer | No | Max characters per chunk | 1000 |
| temperature | No | Sampling temperature 0.0-1.0 | 0.5 |
| extraction_passes | No | Number of extraction passes for better recall | 1 |
| max_workers | No | Max parallel workers | 10 |
Implementation Reference
- src/langextract_mcp/server.py:319-402 (handler): The primary handler function for the 'extract_from_url' MCP tool. Decorated with @mcp.tool for automatic registration. Validates URL and parameters, creates ExtractionConfig, retrieves the API key, calls _langextract_client.extract(url), and returns formatted results using _format_extraction_result.
@mcp.tool def extract_from_url( url: str, prompt_description: str, examples: list[dict[str, Any]], model_id: str = "gemini-2.5-flash", max_char_buffer: int = 1000, temperature: float = 0.5, extraction_passes: int = 1, max_workers: int = 10 ) -> dict[str, Any]: """ Extract structured information from text content at a URL. Downloads text from the specified URL and extracts structured information using Large Language Models. Ideal for processing web articles, documents, or any text content accessible via HTTP/HTTPS. Args: url: URL to download text from (must start with http:// or https://) prompt_description: Clear instructions for what to extract examples: List of example extractions to guide the model model_id: LLM model to use (default: "gemini-2.5-flash") max_char_buffer: Max characters per chunk (default: 1000) temperature: Sampling temperature 0.0-1.0 (default: 0.5) extraction_passes: Number of extraction passes for better recall (default: 1) max_workers: Max parallel workers (default: 10) Returns: Dictionary containing extracted entities with source locations and metadata Raises: ToolError: If URL is invalid, download fails, or extraction fails """ try: if not url.startswith(('http://', 'https://')): raise ToolError("URL must start with http:// or https://") if not examples: raise ToolError("At least one example is required for reliable extraction") if not prompt_description.strip(): raise ToolError("Prompt description cannot be empty") # Validate that only Gemini models are supported if not model_id.startswith('gemini'): raise ToolError( f"Only Google Gemini models are supported. Got: {model_id}. " f"Use 'list_supported_models' tool to see available options." 
) # Create config object from individual parameters config = ExtractionConfig( model_id=model_id, max_char_buffer=max_char_buffer, temperature=temperature, extraction_passes=extraction_passes, max_workers=max_workers ) # Get API key (server-side only for security) api_key = _get_api_key() if not api_key: raise ToolError( "API key required. Server administrator must set LANGEXTRACT_API_KEY environment variable." ) # Perform optimized extraction using cached client result = _langextract_client.extract( text_or_url=url, prompt_description=prompt_description, examples=examples, config=config, api_key=api_key ) return _format_extraction_result(result, config, source_url=url) except ValueError as e: raise ToolError(f"Invalid parameters: {str(e)}") except Exception as e: raise ToolError(f"URL extraction failed: {str(e)}") - src/langextract_mcp/server.py:21-28 (schema)Pydantic ExtractionConfig model defining input parameters for the extraction process, used by both extract_from_text and extract_from_url tools.
class ExtractionConfig(BaseModel):
    """Configuration for extraction parameters."""
    # Defaults here mirror the keyword defaults exposed by the tool handlers.
    model_id: str = Field(default="gemini-2.5-flash", description="LLM model to use")
    max_char_buffer: int = Field(default=1000, description="Max characters per chunk")
    temperature: float = Field(default=0.5, description="Sampling temperature (0.0-1.0)")
    extraction_passes: int = Field(default=1, description="Number of extraction passes for better recall")
    max_workers: int = Field(default=10, description="Max parallel workers")
- src/langextract_mcp/server.py:38-190 (helper): LangExtractClient class implementing the core extraction logic delegated to by the tool handlers. Provides caching for language models, schemas, resolvers, and performs the actual annotation using langextract library.
class LangExtractClient:
    """Optimized langextract client for MCP server usage.

    This client maintains persistent connections and caches expensive operations
    like schema generation and prompt templates for better performance in a
    long-running MCP server context.
    """

    def __init__(self):
        # Process-lifetime caches; keys encode the config/examples they were
        # built from (see the individual _get_* helpers).
        self._language_models: dict[str, Any] = {}
        self._schema_cache: dict[str, Any] = {}
        self._prompt_template_cache: dict[str, Any] = {}
        self._resolver_cache: dict[str, Any] = {}

    def _get_examples_hash(self, examples: list[dict[str, Any]]) -> str:
        """Generate a hash for caching based on examples."""
        # MD5 is used purely as a cache fingerprint, not for security.
        # sort_keys makes the hash stable across dict ordering.
        examples_str = json.dumps(examples, sort_keys=True)
        return hashlib.md5(examples_str.encode()).hexdigest()

    def _get_language_model(self, config: ExtractionConfig, api_key: str, schema: Any | None = None, schema_hash: str | None = None) -> Any:
        """Get or create a cached language model instance."""
        # Include schema hash in cache key to prevent schema mutation conflicts
        model_key = f"{config.model_id}_{config.temperature}_{config.max_workers}_{schema_hash or 'no_schema'}"

        if model_key not in self._language_models:
            # Validate that only Gemini models are supported
            if not config.model_id.startswith('gemini'):
                raise ValueError(f"Only Gemini models are supported. Got: {config.model_id}")

            language_model = lx.inference.GeminiLanguageModel(
                model_id=config.model_id,
                api_key=api_key,
                temperature=config.temperature,
                max_workers=config.max_workers,
                gemini_schema=schema
            )
            self._language_models[model_key] = language_model

        return self._language_models[model_key]

    def _get_schema(self, examples: list[dict[str, Any]], model_id: str) -> tuple[Any, str]:
        """Get or create a cached schema for the examples.

        Returns:
            Tuple of (schema, examples_hash) for use in caching language models
        """
        # Schema generation only applies to Gemini models; other model ids get
        # no schema and an empty hash.
        if not model_id.startswith('gemini'):
            return None, ""

        examples_hash = self._get_examples_hash(examples)
        schema_key = f"{model_id}_{examples_hash}"

        if schema_key not in self._schema_cache:
            # Convert examples to langextract format
            langextract_examples = self._create_langextract_examples(examples)

            # Create prompt template to generate schema
            prompt_template = lx.prompting.PromptTemplateStructured(description="Schema generation")
            prompt_template.examples.extend(langextract_examples)

            # Generate schema
            schema = lx.schema.GeminiSchema.from_examples(prompt_template.examples)
            self._schema_cache[schema_key] = schema

        return self._schema_cache[schema_key], examples_hash

    def _get_resolver(self, format_type: str = "JSON") -> Any:
        """Get or create a cached resolver."""
        # Any value other than "JSON" is treated as YAML.
        if format_type not in self._resolver_cache:
            resolver = lx.resolver.Resolver(
                fence_output=False,
                format_type=lx.data.FormatType.JSON if format_type == "JSON" else lx.data.FormatType.YAML,
                extraction_attributes_suffix="_attributes",
                extraction_index_suffix=None,
            )
            self._resolver_cache[format_type] = resolver

        return self._resolver_cache[format_type]

    def _create_langextract_examples(self, examples: list[dict[str, Any]]) -> list[lx.data.ExampleData]:
        """Convert dictionary examples to langextract ExampleData objects.

        Each input dict is expected to have "text" and "extractions" keys;
        each extraction dict needs "extraction_class" and "extraction_text",
        with optional "attributes".
        """
        langextract_examples = []
        for example in examples:
            extractions = []
            for extraction_data in example["extractions"]:
                extractions.append(
                    lx.data.Extraction(
                        extraction_class=extraction_data["extraction_class"],
                        extraction_text=extraction_data["extraction_text"],
                        attributes=extraction_data.get("attributes", {})
                    )
                )
            langextract_examples.append(
                lx.data.ExampleData(
                    text=example["text"],
                    extractions=extractions
                )
            )
        return langextract_examples

    def extract(
        self,
        text_or_url: str,
        prompt_description: str,
        examples: list[dict[str, Any]],
        config: ExtractionConfig,
        api_key: str
    ) -> lx.data.AnnotatedDocument:
        """Optimized extraction using cached components."""
        # Get or generate schema first
        schema, examples_hash = self._get_schema(examples, config.model_id)

        # Get cached components with schema-aware caching
        language_model = self._get_language_model(config, api_key, schema, examples_hash)
        resolver = self._get_resolver("JSON")

        # Convert examples
        langextract_examples = self._create_langextract_examples(examples)

        # Create prompt template
        prompt_template = lx.prompting.PromptTemplateStructured(
            description=prompt_description
        )
        prompt_template.examples.extend(langextract_examples)

        # Create annotator
        annotator = lx.annotation.Annotator(
            language_model=language_model,
            prompt_template=prompt_template,
            format_type=lx.data.FormatType.JSON,
            fence_output=False,
        )

        # Perform extraction
        if text_or_url.startswith(('http://', 'https://')):
            # Download text first
            text = lx.io.download_text_from_url(text_or_url)
        else:
            text = text_or_url

        return annotator.annotate_text(
            text=text,
            resolver=resolver,
            max_char_buffer=config.max_char_buffer,
            batch_length=10,
            additional_context=None,
            debug=False,  # Disable debug for cleaner MCP output
            extraction_passes=config.extraction_passes,
        )
- _format_extraction_result helper function that converts the langextract AnnotatedDocument result into the dictionary format returned by the tool.
def _format_extraction_result(result: lx.data.AnnotatedDocument, config: ExtractionConfig, source_url: str | None = None) -> dict[str, Any]: """Format langextract result for MCP response.""" extractions = [] for extraction in result.extractions or []: extractions.append({ "extraction_class": extraction.extraction_class, "extraction_text": extraction.extraction_text, "attributes": extraction.attributes, "start_char": getattr(extraction, 'start_char', None), "end_char": getattr(extraction, 'end_char', None), }) response = { "document_id": result.document_id if result.document_id else "anonymous", "total_extractions": len(extractions), "extractions": extractions, "metadata": { "model_id": config.model_id, "extraction_passes": config.extraction_passes, "max_char_buffer": config.max_char_buffer, "temperature": config.temperature, } } if source_url: response["source_url"] = source_url return response - _get_api_key helper function that retrieves the required Google Gemini API key from environment variable.
def _get_api_key() -> str | None: """Get API key from environment (server-side only for security).""" return os.environ.get("LANGEXTRACT_API_KEY")