extract_from_url
Extract structured data from web content using AI. Specify a URL and extraction instructions to retrieve organized information from articles, documents, or text-based web pages.
Instructions
Extract structured information from text content at a URL.
Downloads text from the specified URL and extracts structured information using Large Language Models. Ideal for processing web articles, documents, or any text content accessible via HTTP/HTTPS.
Args:
- `url`: URL to download text from (must start with http:// or https://)
- `prompt_description`: Clear instructions for what to extract
- `examples`: List of example extractions to guide the model (shape sketched after this section)
- `model_id`: LLM model to use (default: `gemini-2.5-flash`)
- `max_char_buffer`: Max characters per chunk (default: 1000)
- `temperature`: Sampling temperature, 0.0-1.0 (default: 0.5)
- `extraction_passes`: Number of extraction passes for better recall (default: 1)
- `max_workers`: Max parallel workers (default: 10)
Returns: Dictionary containing extracted entities with source locations and metadata
Raises: ToolError: If URL is invalid, download fails, or extraction fails
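The `examples` argument expects a specific dictionary shape. The sketch below is inferred from how the server converts examples internally (`_create_langextract_examples` in the Implementation Reference); the sentence, classes, and attribute values are invented placeholders:

```python
# Illustrative shape for the `examples` argument, inferred from
# _create_langextract_examples (see Implementation Reference below).
# The text and attribute values are made-up placeholders.
examples = [
    {
        "text": "Marie Curie won the Nobel Prize in Physics in 1903.",
        "extractions": [
            {
                "extraction_class": "person",
                "extraction_text": "Marie Curie",
                "attributes": {"role": "laureate"},  # optional; defaults to {}
            },
            {
                "extraction_class": "award",
                "extraction_text": "Nobel Prize in Physics",
                "attributes": {"year": "1903"},
            },
        ],
    }
]
```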
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | URL to download text from (must start with http:// or https://) | |
| prompt_description | Yes | Clear instructions for what to extract | |
| examples | Yes | List of example extractions to guide the model | |
| model_id | No | LLM model to use | gemini-2.5-flash |
| max_char_buffer | No | Max characters per chunk | 1000 |
| temperature | No | Sampling temperature (0.0-1.0) | 0.5 |
| extraction_passes | No | Number of extraction passes for better recall | 1 |
| max_workers | No | Max parallel workers | 10 |
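Putting the schema together, a minimal sketch of an `extract_from_url` call follows; the URL, prompt, and example content are placeholders, and every optional field is shown at its default:

```python
# Hypothetical arguments for an extract_from_url tool call. The URL, prompt,
# and example are placeholders; optional fields show their defaults.
tool_arguments = {
    "url": "https://example.com/article",
    "prompt_description": "Extract every person mentioned, with their role.",
    "examples": [
        {
            "text": "Dr. Ada Lovelace wrote the first algorithm.",
            "extractions": [
                {
                    "extraction_class": "person",
                    "extraction_text": "Dr. Ada Lovelace",
                    "attributes": {"role": "mathematician"},
                }
            ],
        }
    ],
    "model_id": "gemini-2.5-flash",
    "max_char_buffer": 1000,
    "temperature": 0.5,
    "extraction_passes": 1,
    "max_workers": 10,
}
```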
Implementation Reference
- `src/langextract_mcp/server.py:319-402` (handler): The primary handler function for the `extract_from_url` MCP tool. Decorated with `@mcp.tool` for automatic registration. It validates the URL and parameters, creates an `ExtractionConfig`, retrieves the API key, calls `_langextract_client.extract` with the URL, and returns formatted results via `_format_extraction_result`.

```python
@mcp.tool
def extract_from_url(
    url: str,
    prompt_description: str,
    examples: list[dict[str, Any]],
    model_id: str = "gemini-2.5-flash",
    max_char_buffer: int = 1000,
    temperature: float = 0.5,
    extraction_passes: int = 1,
    max_workers: int = 10
) -> dict[str, Any]:
    """
    Extract structured information from text content at a URL.

    Downloads text from the specified URL and extracts structured information
    using Large Language Models. Ideal for processing web articles, documents,
    or any text content accessible via HTTP/HTTPS.

    Args:
        url: URL to download text from (must start with http:// or https://)
        prompt_description: Clear instructions for what to extract
        examples: List of example extractions to guide the model
        model_id: LLM model to use (default: "gemini-2.5-flash")
        max_char_buffer: Max characters per chunk (default: 1000)
        temperature: Sampling temperature 0.0-1.0 (default: 0.5)
        extraction_passes: Number of extraction passes for better recall (default: 1)
        max_workers: Max parallel workers (default: 10)

    Returns:
        Dictionary containing extracted entities with source locations and metadata

    Raises:
        ToolError: If URL is invalid, download fails, or extraction fails
    """
    try:
        if not url.startswith(('http://', 'https://')):
            raise ToolError("URL must start with http:// or https://")

        if not examples:
            raise ToolError("At least one example is required for reliable extraction")

        if not prompt_description.strip():
            raise ToolError("Prompt description cannot be empty")

        # Validate that only Gemini models are supported
        if not model_id.startswith('gemini'):
            raise ToolError(
                f"Only Google Gemini models are supported. Got: {model_id}. "
                f"Use 'list_supported_models' tool to see available options."
            )

        # Create config object from individual parameters
        config = ExtractionConfig(
            model_id=model_id,
            max_char_buffer=max_char_buffer,
            temperature=temperature,
            extraction_passes=extraction_passes,
            max_workers=max_workers
        )

        # Get API key (server-side only for security)
        api_key = _get_api_key()
        if not api_key:
            raise ToolError(
                "API key required. Server administrator must set LANGEXTRACT_API_KEY environment variable."
            )

        # Perform optimized extraction using cached client
        result = _langextract_client.extract(
            text_or_url=url,
            prompt_description=prompt_description,
            examples=examples,
            config=config,
            api_key=api_key
        )

        return _format_extraction_result(result, config, source_url=url)

    except ValueError as e:
        raise ToolError(f"Invalid parameters: {str(e)}")
    except Exception as e:
        raise ToolError(f"URL extraction failed: {str(e)}")
```
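For quick reference, the guard clauses at the top of the handler reject the following kinds of input; the values are illustrative and the messages paraphrase the `ToolError` text above:

```python
# Inputs the guard clauses reject, mapped to paraphrased ToolError messages.
# All values here are illustrative, not taken from real requests.
rejected = {
    "url='ftp://host/doc.txt'": "URL must start with http:// or https://",
    "examples=[]": "At least one example is required for reliable extraction",
    "prompt_description='   '": "Prompt description cannot be empty",
    "model_id='gpt-4o'": "Only Google Gemini models are supported",
}
```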
- `src/langextract_mcp/server.py:21-28` (schema): Pydantic `ExtractionConfig` model defining the input parameters for the extraction process, used by both the `extract_from_text` and `extract_from_url` tools.

```python
class ExtractionConfig(BaseModel):
    """Configuration for extraction parameters."""
    model_id: str = Field(default="gemini-2.5-flash", description="LLM model to use")
    max_char_buffer: int = Field(default=1000, description="Max characters per chunk")
    temperature: float = Field(default=0.5, description="Sampling temperature (0.0-1.0)")
    extraction_passes: int = Field(default=1, description="Number of extraction passes for better recall")
    max_workers: int = Field(default=10, description="Max parallel workers")
```
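Since `ExtractionConfig` is a standard Pydantic model, unspecified fields fall back to their `Field` defaults. A minimal sketch, assuming the class defined above:

```python
# Minimal sketch: only temperature and extraction_passes are overridden,
# everything else falls back to the Field defaults shown above.
config = ExtractionConfig(temperature=0.2, extraction_passes=3)
assert config.model_id == "gemini-2.5-flash"
assert config.max_char_buffer == 1000
assert config.max_workers == 10
```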
- `src/langextract_mcp/server.py:38-190` (helper): `LangExtractClient` class implementing the core extraction logic the tool handlers delegate to. It caches language models, schemas, and resolvers, and performs the actual annotation using the langextract library.

```python
class LangExtractClient:
    """Optimized langextract client for MCP server usage.

    This client maintains persistent connections and caches expensive
    operations like schema generation and prompt templates for better
    performance in a long-running MCP server context.
    """

    def __init__(self):
        self._language_models: dict[str, Any] = {}
        self._schema_cache: dict[str, Any] = {}
        self._prompt_template_cache: dict[str, Any] = {}
        self._resolver_cache: dict[str, Any] = {}

    def _get_examples_hash(self, examples: list[dict[str, Any]]) -> str:
        """Generate a hash for caching based on examples."""
        examples_str = json.dumps(examples, sort_keys=True)
        return hashlib.md5(examples_str.encode()).hexdigest()

    def _get_language_model(self, config: ExtractionConfig, api_key: str,
                            schema: Any | None = None, schema_hash: str | None = None) -> Any:
        """Get or create a cached language model instance."""
        # Include schema hash in cache key to prevent schema mutation conflicts
        model_key = f"{config.model_id}_{config.temperature}_{config.max_workers}_{schema_hash or 'no_schema'}"

        if model_key not in self._language_models:
            # Validate that only Gemini models are supported
            if not config.model_id.startswith('gemini'):
                raise ValueError(f"Only Gemini models are supported. Got: {config.model_id}")

            language_model = lx.inference.GeminiLanguageModel(
                model_id=config.model_id,
                api_key=api_key,
                temperature=config.temperature,
                max_workers=config.max_workers,
                gemini_schema=schema
            )
            self._language_models[model_key] = language_model

        return self._language_models[model_key]

    def _get_schema(self, examples: list[dict[str, Any]], model_id: str) -> tuple[Any, str]:
        """Get or create a cached schema for the examples.

        Returns:
            Tuple of (schema, examples_hash) for use in caching language models
        """
        if not model_id.startswith('gemini'):
            return None, ""

        examples_hash = self._get_examples_hash(examples)
        schema_key = f"{model_id}_{examples_hash}"

        if schema_key not in self._schema_cache:
            # Convert examples to langextract format
            langextract_examples = self._create_langextract_examples(examples)

            # Create prompt template to generate schema
            prompt_template = lx.prompting.PromptTemplateStructured(description="Schema generation")
            prompt_template.examples.extend(langextract_examples)

            # Generate schema
            schema = lx.schema.GeminiSchema.from_examples(prompt_template.examples)
            self._schema_cache[schema_key] = schema

        return self._schema_cache[schema_key], examples_hash

    def _get_resolver(self, format_type: str = "JSON") -> Any:
        """Get or create a cached resolver."""
        if format_type not in self._resolver_cache:
            resolver = lx.resolver.Resolver(
                fence_output=False,
                format_type=lx.data.FormatType.JSON if format_type == "JSON" else lx.data.FormatType.YAML,
                extraction_attributes_suffix="_attributes",
                extraction_index_suffix=None,
            )
            self._resolver_cache[format_type] = resolver

        return self._resolver_cache[format_type]

    def _create_langextract_examples(self, examples: list[dict[str, Any]]) -> list[lx.data.ExampleData]:
        """Convert dictionary examples to langextract ExampleData objects."""
        langextract_examples = []

        for example in examples:
            extractions = []
            for extraction_data in example["extractions"]:
                extractions.append(
                    lx.data.Extraction(
                        extraction_class=extraction_data["extraction_class"],
                        extraction_text=extraction_data["extraction_text"],
                        attributes=extraction_data.get("attributes", {})
                    )
                )

            langextract_examples.append(
                lx.data.ExampleData(
                    text=example["text"],
                    extractions=extractions
                )
            )

        return langextract_examples

    def extract(
        self,
        text_or_url: str,
        prompt_description: str,
        examples: list[dict[str, Any]],
        config: ExtractionConfig,
        api_key: str
    ) -> lx.data.AnnotatedDocument:
        """Optimized extraction using cached components."""
        # Get or generate schema first
        schema, examples_hash = self._get_schema(examples, config.model_id)

        # Get cached components with schema-aware caching
        language_model = self._get_language_model(config, api_key, schema, examples_hash)
        resolver = self._get_resolver("JSON")

        # Convert examples
        langextract_examples = self._create_langextract_examples(examples)

        # Create prompt template
        prompt_template = lx.prompting.PromptTemplateStructured(
            description=prompt_description
        )
        prompt_template.examples.extend(langextract_examples)

        # Create annotator
        annotator = lx.annotation.Annotator(
            language_model=language_model,
            prompt_template=prompt_template,
            format_type=lx.data.FormatType.JSON,
            fence_output=False,
        )

        # Perform extraction
        if text_or_url.startswith(('http://', 'https://')):
            # Download text first
            text = lx.io.download_text_from_url(text_or_url)
        else:
            text = text_or_url

        return annotator.annotate_text(
            text=text,
            resolver=resolver,
            max_char_buffer=config.max_char_buffer,
            batch_length=10,
            additional_context=None,
            debug=False,  # Disable debug for cleaner MCP output
            extraction_passes=config.extraction_passes,
        )
```
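The client keys its caches on both the model settings and a hash of the examples, so repeated requests with identical examples reuse the same schema and model instance. A self-contained sketch of that key derivation, mirroring `_get_examples_hash` and `_get_language_model` above (the model settings in the key are illustrative):

```python
import hashlib
import json

def examples_hash(examples: list[dict]) -> str:
    # Same idea as LangExtractClient._get_examples_hash: a deterministic
    # MD5 over the JSON-serialized examples (sort_keys keeps it stable).
    return hashlib.md5(json.dumps(examples, sort_keys=True).encode()).hexdigest()

# Cache key in the style of _get_language_model:
# "{model_id}_{temperature}_{max_workers}_{examples_hash}"
key = f"gemini-2.5-flash_0.5_10_{examples_hash([{'text': 'a', 'extractions': []}])}"
print(key)
```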
- `_format_extraction_result` (helper): Converts the langextract `AnnotatedDocument` result into the dictionary format returned by the tool.

```python
def _format_extraction_result(result: lx.data.AnnotatedDocument,
                              config: ExtractionConfig,
                              source_url: str | None = None) -> dict[str, Any]:
    """Format langextract result for MCP response."""
    extractions = []
    for extraction in result.extractions or []:
        extractions.append({
            "extraction_class": extraction.extraction_class,
            "extraction_text": extraction.extraction_text,
            "attributes": extraction.attributes,
            "start_char": getattr(extraction, 'start_char', None),
            "end_char": getattr(extraction, 'end_char', None),
        })

    response = {
        "document_id": result.document_id if result.document_id else "anonymous",
        "total_extractions": len(extractions),
        "extractions": extractions,
        "metadata": {
            "model_id": config.model_id,
            "extraction_passes": config.extraction_passes,
            "max_char_buffer": config.max_char_buffer,
            "temperature": config.temperature,
        }
    }

    if source_url:
        response["source_url"] = source_url

    return response
```
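Based on the formatter above, a successful response looks roughly like the following; all values here are invented, but the keys match the code:

```python
# Illustrative response shape produced by _format_extraction_result.
# Every value is made up; only the keys are taken from the formatter above.
response = {
    "document_id": "anonymous",
    "total_extractions": 1,
    "extractions": [
        {
            "extraction_class": "person",
            "extraction_text": "Marie Curie",
            "attributes": {"role": "laureate"},
            "start_char": 0,
            "end_char": 11,
        }
    ],
    "metadata": {
        "model_id": "gemini-2.5-flash",
        "extraction_passes": 1,
        "max_char_buffer": 1000,
        "temperature": 0.5,
    },
    "source_url": "https://example.com/article",
}
```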
- `_get_api_key` (helper): Retrieves the required Google Gemini API key from the `LANGEXTRACT_API_KEY` environment variable.

```python
def _get_api_key() -> str | None:
    """Get API key from environment (server-side only for security)."""
    return os.environ.get("LANGEXTRACT_API_KEY")
```