extract_data
Extract structured data from web pages: tables, lists, JSON-LD, or specific fields selected with CSS selectors, for efficient information retrieval.
Instructions
Extract structured data from web pages.
Extracts tables, lists, or specific fields from HTML pages and returns
structured data. Much more efficient than parsing full page text.
Extract Types:
- "table": Extract HTML tables as list of dicts
- "list": Extract lists (ul/ol/dl) as structured list
- "fields": Extract specific elements using CSS selectors
- "json-ld": Extract JSON-LD structured data
- "auto": Automatically detect and extract structured content
Examples:
- extract_data("https://pypi.org/project/fastapi/", reasoning="Get package info")
- extract_data("https://github.com/user/repo/releases", reasoning="Get releases", extract_type="list")
- extract_data(
"https://example.com/product",
reasoning="Extract product details",
extract_type="fields",
selectors={"price": ".price", "title": "h1.product-name"}
)
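On success the tool returns a pretty-printed JSON string whose top-level shape depends on extract_type. As a rough sketch, based on the result layout built in the handler under Implementation Reference, a "table" extraction looks like the following (all values are placeholders):

```python
# Illustrative result shape for extract_type="table"; values are made up.
table_result = {
    "type": "table",
    "tables": [
        {
            "caption": "Release history",                          # <caption> text, or None
            "headers": ["Version", "Date"],                        # column names
            "rows": [{"Version": "1.0.0", "Date": "2024-01-01"}],  # one dict per row, keyed by header
        }
    ],
    "source": "https://example.com/releases",
}
```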
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | HTTP(S) URL to extract structured data from | |
| reasoning | Yes | Why you're extracting data from this URL (required for analytics) | |
| extract_type | No | Extraction type: "table", "list", "fields", "json-ld", or "auto" | auto |
| selectors | No | CSS selectors for field extraction (only used with extract_type='fields') | null |
| max_items | No | Maximum number of items to extract | 100 |
Input Schema (JSON Schema)
{
"properties": {
"extract_type": {
"default": "auto",
"enum": [
"table",
"list",
"fields",
"json-ld",
"auto"
],
"title": "Extract Type",
"type": "string"
},
"max_items": {
"default": 100,
"title": "Max Items",
"type": "integer"
},
"reasoning": {
"title": "Reasoning",
"type": "string"
},
"selectors": {
"anyOf": [
{
"additionalProperties": {
"type": "string"
},
"type": "object"
},
{
"type": "null"
}
],
"default": null,
"title": "Selectors"
},
"url": {
"title": "Url",
"type": "string"
}
},
"required": [
"url",
"reasoning"
],
"type": "object"
}
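Note that the schema alone does not capture one runtime constraint visible in the handler below: when extract_type is "fields", selectors must be provided or the call fails with a ValueError, even though the schema marks it optional. A minimal arguments dict that satisfies both the schema and that constraint (URL and selectors are placeholders):

```python
# Example tool arguments; valid against the schema above.
# "selectors" is only consulted when extract_type == "fields",
# but in that case the handler requires it.
arguments = {
    "url": "https://example.com/product",
    "reasoning": "Extract product details",
    "extract_type": "fields",
    "selectors": {"price": ".price", "title": "h1.product-name"},
    "max_items": 100,  # default when omitted
}
```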
Implementation Reference
- src/searxng_mcp/server.py:1195-1304 (handler): The handler function for the 'extract_data' tool. It fetches HTML with CrawlerClient and extracts structured data (tables, lists, fields, JSON-LD, or auto) via DataExtractor according to the extract_type parameter. The input schema is declared through Annotated types and the docstring, and the tool is registered via the @mcp.tool() decorator under the function name.

```python
@mcp.tool()
async def extract_data(
    url: Annotated[str, "HTTP(S) URL to extract structured data from"],
    reasoning: Annotated[str, "Why you're extracting data from this URL (required for analytics)"],
    extract_type: Annotated[
        Literal["table", "list", "fields", "json-ld", "auto"],
        'Extraction type: "table", "list", "fields", "json-ld", or "auto"',
    ] = "auto",
    selectors: Annotated[
        dict[str, str] | None,
        "CSS selectors for field extraction (only used with extract_type='fields')",
    ] = None,
    max_items: Annotated[int, "Maximum number of items to extract"] = 100,
) -> str:
    """
    Extract structured data from web pages.

    Extracts tables, lists, or specific fields from HTML pages and returns
    structured data. Much more efficient than parsing full page text.

    Extract Types:
    - "table": Extract HTML tables as list of dicts
    - "list": Extract lists (ul/ol/dl) as structured list
    - "fields": Extract specific elements using CSS selectors
    - "json-ld": Extract JSON-LD structured data
    - "auto": Automatically detect and extract structured content

    Examples:
    - extract_data("https://pypi.org/project/fastapi/", reasoning="Get package info")
    - extract_data("https://github.com/user/repo/releases", reasoning="Get releases", extract_type="list")
    - extract_data(
        "https://example.com/product",
        reasoning="Extract product details",
        extract_type="fields",
        selectors={"price": ".price", "title": "h1.product-name"}
      )
    """
    import json

    start_time = time.time()
    success = False
    error_msg = None
    result = ""

    try:
        # Fetch raw HTML
        html = await crawler_client.fetch_raw(url)

        # Extract based on type
        if extract_type == "table":
            tables = data_extractor.extract_tables(html, max_tables=max_items)
            extracted_data = {
                "type": "table",
                "tables": [
                    {
                        "caption": t.caption,
                        "headers": t.headers,
                        "rows": t.rows[:max_items],
                    }
                    for t in tables
                ],
                "source": url,
            }
        elif extract_type == "list":
            lists = data_extractor.extract_lists(html, max_lists=max_items)
            extracted_data = {
                "type": "list",
                "lists": [{"title": li.title, "items": li.items[:max_items]} for li in lists],
                "source": url,
            }
        elif extract_type == "fields":
            if not selectors:
                raise ValueError("selectors parameter is required for extract_type='fields'")
            fields = data_extractor.extract_fields(html, selectors)
            extracted_data = {"type": "fields", "data": fields, "source": url}
        elif extract_type == "json-ld":
            json_ld = data_extractor.extract_json_ld(html)
            extracted_data = {"type": "json-ld", "data": json_ld, "source": url}
        else:  # auto
            auto_data = data_extractor.auto_extract(html)
            extracted_data = {"type": "auto", "data": auto_data, "source": url}

        # Format result
        result = json.dumps(extracted_data, indent=2, ensure_ascii=False)
        result = clamp_text(result, MAX_RESPONSE_CHARS)
        success = True

    except Exception as exc:  # noqa: BLE001
        error_msg = str(exc)
        result = f"Data extraction failed for {url}: {exc}"

    finally:
        # Track usage
        response_time = (time.time() - start_time) * 1000
        tracker.track_usage(
            tool_name="extract_data",
            reasoning=reasoning,
            parameters={
                "url": url,
                "extract_type": extract_type,
                "has_selectors": selectors is not None,
                "max_items": max_items,
            },
            response_time_ms=response_time,
            success=success,
            error_message=error_msg,
            response_size=len(result.encode("utf-8")),
        )

    return result
```
- src/searxng_mcp/extractor.py:52-237 (helper): DataExtractor class with the core extraction methods (extract_tables, extract_lists, extract_fields, extract_json_ld, auto_extract) called by the extract_data handler. The module also includes the supporting dataclasses TableData and ListData and the _sanitize_text utility.

```python
class DataExtractor:
    """Extract structured data from HTML."""

    def extract_tables(self, html: str, max_tables: int = 5) -> list[TableData]:
        """Extract HTML tables.

        Args:
            html: Raw HTML content
            max_tables: Maximum number of tables to extract

        Returns:
            List of TableData objects with caption, headers, and rows
        """
        soup = BeautifulSoup(html, "html.parser")
        tables = []

        for table in soup.find_all("table")[:max_tables]:
            # Extract caption
            caption_elem = table.find("caption")
            caption = _sanitize_text(caption_elem.get_text(strip=True)) if caption_elem else None

            # Extract headers
            headers = []
            header_row = table.find("thead")
            if header_row:
                headers = [
                    _sanitize_text(th.get_text(strip=True)) for th in header_row.find_all("th")
                ]
            else:
                # Try first row
                first_row = table.find("tr")
                if first_row:
                    headers = [
                        _sanitize_text(th.get_text(strip=True)) for th in first_row.find_all("th")
                    ]

            # If no headers found, use generic column names
            if not headers:
                first_row = table.find("tr")
                if first_row:
                    num_cols = len(first_row.find_all(["td", "th"]))
                    headers = [f"Column {i + 1}" for i in range(num_cols)]

            if not headers:
                continue

            # Extract rows
            rows = []
            tbody = table.find("tbody")
            row_elements = (
                tbody.find_all("tr") if tbody else table.find_all("tr")[1:]
            )  # Skip header row if no tbody

            for tr in row_elements:
                cells = tr.find_all(["td", "th"])
                if cells and len(cells) == len(headers):
                    row_dict = {}
                    for i, cell in enumerate(cells):
                        row_dict[headers[i]] = _sanitize_text(cell.get_text(strip=True))
                    rows.append(row_dict)

            if rows:
                tables.append(TableData(caption=caption, headers=headers, rows=rows))

        return tables

    def extract_lists(self, html: str, max_lists: int = 5) -> list[ListData]:
        """Extract HTML lists (ul, ol, dl).

        Args:
            html: Raw HTML content
            max_lists: Maximum number of lists to extract

        Returns:
            List of ListData objects with title and items
        """
        soup = BeautifulSoup(html, "html.parser")
        lists = []

        for list_elem in soup.find_all(["ul", "ol", "dl"])[:max_lists]:
            # Try to find a title (preceding heading)
            title = None
            prev = list_elem.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
            if prev:
                title = _sanitize_text(prev.get_text(strip=True))

            # Extract items
            items = []
            if list_elem.name in ["ul", "ol"]:
                for li in list_elem.find_all("li", recursive=False):
                    items.append(_sanitize_text(li.get_text(strip=True)))
            else:  # dl
                for dt in list_elem.find_all("dt"):
                    dd = dt.find_next_sibling("dd")
                    if dd:
                        items.append(
                            f"{_sanitize_text(dt.get_text(strip=True))}: {_sanitize_text(dd.get_text(strip=True))}"
                        )

            if items:
                lists.append(
                    ListData(
                        title=title,
                        items=items,
                        nested=False,  # TODO: detect nested lists
                    )
                )

        return lists

    def extract_fields(self, html: str, selectors: dict[str, str]) -> dict[str, str | list[str]]:
        """Extract specific fields using CSS selectors.

        Args:
            html: Raw HTML content
            selectors: Dict mapping field names to CSS selectors

        Returns:
            Dict with extracted field values (single string or list of strings)
        """
        soup = BeautifulSoup(html, "html.parser")
        data: dict[str, str | list[str]] = {}

        for field_name, selector in selectors.items():
            elements = soup.select(selector)
            if elements:
                if len(elements) == 1:
                    data[field_name] = _sanitize_text(elements[0].get_text(strip=True))
                else:
                    data[field_name] = [_sanitize_text(el.get_text(strip=True)) for el in elements]

        return data

    def extract_json_ld(self, html: str) -> list[dict[str, Any]]:
        """Extract JSON-LD structured data.

        Args:
            html: Raw HTML content

        Returns:
            List of JSON-LD objects found in the page
        """
        soup = BeautifulSoup(html, "html.parser")
        json_ld_scripts = soup.find_all("script", type="application/ld+json")
        data = []

        for script in json_ld_scripts:
            try:
                if script.string:
                    parsed = json.loads(script.string)
                    data.append(parsed)
            except json.JSONDecodeError:
                pass

        return data

    def auto_extract(self, html: str) -> dict[str, Any]:
        """Automatically detect and extract structured content.

        Args:
            html: Raw HTML content

        Returns:
            Dict containing all detected structured data (tables, lists, json_ld)
        """
        results: dict[str, Any] = {"tables": [], "lists": [], "json_ld": []}

        # Try JSON-LD first (highest quality)
        json_ld = self.extract_json_ld(html)
        if json_ld:
            results["json_ld"] = json_ld

        # Extract tables
        tables = self.extract_tables(html, max_tables=3)
        if tables:
            results["tables"] = [
                {"caption": t.caption, "headers": t.headers, "rows": t.rows} for t in tables
            ]

        # Extract lists
        lists = self.extract_lists(html, max_lists=3)
        if lists:
            results["lists"] = [{"title": li.title, "items": li.items} for li in lists]

        return results
```
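As a quick, self-contained check of the extractor (assuming the package is importable as searxng_mcp and that TableData exposes the caption, headers, and rows fields used above), one might run:

```python
# Minimal standalone sketch exercising DataExtractor.extract_tables on inline HTML.
from searxng_mcp.extractor import DataExtractor

html = """
<table>
  <caption>Releases</caption>
  <thead><tr><th>Version</th><th>Date</th></tr></thead>
  <tbody>
    <tr><td>1.0.0</td><td>2024-01-01</td></tr>
    <tr><td>1.1.0</td><td>2024-03-15</td></tr>
  </tbody>
</table>
"""

extractor = DataExtractor()
for table in extractor.extract_tables(html):
    print(table.caption)   # "Releases"
    print(table.headers)   # ["Version", "Date"]
    print(table.rows)      # [{"Version": "1.0.0", "Date": "2024-01-01"}, ...]
```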
- src/searxng_mcp/server.py:23-40 (registration): Import and global instantiation of the DataExtractor used by the extract_data handler, alongside the other shared clients registered on the FastMCP server.

```python
from .extractor import DataExtractor
from .github import GitHubClient, RepoInfo
from .images import PixabayClient
from .registry import PackageInfo, PackageRegistryClient
from .search import SearxSearcher
from .service_health import ServiceHealthChecker
from .tracking import get_tracker

mcp = FastMCP("web-research-assistant")

searcher = SearxSearcher()
crawler_client = CrawlerClient()
registry_client = PackageRegistryClient()
github_client = GitHubClient()
pixabay_client = PixabayClient()
error_parser = ErrorParser()
api_docs_detector = APIDocsDetector()
api_docs_extractor = APIDocsExtractor()
data_extractor = DataExtractor()
```