@mcp.tool()
def scrape_websites(
websites: Dict[str, str],
formats: List[str] = ['markdown', 'html'],
api_key: Optional[str] = None
) -> List[str]:
    """
    Scrape multiple websites using Firecrawl and store their content.

    Each scraped format is written to ``<provider>_<fmt>.txt`` under
    ``SCRAPE_DIR``, and a ``scraped_metadata.json`` summary file in the same
    directory is created or updated with one entry per provider (both
    successes and failures). Re-running overwrites a provider's previous
    entry and content files.

    Args:
        websites: Dictionary of provider_name -> URL mappings
        formats: List of formats to scrape ['markdown', 'html'] (default: both)
        api_key: Firecrawl API key (if None, expects environment variable)

    Returns:
        List of provider names for successfully scraped websites

    Raises:
        ValueError: If no API key is given and FIRECRAWL_API_KEY is unset.
    """
    # Defensive copy: the default list is a shared mutable default, and a
    # reference to it is stored in every metadata entry below — copying once
    # here removes the cross-call aliasing hazard without changing the
    # public signature.
    formats = list(formats)

    if api_key is None:
        api_key = os.getenv('FIRECRAWL_API_KEY')
    if not api_key:
        raise ValueError("API key must be provided or set as FIRECRAWL_API_KEY environment variable")

    app = FirecrawlApp(api_key=api_key)
    path = SCRAPE_DIR  # os.path.join() with a single argument was a no-op
    os.makedirs(path, exist_ok=True)

    # Load any existing summary so entries from earlier runs are preserved;
    # providers scraped again in this run simply overwrite their entries.
    metadata_file = os.path.join(path, "scraped_metadata.json")
    existing_metadata = {}
    if os.path.exists(metadata_file):
        try:
            with open(metadata_file, "r") as f:
                existing_metadata = json.load(f)
        except json.JSONDecodeError:
            logger.warning(f"Could not decode {metadata_file}, starting fresh.")

    scraped_providers = []
    for provider, url in websites.items():
        logger.info(f"Scraping {provider} at {url}")
        try:
            scrape_result = app.scrape(url, formats=formats)
            timestamp = datetime.now().isoformat()
            domain = urlparse(url).netloc

            # Write each requested format that came back non-empty.
            # NOTE(review): provider names are used verbatim in filenames —
            # assumes they contain no path separators; confirm with callers.
            content_files = {}
            for fmt in formats:
                content = getattr(scrape_result, fmt, "")
                if content:
                    filename = f"{provider}_{fmt}.txt"
                    file_path = os.path.join(path, filename)
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(content)
                    content_files[fmt] = filename

            # Fall back to placeholders when metadata is absent OR when the
            # attribute exists but is None/empty — a plain getattr default
            # only covers the missing-attribute case.
            title = "Unknown Title"
            description = "No description"
            if hasattr(scrape_result, 'metadata'):
                title = getattr(scrape_result.metadata, 'title', None) or "Unknown Title"
                description = getattr(scrape_result.metadata, 'description', None) or "No description"

            existing_metadata[provider] = {
                "provider_name": provider,
                "url": url,
                "domain": domain,
                "scraped_at": timestamp,
                "formats": formats,
                # NOTE(review): stored as the string "true" (not a JSON
                # boolean) — kept for compatibility with existing consumers
                # of this file; consider migrating to a real boolean.
                "success": "true",
                "content_files": content_files,
                "title": title,
                "description": description
            }
            scraped_providers.append(provider)
            logger.info(f"Successfully scraped {provider}")
        except Exception as e:
            # Broad catch is deliberate at this batch boundary: one bad site
            # must not abort the rest; the failure is recorded in the summary.
            logger.error(f"Failed to scrape {provider}: {e}")
            existing_metadata[provider] = {
                "provider_name": provider,
                "url": url,
                "scraped_at": datetime.now().isoformat(),
                "success": "false",
                "error": str(e)
            }

    # Persist the merged summary (old entries + this run's results).
    with open(metadata_file, "w", encoding="utf-8") as f:
        json.dump(existing_metadata, f, indent=4)
    return scraped_providers