Memex

import asyncio
import os
from pathlib import Path

from firecrawl import FirecrawlApp

# Cache files are always read and written as UTF-8 so that scraped pages with
# non-ASCII text round-trip identically regardless of the platform locale.
_CACHE_ENCODING = "utf-8"


async def scrape_url(cache_dir: Path, url: str) -> str:
    """Scrape *url* through the FireCrawl API and cache the markdown on disk.

    Args:
        cache_dir: Root directory of the on-disk page cache.
        url: Address of the page to scrape.

    Returns:
        The value stored under the ``'markdown'`` key of the scrape response.

    Raises:
        Exception: If the ``FIRECRAWL_API_KEY`` environment variable is unset
            or empty.
        ValueError: If *url* would map to a file outside *cache_dir*.
    """
    firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
    if not firecrawl_api_key:
        raise Exception("FIRECRAWL_API_KEY environment variable is required")

    # Resolve (and validate) the cache location before spending an API call.
    output_path = get_url_cache_path(cache_dir, url)

    app = FirecrawlApp(api_key=firecrawl_api_key)
    # The client call is synchronous; run it in a worker thread so the event
    # loop is not blocked for the duration of the HTTP request.
    response = await asyncio.to_thread(app.scrape_url, url)
    content = response["markdown"]

    # Cache the response. The write stays on the event-loop thread so other
    # coroutines never observe a partially written cache file.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(content, encoding=_CACHE_ENCODING)

    return content


def get_url_cache_path(cache_dir: Path, url: str) -> Path:
    """Return the cache file path for *url*.

    The scheme separator ``://`` is replaced with ``/`` and ``.md`` is
    appended, so ``https://example.com/page`` maps to
    ``<cache_dir>/https/example.com/page.md``.

    Raises:
        ValueError: If the resulting path lies outside *cache_dir* (for
            example an absolute-looking URL or one with enough ``..``
            segments to climb out of the cache).
    """
    candidate = cache_dir / f"{url.replace('://', '/')}.md"

    # Purely lexical containment check: abspath normalises ".." segments
    # without touching the filesystem or following symlinks.
    cache_root = Path(os.path.abspath(cache_dir))
    if not Path(os.path.abspath(candidate)).is_relative_to(cache_root):
        raise ValueError(f"URL maps outside the cache directory: {url!r}")

    # Return the un-normalised path so ordinary URLs map exactly as before.
    return candidate


def fetch_cached_page(cache_dir: Path, url: str) -> str | None:
    """Return the cached content for *url*, or ``None`` if it is not cached.

    A cache file that cannot be decoded as UTF-8 (for example one written
    earlier under a different locale encoding) is treated as a miss.

    Raises:
        ValueError: If *url* would map to a file outside *cache_dir*.
    """
    path = get_url_cache_path(cache_dir, url)
    if not path.exists():
        return None
    try:
        return path.read_text(encoding=_CACHE_ENCODING)
    except UnicodeDecodeError:
        # Unreadable legacy entry: report a miss so the caller can refresh it.
        return None


async def fetch_page(cache_dir: Path, url: str) -> str:
    """Return the content for *url*, scraping it only on a cache miss.

    Raises:
        Exception: If a scrape is needed and ``FIRECRAWL_API_KEY`` is unset.
        ValueError: If *url* would map to a file outside *cache_dir*.
    """
    # Truthiness check on purpose: an empty cached file counts as a miss and
    # triggers a fresh scrape, exactly like a missing file.
    content = fetch_cached_page(cache_dir, url)
    if content:
        return content

    return await scrape_url(cache_dir, url)