from urllib.parse import urlparse


def discover_rss_feeds(domain_url: str) -> list[str]:
    """Try to discover RSS feeds for the given domain URL.

    Returns a list of candidate feed URLs, ordered by likelihood of success.
    """
    parsed_url = urlparse(domain_url)
    # A scheme-less input like "example.com" parses into .path, not .netloc
    domain_root = parsed_url.netloc or parsed_url.path.split("/")[0]
    # Strip only a leading "www."; str.replace would also mangle names that
    # merely contain "www." elsewhere
    base_domain = domain_root.removeprefix("www.")
# Domain-specific known feeds with higher priority
domain_specific_feeds = {
# NOTE: Forbes does not provide a public RSS feed for Innovation as of
# 2025-06-08.
# VALID: RSS XML (Playwright, checked 2025-06-08)
'fortune.com': ['https://fortune.com/feed/'],
# VALID: RSS XML (Playwright, checked 2025-06-06)
'techcrunch.com': ['https://techcrunch.com/feed/'],
# VALID: RSS XML (Playwright, checked 2025-06-08)
'wired.com': ['https://www.wired.com/feed/rss'],
# VALID: RSS XML (Playwright, checked 2025-06-08)
'engadget.com': ['https://www.engadget.com/rss.xml'],
# VALID: RSS XML (Playwright, checked 2025-06-08)
'medium.com': ['https://medium.com/feed/', 'https://medium.com/feed/tag/technology'],
# VALID: RSS XML (Playwright, checked 2025-06-08)
'dev.to': ['https://dev.to/feed/', 'https://dev.to/feed/top'],
# VALID: RSS XML (Playwright, checked 2025-06-08)
'tomsguide.com': ['https://www.tomsguide.com/feeds/news.xml', 'https://www.tomsguide.com/feeds/all-news.xml'],
# VALID: RSS XML (Playwright, checked 2025-06-08)
'xda-developers.com': ['https://www.xda-developers.com/feed/'],
# VALID: RSS XML (Playwright, checked 2025-06-08)
'dmnews.com': ['https://www.dmnews.com/feed/'],
}
# Common RSS feed paths to check if no domain-specific feed matches
common_paths = [
"/feed",
"/rss",
"/feed/",
"/rss/",
"/feed.xml",
"/rss.xml",
"/atom.xml",
"/feeds/posts/default",
"/index.xml",
"/index.rss",
"/rss/index.rss",
"/feed/rss",
"/blog/feed",
"/blog/rss",
"/news/feed",
"/articles/feed",
]
    # Check curated domain-specific feeds first
    discovered_feeds = []
    for domain, feeds in domain_specific_feeds.items():
        # Match the domain exactly or as the parent of a subdomain; a plain
        # substring test would also match unrelated domains (e.g. "dev.to"
        # inside "dev.tours.com")
        if base_domain == domain or base_domain.endswith("." + domain):
            discovered_feeds.extend(feeds)
            break  # Only use the first matching domain's feeds
    # If no domain-specific feeds matched, fall back to the common paths
    if not discovered_feeds:
        scheme = parsed_url.scheme or "https"  # default when input had no scheme
        for path in common_paths:
            discovered_feeds.append(f"{scheme}://{domain_root}{path}")
return discovered_feeds
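
# For a domain without a curated entry, the fallback candidates come from
# common_paths in order; e.g. discover_rss_feeds("https://example.org")
# yields "https://example.org/feed", "https://example.org/rss", and so on.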


def verify_rss_feed(feed_data) -> bool:
    """Verify that the parsed feed data is a valid RSS feed."""
# Check for essential RSS feed elements
if not hasattr(feed_data, 'feed') or not hasattr(feed_data, 'entries'):
return False
# Check if the feed has a title (most valid feeds do)
if not hasattr(feed_data.feed, 'title'):
return False
# Check if there are entries
if not feed_data.entries:
return False
# Verify at least one entry has a link
for entry in feed_data.entries:
if hasattr(entry, 'link') and entry.link:
return True
return False


def extract_article_from_feed(feed_data):
    """Extract a valid article link from parsed feed data, or return None."""
    if not getattr(feed_data, 'entries', None):
        return None
    # Prefer an entry that has a title, a valid link, and a summary or content
    for entry in feed_data.entries:
        link = getattr(entry, 'link', None)
        if not link or urlparse(link).scheme not in ('http', 'https'):
            continue
        if hasattr(entry, 'title') and (
                hasattr(entry, 'summary') or hasattr(entry, 'content')):
            return link
    # Fall back to any entry with a valid http(s) link
    for entry in feed_data.entries:
        link = getattr(entry, 'link', None)
        if link and urlparse(link).scheme in ('http', 'https'):
            return link
return None
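

# Example usage (a minimal sketch): the two helpers above expect the
# FeedParserDict structure returned by feedparser.parse(), so this demo
# assumes the third-party `feedparser` package is installed
# (pip install feedparser); it is not part of the standard library.
if __name__ == "__main__":
    import feedparser

    # Walk the candidates in priority order and stop at the first valid feed
    for feed_url in discover_rss_feeds("https://techcrunch.com"):
        parsed = feedparser.parse(feed_url)
        if verify_rss_feed(parsed):
            print(f"Valid feed: {feed_url}")
            print(f"Sample article: {extract_article_from_feed(parsed)}")
            break
    else:
        print("No valid feed discovered.")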