import requests
from bs4 import BeautifulSoup
from typing import Optional
import re


def scrape_webpage_content(url: str, max_length: int = 3000) -> Optional[str]:
    """Scrape and extract meaningful content from a webpage."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Fetch webpage (timeout prevents hanging on unresponsive servers)
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Parse HTML
        soup = BeautifulSoup(response.content, 'lxml')

        # Remove non-content elements ('aside' covers most sidebars;
        # 'sidebar' is not a standard HTML tag, so it is not listed here)
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            element.decompose()
        # Extract main content using common selectors
        content_elements = []

        # Try to find main content areas
        main_content = (
            soup.find('main')
            or soup.find('article')
            or soup.find('div', class_='content')
            or soup.find('div', class_='post-content')
        )
        if main_content:
            # Get headings and paragraphs from the main content area
            content_elements.extend(
                main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            )
        else:
            # Fall back to all paragraphs in the document only when no main
            # content area was found, to avoid duplicating the same text
            content_elements.extend(soup.find_all('p'))
        # Extract and clean text
        content_text = []
        for element in content_elements[:50]:  # Limit to first 50 relevant elements
            text = element.get_text().strip()
            if len(text) > 50:  # Skip very short fragments
                # Collapse runs of whitespace into single spaces
                text = re.sub(r'\s+', ' ', text)
                content_text.append(text)

        # Join and truncate to max_length
        full_content = ' '.join(content_text)
        if len(full_content) > max_length:
            full_content = full_content[:max_length] + '...'

        return full_content.strip() if full_content else None
    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error scraping {url}: {e}")
        return None