MCP News Scraper

Overview Schema Related Servers Score Discussions

mcp-news
news_sources

khaleej_times.py

khaleej_times.py•8.83 kB

import logging
from datetime import datetime
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Request settings shared by every HTTP call in this module, so that the
# article fetch behaves like the front-page fetch (browser-like User-Agent
# and a bounded wait instead of a potentially endless one).
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
_REQUEST_TIMEOUT = 10  # seconds


def get_timeline_events(soup: BeautifulSoup) -> List[Dict[str, Any]]:
    """Get key events timeline from Khaleej Times.

    Scans every ``div.card-box`` in *soup*. A card is kept only when it has a
    ``div.post-title-rows`` containing an ``h4`` with a link inside it.

    Args:
        soup: Parsed page to search.

    Returns:
        One dict per timeline event with the keys ``title``, ``timestamp``
        (``"<time> <date>"``, or ``""`` when either part is missing),
        ``event_id`` (the link's ``href`` with ``#`` stripped from both ends)
        and ``is_timeline`` (always ``True``).
    """
    timeline_events = []

    for card in soup.find_all('div', class_='card-box'):
        try:
            title_row = card.find('div', class_='post-title-rows')
            if not title_row:
                continue

            # The timestamp is only built when both the time and the date
            # spans are present; otherwise it stays empty.
            timestamp = ""
            time_stamp = title_row.find('div', class_='time-stmp')
            if time_stamp:
                time_elem = time_stamp.find('span', class_='tme-evnt')
                date_elem = time_stamp.find('span', class_='date-evnt')
                if time_elem and date_elem:
                    timestamp = f"{time_elem.text.strip()} {date_elem.text.strip()}"

            headline_elem = title_row.find('h4')
            headline_link = headline_elem.find('a') if headline_elem else None
            if not headline_link:
                continue

            timeline_events.append({
                'title': headline_link.text.strip(),
                'timestamp': timestamp,
                'event_id': headline_link.get('href', '').strip('#'),
                'is_timeline': True
            })
        except Exception as e:
            # Best effort: one malformed card must not drop the others.
            logger.error("Error parsing timeline event: %s", e)
            continue

    return timeline_events


def get_card_articles(soup: BeautifulSoup) -> List[Dict[str, Any]]:
    """Get articles formatted in card structure from Khaleej Times.

    Scans every ``li.rcnt-evntPost`` in *soup*. A card is kept only when it
    has a ``div.evnt-content`` containing an ``h2`` headline.

    Args:
        soup: Parsed page to search.

    Returns:
        One dict per card with the keys ``title``, ``content`` (non-empty
        paragraph texts of the first ``div`` inside the content block, joined
        with single spaces), ``timestamp`` (text of the card's
        ``span.tme-evnt``, or ``""``) and ``is_card`` (always ``True``).
    """
    card_articles = []

    for card in soup.find_all('li', class_='rcnt-evntPost'):
        try:
            content_div = card.find('div', class_='evnt-content')
            if not content_div:
                continue

            headline_elem = content_div.find('h2')
            if not headline_elem:
                continue

            content = ""
            content_elem = content_div.find('div')
            if content_elem:
                texts = (p.text.strip() for p in content_elem.find_all('p'))
                content = " ".join(text for text in texts if text)

            time_elem = card.find('span', class_='tme-evnt')
            timestamp = time_elem.text.strip() if time_elem else ""

            card_articles.append({
                'title': headline_elem.text.strip(),
                'content': content,
                'timestamp': timestamp,
                'is_card': True
            })
        except Exception as e:
            # Best effort: one malformed card must not drop the others.
            logger.error("Error parsing card article: %s", e)
            continue

    return card_articles


def _get_main_headline(soup: BeautifulSoup) -> Optional[Dict[str, Any]]:
    """Return the main top-news headline dict, or None when it is unusable.

    The subtitle is attached to the headline dict itself rather than to
    ``headlines[0]``, so a teaser with a missing title or URL can never
    trigger an index error on an empty result list.
    """
    main_top_news = soup.find('div', class_='main-top-teaser-content')
    if not main_top_news:
        return None

    main_heading = main_top_news.find('h1')
    headline_link = main_heading.find('a') if main_heading else None
    if not headline_link:
        return None

    headline = {
        'title': headline_link.get('title', '').strip(),
        'url': headline_link.get('href', '').strip(),
        'is_main': True
    }
    if not (headline['title'] and headline['url']):
        return None

    # Subtitle/description is optional: first <p> of the teaser, link text.
    subtitle = main_top_news.find('p')
    subtitle_link = subtitle.find('a') if subtitle else None
    if subtitle_link:
        subtitle_text = subtitle_link.get_text().strip()
        if subtitle_text:
            headline['subtitle'] = subtitle_text

    return headline


def get_headlines() -> List[Dict[str, Any]]:
    """Get all main headlines from Khaleej Times.

    Includes the main top news, the regular article containers (excluding
    those inside the popular section), card articles and timeline events,
    in that order.

    Returns:
        A list of dicts. Main/regular headlines carry ``title``, ``url`` and
        ``is_main`` (plus ``subtitle`` on the main one when available); card
        and timeline entries have the shapes produced by
        ``get_card_articles`` and ``get_timeline_events``. Returns an empty
        list when the page cannot be fetched or parsed (the error is logged).
    """
    try:
        url = "https://www.khaleejtimes.com/"
        response = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        headlines = []

        # Main top news goes first so that get_headline() can pick it up.
        main_headline = _get_main_headline(soup)
        if main_headline:
            headlines.append(main_headline)

        for container in soup.find_all('div', class_='rendered_board_article'):
            # Skip if it's in the popular section
            if container.find_parent('div', class_='most-popuplar-ongoing-viral-outer'):
                continue

            headline_link = container.find('a', title=True)
            if not headline_link:
                continue

            headline = {
                'title': headline_link.get('title', '').strip(),
                'url': headline_link.get('href', '').strip(),
                'is_main': False
            }
            if headline['title'] and headline['url']:
                headlines.append(headline)

        headlines.extend(get_card_articles(soup))
        headlines.extend(get_timeline_events(soup))

        return headlines

    except Exception as e:
        # Top-level boundary: callers expect a list, never an exception.
        logger.error("Error fetching headlines from Khaleej Times: %s", e)
        return []


def get_headline() -> str:
    """Get the first main headline from Khaleej Times (for backward compatibility).

    Returns:
        The title of the first entry returned by ``get_headlines``, followed
        by ``" - <subtitle>"`` when that entry has a subtitle, or
        ``"No headlines found"`` when there are no entries.
    """
    headlines = get_headlines()
    if not headlines:
        return "No headlines found"

    main_headline = headlines[0]
    if main_headline.get('subtitle'):
        return f"{main_headline['title']} - {main_headline['subtitle']}"
    return main_headline['title']


def get_article_content(url: str) -> Dict[str, Any]:
    """
    Fetch and parse the full article content from a given Khaleej Times URL.

    Args:
        url (str): The URL of the article to fetch

    Returns:
        dict: A dictionary containing:
            - title: Article title ("No title found" when the heading is
              absent; None on error)
            - content: List of non-empty paragraph texts
            - author: Author name, or None when not found
            - date: Publication date text, or None when not found
            - error: Error message if any, otherwise None
    """
    try:
        # Same headers and timeout as the front-page fetch: without a
        # timeout, requests may wait on an unresponsive server indefinitely.
        response = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        title_elem = soup.find('h1', class_='article-title')
        title = title_elem.text.strip() if title_elem else "No title found"

        # Keep only paragraphs that contain something besides whitespace.
        paragraphs = []
        content_div = soup.find('div', class_='article-center-wrap-nf')
        if content_div:
            for p in content_div.find_all('p'):
                text = p.text.strip()
                if text:
                    paragraphs.append(text)

        author = None
        author_div = soup.find('div', class_='details')
        if author_div:
            author_name = author_div.find('h4')
            if author_name:
                author = author_name.text.strip()

        date_element = soup.find('time')
        date = date_element.text.strip() if date_element else None

        return {
            'title': title,
            'content': paragraphs,
            'author': author,
            'date': date,
            'error': None
        }

    except Exception as e:
        # Errors are reported through the 'error' field instead of raising.
        logger.error("Error fetching article content: %s", e)
        return {
            'title': None,
            'content': [],
            'author': None,
            'date': None,
            'error': str(e)
        }

Latest Blog Posts

OpenTelemetry for Model Context Protocol (MCP) Analytics and Agent Observability
By Om-Shree-0709 on .
observability
mcp
opentelemetry
Securing Enterprise AI Agents with Unique Identities in the Model Context Protocol (MCP)
By Om-Shree-0709 on .
When Your Year of Work Gets Copied Overnight: What Actually Matters?
By punkpeye on .
startups

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sdk03/mcp-news'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.