Substack Reader

from typing import Any, Optional
import json
import re
from pathlib import Path

import requests
from mcp.server.fastmcp import FastMCP

# Initialize FastMCP server for Trade Companion by Adam Mancini
mcp = FastMCP("trade_companion_reader")

# Path to stored cookies
COOKIES_FILE = Path("substack_cookies.json")

# Trade Companion Substack URL
TRADE_COMPANION_URL = "https://tradecompanion.substack.com"


def get_cookies_dict() -> dict:
    """Load cookies from file and convert to requests format."""
    if not COOKIES_FILE.exists():
        return {}
    # Load cookies from file
    cookies_data = json.loads(COOKIES_FILE.read_text())
    # Convert cookies to requests format (name: value dict)
    return {cookie['name']: cookie['value'] for cookie in cookies_data}


def get_headers() -> dict:
    """Return headers that mimic a browser request."""
    return {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://substack.com/',
        'sec-ch-ua': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'cache-control': 'max-age=0',
        'priority': 'u=0, i',
    }


def clean_html_text(html_text: str) -> str:
    """Remove HTML tags, decode common entities, and normalize whitespace."""
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', html_text)
    # Replace common HTML entities
    text = re.sub(r'&nbsp;', ' ', text)
    text = re.sub(r'&amp;', '&', text)
    text = re.sub(r'&lt;', '<', text)
    text = re.sub(r'&gt;', '>', text)
    text = re.sub(r'&quot;', '"', text)
    text = re.sub(r'&#39;', "'", text)
    # Collapse runs of whitespace and newlines
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    return text.strip()


def fetch_substack_article_text(url: str) -> Optional[dict[str, Any]]:
    """
    Fetch a Trade Companion article by Adam Mancini and extract plain text content.

    Returns the article title, author, date, and plain text content.
    """
    cookies = get_cookies_dict()
    headers = get_headers()
    try:
        # Make the request (timeout so a hung request can't stall the server)
        response = requests.get(url, cookies=cookies, headers=headers, timeout=30)
        response.raise_for_status()

        # Extract title
        title_match = re.search(r'<h1[^>]*?>(.*?)</h1>', response.text, re.DOTALL)
        title = clean_html_text(title_match.group(1)) if title_match else "Unknown Title"

        # Set author to Adam Mancini
        author = "Adam Mancini"

        # Extract date
        date_pattern = r'<time[^>]*?datetime="([^"]+)"'
        date_match = re.search(date_pattern, response.text)
        date = date_match.group(1) if date_match else ""

        # Extract article content: first, try to find the article body container
        content_pattern = r'<div[^>]*?class="[^"]*?body[^"]*?"[^>]*?>(.*?)</div>\s*<(footer|div\s+class="[^"]*?comments)'
        content_match = re.search(content_pattern, response.text, re.DOTALL)
        if not content_match:
            # Fall back to the <article> element
            content_pattern = r'<article[^>]*?>(.*?)</article>'
            content_match = re.search(content_pattern, response.text, re.DOTALL)

        if content_match:
            html_content = content_match.group(1)

            # Remove scripts, styles, and other non-content elements
            html_content = re.sub(r'<script[^>]*?>.*?</script>', '', html_content, flags=re.DOTALL)
            html_content = re.sub(r'<style[^>]*?>.*?</style>', '', html_content, flags=re.DOTALL)
            html_content = re.sub(r'<svg[^>]*?>.*?</svg>', '', html_content, flags=re.DOTALL)
            html_content = re.sub(r'<figure[^>]*?>.*?</figure>', '', html_content, flags=re.DOTALL)

            # Extract text from headings, paragraphs, and list items
            text_blocks = []

            # Get headings (h2 through h6), rendered as Markdown-style headers
            for i in range(2, 7):
                headings = re.findall(f'<h{i}[^>]*?>(.*?)</h{i}>', html_content, re.DOTALL)
                for h in headings:
                    text_blocks.append(f"{'#' * (i - 1)} {clean_html_text(h)}")

            # Get paragraphs
            paragraphs = re.findall(r'<p[^>]*?>(.*?)</p>', html_content, re.DOTALL)
            for p in paragraphs:
                cleaned_p = clean_html_text(p)
                if cleaned_p:  # Only add non-empty paragraphs
                    text_blocks.append(cleaned_p)

            # Get list items
            list_items = re.findall(r'<li[^>]*?>(.*?)</li>', html_content, re.DOTALL)
            for li in list_items:
                cleaned_li = clean_html_text(li)
                if cleaned_li:  # Only add non-empty list items
                    text_blocks.append(f"• {cleaned_li}")

            # Combine all text blocks with double newlines
            text_content = "\n\n".join(text_blocks)
            # Final cleanup: remove excessive newlines
            text_content = re.sub(r'\n{3,}', '\n\n', text_content)
        else:
            text_content = "Could not extract article content."

        return {
            "title": title,
            "author": author,
            "date": date,
            "content": text_content,
        }
    except Exception:
        return None


def fetch_trade_companion_articles() -> list[dict[str, Any]]:
    """
    Fetch a list of articles from Trade Companion by Adam Mancini.

    Returns a list of articles with title, url, date, and preview.
    Excludes the "My Trade Methodology Fundamentals" article.
    """
    cookies = get_cookies_dict()
    headers = get_headers()
    try:
        # Make the request to the publication homepage
        response = requests.get(TRADE_COMPANION_URL, cookies=cookies, headers=headers, timeout=30)
        response.raise_for_status()

        # Find all post URLs of the form "https://tradecompanion.substack.com/p/<slug>"
        url_pattern = r'https://tradecompanion\.substack\.com/p/[^/\s"\']+(?=[\s"\'])'
        url_matches = re.findall(url_pattern, response.text)

        # Excluded article slug
        excluded_slug = "my-trade-methodology-fundamentals"

        article_urls = []
        for url in url_matches:
            # Only keep URLs on the publication domain that aren't the excluded article
            if url.startswith(TRADE_COMPANION_URL.rstrip('/')) and '/p/' in url:
                if excluded_slug not in url:
                    article_urls.append(url)

        # Remove duplicates while preserving order
        unique_urls = []
        for url in article_urls:
            if url not in unique_urls:
                unique_urls.append(url)

        # Create a basic article entry for each URL, deriving the title from the slug
        articles = []
        for url in unique_urls:
            slug = url.split('/')[-1]
            title = slug.replace('-', ' ').title()
            articles.append({
                "title": title,
                "url": url,
                "date": "",  # Getting the date would require fetching the article itself
                "preview": "",
            })
        return articles
    except Exception:
        return []


@mcp.tool()
def get_latest_trade_companion_adam_mancini_article() -> str:
    """
    Fetch and return the content of the latest Trade Companion article by Adam Mancini.
    Excludes the "My Trade Methodology Fundamentals" article.
    """
    articles = fetch_trade_companion_articles()
    if not articles:
        return "Failed to fetch articles from Trade Companion. The service might be temporarily unavailable."

    # Get the latest article (first in the list)
    latest_article_url = articles[0]["url"]
    article_data = fetch_substack_article_text(latest_article_url)
    if not article_data:
        return "Failed to fetch the latest article. The article might not be accessible."

    # Format the article data
    formatted_article = f"""
Title: {article_data['title']}
Author: {article_data['author']}
Published: {article_data['date']}
URL: {latest_article_url}

{article_data['content']}
"""
    return formatted_article


if __name__ == "__main__":
    # Initialize and run the server for Trade Companion by Adam Mancini
    mcp.run(transport='stdio')