Skip to main content
Glama

MCP Securities Analysis

by druce
scrape.py (6.87 kB)
""" Scrape text from a given URL using Playwright and Trafilatura. """ # pylint: disable=broad-except import random import time import re import os from pathlib import Path from bs4 import BeautifulSoup from trafilatura import extract from dotenv import load_dotenv load_dotenv() FIREFOX_PROFILE_PATH = os.getenv("FIREFOX_PROFILE_PATH") async def get_browser(playwright): """ Initializes a Playwright browser instance with stealth settings. Args: playwright: The Playwright instance. Returns: Browser: The initialized browser instance. """ viewport = random.choice([ {"width": 1920, "height": 1080}, {"width": 1366, "height": 768}, {"width": 1440, "height": 900}, {"width": 1536, "height": 864}, {"width": 1280, "height": 720} ]) # random device-scale-factor for additional randomization device_scale_factor = random.choice([1, 1.25, 1.5, 1.75, 2]) # Random color scheme and timezone color_scheme = random.choice(['light', 'dark', 'no-preference']) timezone_id = random.choice([ 'America/New_York', 'Europe/London', 'Europe/Paris', 'Asia/Tokyo', 'Australia/Sydney', 'America/Los_Angeles' ]) locale = random.choice([ 'en-US', 'en-GB' ]) extra_http_headers = { "Accept-Language": f"{locale.split('-')[0]},{locale};q=0.9", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", "Sec-Fetch-Dest": "document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "none", "Sec-Fetch-User": "?1", "DNT": "1" if random.choice([True, False]) else "0" } return await playwright.firefox.launch_persistent_context( user_data_dir=FIREFOX_PROFILE_PATH, headless=False, # run headless, hide splash window viewport=viewport, device_scale_factor=device_scale_factor, timezone_id=timezone_id, color_scheme=color_scheme, extra_http_headers=extra_http_headers, # removes Playwright's default flag ignore_default_args=["--enable-automation"], args=[ "--disable-gpu", "--no-sandbox", 
"--disable-dev-shm-usage" ], # provide a valid realistic User-Agent string for the latest Firefox on Apple Silicon user_agent="Mozilla/5.0 (Macintosh; ARM Mac OS X 14.4; rv:125.0) Gecko/20100101 Firefox/125.0", accept_downloads=True, ) def perform_human_like_actions(page): """Perform random human-like actions on a page to mimic real user behavior.""" # Random mouse movements for _ in range(random.randint(3, 8)): # Move mouse with multiple steps to simulate human-like movement x = random.randint(100, 1200) y = random.randint(100, 700) steps = random.randint(5, 10) # Calculate increments for smooth movement for step in range(1, steps + 1): next_x = x * step / steps next_y = y * step / steps # Add slight randomness to path jitter_x = random.uniform(-5, 5) jitter_y = random.uniform(-5, 5) page.mouse.move(next_x + jitter_x, next_y + jitter_y) time.sleep(random.uniform(0.01, 0.05)) # Random scrolling behavior scroll_amount = random.randint(300, 700) page.evaluate(f"window.scrollBy(0, {scroll_amount})") time.sleep(random.uniform(0.5, 2)) # Sometimes scroll back up a bit if random.random() > 0.7: page.evaluate(f"window.scrollBy(0, -{random.randint(100, 300)})") time.sleep(random.uniform(0.3, 1)) return page def normalize_html(path: Path | str) -> str: """ Clean and extract text content from an HTML file, including titles and social media metadata. Args: path (Path | str): Path to the HTML file to process Returns: - str: Extracted and cleaned text content, or empty string if processing fails The function extracts: - Page title from <title> tag - Social media titles from OpenGraph and Twitter meta tags - Social media descriptions from OpenGraph and Twitter meta tags - Main content using trafilatura library All extracted content is concatenated and truncated to MAX_INPUT_TOKENS length. 
""" try: with open(path, 'r', encoding='utf-8') as file: html_content = file.read() except Exception as exc: print(f"Error: {str(exc)}") print(f"Skipping {path}") return "" # Parse the HTML content using trafilatura soup = BeautifulSoup(html_content, 'html.parser') try: # Try to get the title from the <title> tag title_tag = soup.find("title") title_str = "Page title: " + title_tag.string.strip() + \ "\n" if title_tag and title_tag.string else "" except Exception as exc: title_str = "" print(str(exc), "clean_html page_title") try: # Try to get the title from the Open Graph meta tag og_title_tag = soup.find("meta", property="og:title") if not og_title_tag: og_title_tag = soup.find( "meta", attrs={"name": "twitter:title"}) og_title = og_title_tag["content"].strip( ) + "\n" if og_title_tag and og_title_tag.get("content") else "" og_title = "Social card title: " + og_title if og_title else "" except Exception as exc: og_title = "" print(str(exc), "clean_html og_title") try: # get summary from social media cards og_desc_tag = soup.find("meta", property="og:description") if not og_desc_tag: # Extract the Twitter description og_desc_tag = soup.find( "meta", attrs={"name": "twitter:description"}) og_desc = og_desc_tag.get("content").strip() + \ "\n" if og_desc_tag else "" og_desc = 'Social card description: ' + og_desc if og_desc else "" except Exception as exc: og_desc = "" print(str(exc), "clean_html og_desc") # Get text and strip leading/trailing whitespace print(title_str + og_title + og_desc, "clean_html") try: plaintext = extract(html_content) plaintext = plaintext.strip() if plaintext else "" except Exception as exc: plaintext = html_content print(str(exc), "clean_html trafilatura") # remove special tokens, have found in artiles about tokenization # All OpenAI special tokens follow the pattern <|something|> special_token_re = re.compile(r"<\|\w+\|>") plaintext = special_token_re.sub("", plaintext) visible_text = title_str + og_title + og_desc + plaintext return 
visible_text

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/druce/MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.