"""
Text link handler - Extract and summarize text content from web pages
"""
import asyncio
import logging
import re

import aiohttp
from bs4 import BeautifulSoup

from src.utils.llm_summarizer import summarize_with_llama3
logger = logging.getLogger(__name__)
async def extract_text_content(url: str) -> str:
    """
    Extract the readable text content from a web page.

    Args:
        url: URL to extract content from

    Returns:
        str: Extracted text with runs of whitespace collapsed to single spaces

    Raises:
        Exception: with message "HTTP 401: Unauthorized - Authentication required"
            for unauthorized responses (PlayMCP policy requirement),
            "HTTP <status>: Failed to fetch URL" for other non-200 responses,
            "Network error: ..." for connection/timeout failures, or
            "Error extracting text: ..." for HTML parsing failures.
    """
    try:
        async with aiohttp.ClientSession() as session:
            # 30s total timeout covers connect + read for slow hosts.
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                # Surface 401 distinctly (PlayMCP policy requirement). Raised
                # outside any except-Exception so the message reaches callers
                # unwrapped (the original double-wrapped it in
                # "Error extracting text: ...").
                if response.status == 401:
                    raise Exception("HTTP 401: Unauthorized - Authentication required")
                if response.status != 200:
                    raise Exception(f"HTTP {response.status}: Failed to fetch URL")
                html = await response.text()
    except aiohttp.ClientError as e:
        raise Exception(f"Network error: {str(e)}") from e
    except asyncio.TimeoutError as e:
        # aiohttp timeouts raise asyncio.TimeoutError, not ClientError,
        # so they need their own branch to be reported as network errors.
        raise Exception("Network error: request timed out") from e

    try:
        soup = BeautifulSoup(html, 'html.parser')
        # Remove script and style elements before extracting text.
        for element in soup(["script", "style"]):
            element.decompose()
        # Prefer semantic main-content containers; fall back to <body>,
        # then to the whole document for pages with no <body> tag.
        main_content = (
            soup.find('main') or
            soup.find('article') or
            soup.find('div', class_=re.compile('content|post|article', re.I)) or
            soup.find('body')
        )
        text = (main_content or soup).get_text(separator=' ', strip=True)
        # Collapse all whitespace (newlines, tabs, repeats) to single spaces.
        return ' '.join(text.split())
    except Exception as e:
        raise Exception(f"Error extracting text: {str(e)}") from e
async def summarize_text_link(url: str) -> str:
    """
    Fetch a URL and produce an LLM-generated summary of its text content.

    Args:
        url: URL to summarize

    Returns:
        str: Summary text, or a fixed fallback message when the page
            yields no usable text.
    """
    extracted = await extract_text_content(url)

    # Guard: nothing to summarize (empty or whitespace-only extraction).
    if not (extracted and extracted.strip()):
        return "No text content available from the URL."

    # Delegate to Llama3 with text-specific prompting.
    logger.info("Summarizing text content with Llama3...")
    return await summarize_with_llama3(extracted, content_type="text")