Skip to main content
Glama

Roam Research MCP Server

content_parsers.py4.66 kB
"""External content parsing operations for the Roam MCP server.""" import os import tempfile import logging from typing import Dict, Any, Optional import httpx import trafilatura from unstructured.partition.pdf import partition_pdf # Set up logging logger = logging.getLogger("roam-mcp.content_parsers") async def parse_webpage(url: str) -> Dict[str, Any]: """ Parse content from a web page URL. Args: url: URL of the webpage to parse Returns: Result with parsed content """ try: logger.debug(f"Fetching web page content from: {url}") downloaded = trafilatura.fetch_url(url) if not downloaded: return { "success": False, "error": f"Failed to download content from {url}" } # Extract main content with document structure preserved content = trafilatura.extract( downloaded, output_format='text', include_links=False, include_formatting=True ) if not content: return { "success": False, "error": f"Failed to extract meaningful content from {url}" } # Get metadata metadata = trafilatura.extract_metadata(downloaded) title = metadata.get('title', 'Untitled Page') return { "success": True, "content": content, "title": title, "url": url } except Exception as e: logger.error(f"Error parsing web page: {str(e)}") return { "success": False, "error": f"Error parsing web page: {str(e)}" } async def parse_pdf(url: str) -> Dict[str, Any]: """ Parse content from a PDF URL. 
Args: url: URL of the PDF to parse Returns: Result with parsed content """ try: logger.debug(f"Fetching PDF content from: {url}") # Download the PDF to a temporary file async with httpx.AsyncClient() as client: response = await client.get(url, follow_redirects=True) response.raise_for_status() # Check if it's a PDF based on Content-Type content_type = response.headers.get('Content-Type', '') if 'application/pdf' not in content_type.lower(): return { "success": False, "error": f"URL does not point to a PDF (Content-Type: {content_type})" } # Create a temporary file for the PDF with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_file: temp_path = temp_file.name temp_file.write(response.content) # Extract content using unstructured try: elements = partition_pdf( temp_path, strategy="hi_res", extract_images=False, extract_tables=True ) # Convert to formatted text while preserving structure content = "\n\n".join([str(element) for element in elements]) except UnicodeDecodeError: # Fall back to a simpler strategy if hi_res fails with encoding issues logger.warning(f"Encountered encoding issues with hi_res strategy, trying fast strategy") elements = partition_pdf( temp_path, strategy="fast", extract_images=False, extract_tables=False ) content = "\n\n".join([str(element) for element in elements]) # Try to extract a title from the filename in the URL path_parts = url.split('/') filename = path_parts[-1].split('?')[0] # Remove query parameters title = os.path.splitext(filename)[0].replace('-', ' ').replace('_', ' ').title() if not title: title = "PDF Document" # Clean up temporary file os.unlink(temp_path) return { "success": True, "content": content, "title": title, "url": url } except Exception as e: logger.error(f"Error parsing PDF: {str(e)}") # Clean up temporary file if it exists try: if 'temp_path' in locals(): os.unlink(temp_path) except: pass return { "success": False, "error": f"Error parsing PDF: {str(e)}" }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/PhiloSolares/roam-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.