Google Search MCP Server

by mixelpixx
```python
from flask import Flask, request, jsonify
from flask_cors import CORS
from bs4 import BeautifulSoup
import requests
import trafilatura
from markdownify import markdownify
from urllib.parse import urlparse
import re
from typing import Dict

app = Flask(__name__)
CORS(app)


class LinkViewer:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/91.0.4472.124 Safari/537.36'
        }

    def is_valid_url(self, url: str) -> bool:
        """Check if URL is valid and supported."""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except ValueError:
            return False

    def _clean_markdown(self, md_text: str) -> str:
        """Clean up markdown text for better readability."""
        # Collapse runs of blank lines into a single blank line
        md_text = re.sub(r'\n\s*\n\s*\n', '\n\n', md_text)
        # Remove excessive spaces
        md_text = re.sub(r' +', ' ', md_text)
        # Ensure headers have a space after '#'
        md_text = re.sub(r'#([A-Za-z0-9])', r'# \1', md_text)
        return md_text.strip()

    def extract_content(self, url: str) -> Dict:
        """
        Extract webpage content and convert it to Markdown.

        Returns structured data including the markdown content.
        """
        if not self.is_valid_url(url):
            raise ValueError("Invalid URL provided")

        try:
            # Fetch the webpage
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            # Parse with BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract metadata from <meta> tags
            meta_tags = {}
            for tag in soup.find_all('meta'):
                name = tag.get('name', tag.get('property', ''))
                content = tag.get('content', '')
                if name and content:
                    meta_tags[name] = content

            # Try trafilatura first for the main content
            main_content_html = trafilatura.extract(
                response.text,
                include_links=True,
                include_tables=True,
                output_format='html'
            )

            if main_content_html:
                # Convert the extracted main content to markdown
                markdown_content = markdownify(main_content_html, heading_style="ATX")
            else:
                # Fallback: convert the relevant body content.
                # Remove unwanted elements first.
                for element in soup.select('script, style, nav, footer, header, aside'):
                    element.decompose()
                main_content_html = str(
                    soup.find('main') or soup.find('article') or soup.find('body')
                )
                markdown_content = markdownify(main_content_html, heading_style="ATX")

            # Clean up the markdown
            markdown_content = self._clean_markdown(markdown_content)

            # Calculate content stats
            word_count = len(re.findall(r'\w+', markdown_content))

            # Structure the extracted data
            extracted_data = {
                'url': url,
                'title': soup.title.get_text(strip=True) if soup.title else '',
                'description': meta_tags.get('description', ''),
                'markdown_content': markdown_content,
                'meta_tags': meta_tags,
                'stats': {
                    'word_count': word_count,
                    'approximate_chars': len(markdown_content)
                },
                'content_preview': {
                    'first_500_chars': markdown_content[:500] + '...'
                        if len(markdown_content) > 500 else markdown_content
                }
            }
            return extracted_data

        except requests.RequestException as e:
            raise Exception(f"Failed to fetch content: {str(e)}")
        except Exception as e:
            raise Exception(f"Error processing content: {str(e)}")


# Initialize the LinkViewer
viewer = LinkViewer()


@app.route('/analyze', methods=['POST'])
def analyze_link():
    """Endpoint to analyze a webpage and convert it to markdown."""
    data = request.get_json()
    if not data or 'url' not in data:
        return jsonify({'error': 'Missing URL'}), 400

    try:
        content = viewer.extract_content(data['url'])
        return jsonify(content), 200
    except ValueError as e:
        return jsonify({'error': str(e)}), 400
    except Exception as e:
        return jsonify({'error': f'Analysis failed: {str(e)}'}), 500


@app.route('/batch_analyze', methods=['POST'])
def batch_analyze():
    """Endpoint to analyze multiple URLs."""
    data = request.get_json()
    if not data or 'urls' not in data:
        return jsonify({'error': 'Missing URLs'}), 400

    results = {}
    for url in data['urls']:
        try:
            results[url] = viewer.extract_content(url)
        except Exception as e:
            results[url] = {'error': str(e)}
    return jsonify(results), 200


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5001)
```
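
A minimal client sketch for exercising the two endpoints, assuming the server above is running locally on its default port 5001; the example.com and example.org URLs are placeholder targets, not part of the original code:

```python
import requests

BASE = 'http://localhost:5001'  # assumes the Flask server above is running locally

# Analyze a single page: POST a JSON body with a 'url' key to /analyze
resp = requests.post(f'{BASE}/analyze', json={'url': 'https://example.com'})
data = resp.json()
print(data.get('title'), data.get('stats'))
print(data.get('content_preview', {}).get('first_500_chars'))

# Analyze several pages in one call: POST a 'urls' list to /batch_analyze.
# The response maps each URL to its extracted data, or to an {'error': ...}
# object for pages that failed.
resp = requests.post(
    f'{BASE}/batch_analyze',
    json={'urls': ['https://example.com', 'https://example.org']}
)
for url, result in resp.json().items():
    if 'error' in result:
        print(url, '->', result['error'])
    else:
        print(url, '->', result['stats']['word_count'], 'words')
```

Note that `/batch_analyze` fetches URLs sequentially, so large batches will be slow; per-URL failures are reported inline in the result map rather than failing the whole request.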