AWS Documentation MCP Server

util.py (8.69 kB)
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for AWS Documentation MCP Server."""

import markdownify
from awslabs.aws_documentation_mcp_server.models import RecommendationResult
from typing import Any, Dict, List


def extract_content_from_html(html: str) -> str:
    """Extract and convert HTML content to Markdown format.

    Args:
        html: Raw HTML content to process

    Returns:
        Simplified markdown version of the content
    """
    if not html:
        return '<e>Empty HTML content</e>'

    try:
        # First use BeautifulSoup to clean up the HTML
        from bs4 import BeautifulSoup

        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Try to find the main content area
        main_content = None

        # Common content container selectors for AWS documentation
        content_selectors = [
            'main',
            'article',
            '#main-content',
            '.main-content',
            '#content',
            '.content',
            "div[role='main']",
            '#awsdocs-content',
            '.awsui-article',
        ]

        # Try to find the main content using common selectors
        for selector in content_selectors:
            content = soup.select_one(selector)
            if content:
                main_content = content
                break

        # If no main content found, use the body
        if not main_content:
            main_content = soup.body if soup.body else soup

        # Remove navigation elements that might be in the main content
        nav_selectors = [
            'noscript',
            '.prev-next',
            '#main-col-footer',
            '.awsdocs-page-utilities',
            '#quick-feedback-yes',
            '#quick-feedback-no',
            '.page-loading-indicator',
            '#tools-panel',
            '.doc-cookie-banner',
            'awsdocs-copyright',
            'awsdocs-thumb-feedback',
        ]

        for selector in nav_selectors:
            for element in main_content.select(selector):
                element.decompose()

        # Define tags to strip - these are elements we don't want in the output
        tags_to_strip = [
            'script',
            'style',
            'noscript',
            'meta',
            'link',
            'footer',
            'nav',
            'aside',
            'header',
            # AWS documentation specific elements
            'awsdocs-cookie-consent-container',
            'awsdocs-feedback-container',
            'awsdocs-page-header',
            'awsdocs-page-header-container',
            'awsdocs-filter-selector',
            'awsdocs-breadcrumb-container',
            'awsdocs-page-footer',
            'awsdocs-page-footer-container',
            'awsdocs-footer',
            'awsdocs-cookie-banner',
            # Common unnecessary elements
            'js-show-more-buttons',
            'js-show-more-text',
            'feedback-container',
            'feedback-section',
            'doc-feedback-container',
            'doc-feedback-section',
            'warning-container',
            'warning-section',
            'cookie-banner',
            'cookie-notice',
            'copyright-section',
            'legal-section',
            'terms-section',
        ]

        # Use markdownify on the cleaned HTML content
        content = markdownify.markdownify(
            str(main_content),
            heading_style=markdownify.ATX,
            autolinks=True,
            default_title=True,
            escape_asterisks=True,
            escape_underscores=True,
            newline_style='SPACES',
            strip=tags_to_strip,
        )

        if not content:
            return '<e>Page failed to be simplified from HTML</e>'

        return content
    except Exception as e:
        return f'<e>Error converting HTML to Markdown: {str(e)}</e>'


def is_html_content(page_raw: str, content_type: str) -> bool:
    """Determine if content is HTML.

    Args:
        page_raw: Raw page content
        content_type: Content-Type header

    Returns:
        True if content is HTML, False otherwise
    """
    return '<html' in page_raw[:100] or 'text/html' in content_type or not content_type


def format_documentation_result(url: str, content: str, start_index: int, max_length: int) -> str:
    """Format documentation result with pagination information.

    Args:
        url: Documentation URL
        content: Content to format
        start_index: Start index for pagination
        max_length: Maximum content length

    Returns:
        Formatted documentation result
    """
    original_length = len(content)

    if start_index >= original_length:
        return f'AWS Documentation from {url}:\n\n<e>No more content available.</e>'

    # Calculate the end index, ensuring we don't go beyond the content length
    end_index = min(start_index + max_length, original_length)
    truncated_content = content[start_index:end_index]

    if not truncated_content:
        return f'AWS Documentation from {url}:\n\n<e>No more content available.</e>'

    actual_content_length = len(truncated_content)
    remaining_content = original_length - (start_index + actual_content_length)

    result = f'AWS Documentation from {url}:\n\n{truncated_content}'

    # Only add the prompt to continue fetching if there is still remaining content
    if remaining_content > 0:
        next_start = start_index + actual_content_length
        result += f'\n\n<e>Content truncated. Call the read_documentation tool with start_index={next_start} to get more content.</e>'

    return result


def parse_recommendation_results(data: Dict[str, Any]) -> List[RecommendationResult]:
    """Parse recommendation API response into RecommendationResult objects.

    Args:
        data: Raw API response data

    Returns:
        List of recommendation results
    """
    results = []

    # Process highly rated recommendations
    if 'highlyRated' in data and 'items' in data['highlyRated']:
        for item in data['highlyRated']['items']:
            context = item.get('abstract') if 'abstract' in item else None

            results.append(
                RecommendationResult(
                    url=item.get('url', ''), title=item.get('assetTitle', ''), context=context
                )
            )

    # Process journey recommendations (organized by intent)
    if 'journey' in data and 'items' in data['journey']:
        for intent_group in data['journey']['items']:
            intent = intent_group.get('intent', '')
            if 'urls' in intent_group:
                for url_item in intent_group['urls']:
                    # Add intent as part of the context
                    context = f'Intent: {intent}' if intent else None

                    results.append(
                        RecommendationResult(
                            url=url_item.get('url', ''),
                            title=url_item.get('assetTitle', ''),
                            context=context,
                        )
                    )

    # Process new content recommendations
    if 'new' in data and 'items' in data['new']:
        for item in data['new']['items']:
            # Add "New content" label to context
            date_created = item.get('dateCreated', '')
            context = f'New content added on {date_created}' if date_created else 'New content'

            results.append(
                RecommendationResult(
                    url=item.get('url', ''), title=item.get('assetTitle', ''), context=context
                )
            )

    # Process similar recommendations
    if 'similar' in data and 'items' in data['similar']:
        for item in data['similar']['items']:
            context = item.get('abstract') if 'abstract' in item else 'Similar content'

            results.append(
                RecommendationResult(
                    url=item.get('url', ''), title=item.get('assetTitle', ''), context=context
                )
            )

    return results
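
For readers wiring these helpers into a server or script, here is a minimal usage sketch. It is not part of util.py: it assumes the file is importable as awslabs.aws_documentation_mcp_server.util (consistent with the package-relative import at the top of the file), and it substitutes a hard-coded HTML snippet and a made-up recommendation payload for real AWS documentation responses.

# Usage sketch only; illustrative, not part of util.py.
from awslabs.aws_documentation_mcp_server.util import (
    extract_content_from_html,
    format_documentation_result,
    is_html_content,
    parse_recommendation_results,
)

page_raw = (
    '<html><body><main><h1>Amazon S3</h1>'
    '<p>Store and retrieve any amount of data.</p></main></body></html>'
)
content_type = 'text/html'
url = 'https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html'

# Only run the HTML-to-Markdown simplification when the payload looks like HTML.
if is_html_content(page_raw, content_type):
    content = extract_content_from_html(page_raw)
else:
    content = page_raw

# Paginate the simplified Markdown; if it is truncated, the embedded notice tells the
# caller which start_index to pass to read_documentation to fetch the next chunk.
print(format_documentation_result(url, content, start_index=0, max_length=5000))

# parse_recommendation_results reads the grouped keys shown below; this payload is a
# made-up illustration of that shape, not a captured API response.
sample = {
    'highlyRated': {
        'items': [
            {
                'url': 'https://docs.aws.amazon.com/s3/',
                'assetTitle': 'Amazon S3 documentation',
                'abstract': 'Object storage built to retrieve any amount of data.',
            }
        ]
    },
    'journey': {
        'items': [
            {
                'intent': 'Get started',
                'urls': [
                    {
                        'url': 'https://docs.aws.amazon.com/AmazonS3/latest/userguide/GetStartedWithS3.html',
                        'assetTitle': 'Getting started with Amazon S3',
                    }
                ],
            }
        ]
    },
}

# RecommendationResult is constructed with url/title/context keywords above,
# so those attributes are read back here.
for rec in parse_recommendation_results(sample):
    print(rec.url, rec.title, rec.context)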
