# Copyright 2025 ryu1maniwa. All Rights Reserved.
#
# This file is derived from awslabs.aws-documentation-mcp-server, which is licensed as follows:
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
# with the License. A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
# and limitations under the License.
"""Utility functions for OpenTelemetry Documentation MCP Server."""
import markdownify
from .models import SearchResult
from bs4 import BeautifulSoup
from typing import Any, Dict, List
# Selectors tried, in order, to locate the main documentation content.
_CONTENT_SELECTORS = (
    '.td-content',  # opentelemetry.io uses this selector for main content
    'main',
    'article',
    '#content',
    '.content',
    '#body-content',
    "div[role='main']",
    '.td-main',
)

# CSS selectors for elements that must not appear in the Markdown output.
# They are removed from the parsed tree rather than handed to markdownify's
# ``strip`` option: ``strip`` matches tag names only (so class selectors never
# match) and it keeps the inner text of the tags it strips, which would leak
# script, style and navigation text into the result.
_UNWANTED_SELECTORS = (
    # Non-content and page-chrome tags
    'script',
    'style',
    'noscript',
    'meta',
    'link',
    'footer',
    'nav',
    'aside',
    'header',
    # Navigation / feedback containers that may sit inside the main content
    '.prevNext',
    '.docsite-footer',
    '.feedback',
    '.td-sidebar',
    '.td-sidebar-nav',
    '.td-page-meta',
    '.td-search',
    # Common unnecessary elements (matched as element names)
    'js-show-more-buttons',
    'js-show-more-text',
    'feedback-container',
    'feedback-section',
    'doc-feedback-container',
    'doc-feedback-section',
    'warning-container',
    'warning-section',
    'cookie-banner',
    'cookie-notice',
    'copyright-section',
    'legal-section',
    'terms-section',
)


def extract_content_from_html(html: str) -> str:
    """Extract and convert HTML content to Markdown format.

    The main content area is located with the first matching entry of
    ``_CONTENT_SELECTORS`` (falling back to ``<body>``, then to the whole
    document). Every element matching ``_UNWANTED_SELECTORS`` is removed from
    it, and the remainder is converted to Markdown with markdownify.

    Args:
        html: Raw HTML content to process

    Returns:
        Simplified markdown version of the content. Failures are reported
        in-band as an ``<e>...</e>`` message instead of being raised.
    """
    if not html:
        return '<e>Empty HTML content</e>'

    try:
        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Try to find the main content using common selectors
        main_content = None
        for selector in _CONTENT_SELECTORS:
            main_content = soup.select_one(selector)
            if main_content is not None:
                break

        # If no main content found, use the body (or the whole document)
        if main_content is None:
            main_content = soup.body if soup.body is not None else soup

        # Drop unwanted elements together with their contents. select() only
        # searches descendants, so the content root itself is never removed.
        for selector in _UNWANTED_SELECTORS:
            for element in main_content.select(selector):
                element.decompose()

        # Use markdownify on the cleaned HTML content
        content = markdownify.markdownify(
            str(main_content),
            heading_style='ATX',
            autolinks=True,
            default_title=True,
            escape_asterisks=True,
            escape_underscores=True,
            newline_style='SPACES',
        )

        if not content:
            return '<e>Page failed to be simplified from HTML</e>'
        return content
    except Exception as e:
        # Deliberate best-effort boundary: callers expect a string, so any
        # parsing/conversion failure is returned as an error message.
        return f'<e>Error converting HTML to Markdown: {str(e)}</e>'
def is_html_content(page_raw: str, content_type: str) -> bool:
    """Determine if content is HTML.

    Content is treated as HTML when the Content-Type header is missing or
    empty, when the header mentions ``text/html``, or when ``<html`` appears
    within the first 100 characters of the page. Both textual checks are
    case-insensitive, since media types and HTML tag names are
    case-insensitive.

    Args:
        page_raw: Raw page content
        content_type: Content-Type header (may be empty or None)

    Returns:
        True if content is HTML, False otherwise
    """
    # Checked first so that a missing header (None) never reaches the
    # substring test below.
    if not content_type:
        return True
    if 'text/html' in content_type.lower():
        return True
    # Only sniff the start of the document; the slice keeps this cheap for
    # large pages.
    return '<html' in page_raw[:100].lower()
def format_documentation_result(url: str, content: str, start_index: int, max_length: int) -> str:
    """Format documentation result with pagination information.

    Returns the slice ``content[start_index:start_index + max_length]``
    prefixed with a header naming the source URL. When content remains after
    the slice, a notice with the next ``start_index`` is appended.

    Args:
        url: Documentation URL
        content: Content to format
        start_index: Start index for pagination; negative values are treated as 0
        max_length: Maximum content length; negative values are treated as 0

    Returns:
        Formatted documentation result, or a "no more content" message when
        the requested page is empty
    """
    header = f'OpenTelemetry Documentation from {url}:\n\n'
    no_more_content = f'{header}<e>No more content available.</e>'

    # Negative values would make the slice below count from the end of the
    # content and produce a wrong continuation index, so clamp them.
    start_index = max(start_index, 0)
    max_length = max(max_length, 0)

    total_length = len(content)
    if start_index >= total_length:
        return no_more_content

    # Calculate the end index, ensuring we don't go beyond the content length
    end_index = min(start_index + max_length, total_length)
    page = content[start_index:end_index]
    if not page:
        return no_more_content

    result = f'{header}{page}'

    # Only add the prompt to continue fetching if there is still remaining
    # content; the next page starts exactly where this one ended.
    if end_index < total_length:
        result += f'\n\n<e>Content truncated. Call the read_documentation tool with start_index={end_index} to get more content.</e>'

    return result
def parse_search_results(data: Dict[str, Any]) -> List[SearchResult]:
    """Convert a raw Google Custom Search response into SearchResult objects.

    Args:
        data: Raw API response data from Google Custom Search

    Returns:
        One SearchResult per entry of ``data['items']``, ranked from 1 in
        response order; an empty list when the response has no ``items`` key
    """
    # A response without an 'items' key yields no results.
    if 'items' not in data:
        return []

    return [
        SearchResult(
            rank_order=position,
            url=entry.get('link', ''),
            title=entry.get('title', ''),
            context=entry.get('snippet'),
        )
        for position, entry in enumerate(data['items'], start=1)
    ]