# Home Assistant MCP
# by allenporter
# - src/mcp_atlassian
import logging
import re
import warnings
from bs4 import BeautifulSoup
from markdownify import markdownify as md
# Package-wide logger; uses the fixed name "mcp-atlassian" (not __name__),
# presumably so every module in the package logs under one handler — confirm
# against the package's logging setup.
logger = logging.getLogger("mcp-atlassian")
class TextPreprocessor:
    """Handles text preprocessing for Confluence and Jira content.

    Converts Atlassian-flavored HTML / wiki markup into markdown,
    resolving user mentions and rewriting "smart links" along the way.
    """

    # Patterns compiled once per process instead of on every call.
    _MENTION_RE = re.compile(r"\[~accountid:(.*?)\]")
    _SMART_LINK_RE = re.compile(r"\[(.*?)\|(.*?)\|smart-link\]")
    _ISSUE_KEY_RE = re.compile(r"browse/([A-Z]+-\d+)")
    _CONFLUENCE_PAGE_RE = re.compile(r"wiki/spaces/.+?/pages/\d+/(.+?)(?:\?|$)")
    _HTML_TAG_RE = re.compile(r"<[^>]+>")

    def __init__(self, base_url: str, confluence_client=None) -> None:
        """Store the Atlassian base URL (trailing slash stripped) and an
        optional Confluence client used to resolve account ids to names."""
        self.base_url = base_url.rstrip("/")
        self.confluence_client = confluence_client

    def process_html_content(self, html_content: str, space_key: str = "") -> tuple[str, str]:
        """Process HTML content to replace user refs and page links.

        Args:
            html_content: Confluence storage-format HTML fragment.
            space_key: Currently unused; kept for interface compatibility.

        Returns:
            Tuple of ``(processed_html, processed_markdown)``.

        Raises:
            Exception: any parsing/conversion failure is logged and re-raised.
        """
        try:
            soup = BeautifulSoup(html_content, "html.parser")
            self._replace_user_mentions(soup)
            processed_html = str(soup)
            processed_markdown = md(processed_html)
            return processed_html, processed_markdown
        except Exception as e:
            logger.error("Error in process_html_content: %s", e)
            raise

    def _replace_user_mentions(self, soup) -> None:
        """Replace ``<ri:user>`` mention structures with ``@name`` text, in place.

        Skips mentions when no account id is present or no Confluence client
        was injected; falls back to ``@user_<id>`` when the lookup fails.
        """
        for user in soup.find_all("ri:user"):
            account_id = user.get("ri:account-id")
            if not (account_id and self.confluence_client):
                continue  # nothing to resolve without both an id and a client
            # Hoisted: the enclosing ac:link is the node we replace in both
            # the success and the fallback path.
            link_tag = user.find_parent("ac:link")
            if link_tag is None:
                continue
            try:
                # Fetch user info using the Confluence API.
                user_info = self.confluence_client.get_user_details_by_accountid(account_id)
                display_name = user_info.get("displayName", account_id)
                link_tag.replace_with(f"@{display_name}")
            except Exception as e:
                logger.warning("Could not fetch user info for %s: %s", account_id, e)
                # Fallback: just use the account ID.
                link_tag.replace_with(f"@user_{account_id}")

    def clean_jira_text(self, text: str) -> str:
        """
        Clean Jira text content by:
        1. Processing user mentions and links
        2. Converting HTML/wiki markup to markdown

        Returns "" for empty/None input; otherwise the cleaned, stripped text.
        """
        if not text:
            return ""
        # Process user mentions.
        text = self._process_mentions(text, self._MENTION_RE.pattern)
        # Process Jira smart links.
        text = self._process_smart_links(text)
        # Convert HTML to markdown if needed.
        text = self._convert_html_to_markdown(text)
        return text.strip()

    def _process_mentions(self, text: str, pattern: str) -> str:
        """Replace ``[~accountid:<id>]`` tokens with a readable user label.

        Note: actual user lookup should be injected; this uses a
        ``User:<id>`` placeholder. The previous try/except here was dead
        code — neither the f-string nor ``str.replace`` can raise.
        """
        # set() dedupes repeated ids; replace() already rewrites every
        # occurrence of a given token in one pass.
        for account_id in set(re.findall(pattern, text)):
            text = text.replace(f"[~accountid:{account_id}]", f"User:{account_id}")
        return text

    def _process_smart_links(self, text: str) -> str:
        """Rewrite ``[text|url|smart-link]`` tokens as plain markdown links."""
        for match in self._SMART_LINK_RE.finditer(text):
            full_match = match.group(0)
            link_text = match.group(1)
            link_url = match.group(2)
            # Extract issue key if it's a Jira issue link.
            issue_key_match = self._ISSUE_KEY_RE.search(link_url)
            # Check if it's a Confluence wiki link.
            confluence_match = self._CONFLUENCE_PAGE_RE.search(link_url)
            if issue_key_match:
                # Jira issue: normalize the URL onto this instance's base URL.
                issue_key = issue_key_match.group(1)
                clean_url = f"{self.base_url}/browse/{issue_key}"
                replacement = f"[{issue_key}]({clean_url})"
            elif confluence_match:
                # Confluence page: derive a readable title from the URL slug,
                # dropping any leading issue-key prefix (e.g. "ABC-1 ").
                readable_title = confluence_match.group(1).replace("+", " ")
                readable_title = re.sub(r"^[A-Z]+-\d+\s+", "", readable_title)
                replacement = f"[{readable_title}]({link_url})"
            else:
                # Unknown smart link: keep the text, drop any query string.
                replacement = f"[{link_text}]({link_url.split('?')[0]})"
            text = text.replace(full_match, replacement)
        return text

    def _convert_html_to_markdown(self, text: str) -> str:
        """Convert HTML fragments in *text* to markdown.

        Plain text (no tags) is returned unchanged; conversion failures are
        logged and the original text returned (best-effort by design).
        """
        if not self._HTML_TAG_RE.search(text):
            return text
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=UserWarning)
                # Wrap so the fragment parses under a single known root.
                soup = BeautifulSoup(f"<div>{text}</div>", "html.parser")
                html = str(soup.div.decode_contents()) if soup.div else text
                text = md(html)
        except Exception as e:
            logger.warning("Error converting HTML to markdown: %s", e)
        return text