"""HTML to Markdown conversion utility."""
from bs4 import BeautifulSoup
def html_to_markdown(html_content: str) -> str:
    """Convert HTML to simplified Markdown format.

    Uses BeautifulSoup for basic conversion. Script, style, noscript, iframe,
    nav, footer and header elements are dropped. Links with both text and an
    ``href`` become ``[text](href)``.

    The output is grouped by element type rather than following document
    order: headings (every h1, then every h2, ... h6), then unordered lists,
    then ordered lists, then paragraphs, and finally any text left over.

    Args:
        html_content: Raw HTML content to convert. Empty or ``None`` input
            yields an empty string.

    Returns:
        Converted Markdown text.
    """
    if not html_content:
        return ""

    soup = BeautifulSoup(html_content, "html.parser")

    # Remove unwanted tags
    unwanted = ["script", "style", "noscript", "iframe", "nav", "footer", "header"]
    for tag in soup(unwanted):
        tag.decompose()

    # A <br> carries no text of its own; turn it into whitespace so the words
    # on either side of it are not glued together.
    for br in soup.find_all("br"):
        br.replace_with("\n")

    # Convert links before anything else is consumed, so headings, list items
    # and paragraphs all keep the link markup.
    for link in soup.find_all("a"):
        href = link.get("href", "")
        text = _inline_text(link)
        if text and href:
            link.replace_with(f"[{text}]({href})")

    markdown_text = []

    # Process headings
    for level in range(1, 7):
        for heading in soup.find_all(f"h{level}"):
            heading_text = _inline_text(heading)
            if heading_text:
                markdown_text.append(f"\n{'#' * level} {heading_text}\n")
            heading.decompose()

    # Process lists: unordered first, then ordered. Only lists that are not
    # nested inside another list are picked here; nested ones are rendered
    # (indented) by _render_list as part of their parent item.
    for list_name in ("ul", "ol"):
        top_level_lists = [
            candidate
            for candidate in soup.find_all(list_name)
            if candidate.find_parent(["ul", "ol"]) is None
        ]
        for list_tag in top_level_lists:
            markdown_text.extend(_render_list(list_tag))
            list_tag.decompose()

    # Process paragraphs
    for p in soup.find_all("p"):
        p_text = _inline_text(p)
        if p_text:
            markdown_text.append(f"\n{p_text}\n")
        p.decompose()

    # Get remaining text
    remaining_text = soup.get_text(separator="\n")
    stripped_lines = (line.strip() for line in remaining_text.splitlines())
    markdown_text.extend(line for line in stripped_lines if line)

    # Merge and clean
    result = "\n".join(markdown_text)

    # Remove excess blank lines
    while "\n\n\n" in result:
        result = result.replace("\n\n\n", "\n\n")

    return result.strip()


def _inline_text(node) -> str:
    """Return the text of *node* with each whitespace run collapsed to one space.

    ``get_text(strip=True)`` strips every text fragment and joins them with an
    empty separator, which glues words across inline tags
    (``Hello <b>world</b>`` -> ``Helloworld``). Normalising the raw text keeps
    the spacing the author wrote.
    """
    return " ".join(node.get_text().split())


def _render_list(list_tag, depth: int = 0) -> list:
    """Render one <ul>/<ol> element as Markdown lines.

    Only the items owned by *list_tag* are rendered at this level. Lists
    nested inside an item are detached from it, so their text is not repeated
    in the parent line, and are rendered recursively with four spaces of
    indentation per nesting level.

    Args:
        list_tag: The BeautifulSoup ``ul`` or ``ol`` element to render.
        depth: Nesting level of this list; 0 for a top-level list.

    Returns:
        The Markdown lines for this list and everything nested in it.
    """
    list_names = ["ul", "ol"]
    indent = "    " * depth
    ordered = list_tag.name == "ol"

    # Snapshot the items whose nearest enclosing list is this one, before any
    # nested list is detached from the tree.
    items = [
        li
        for li in list_tag.find_all("li")
        if li.find_parent(list_names) is list_tag
    ]

    lines = []
    # Empty items still consume a number, as in a rendered HTML list.
    for number, item in enumerate(items, 1):
        nested_lists = [
            nested
            for nested in item.find_all(list_names)
            if nested.find_parent(list_names) is list_tag
        ]
        for nested in nested_lists:
            nested.extract()

        item_text = _inline_text(item)
        if item_text:
            marker = f"{number}." if ordered else "-"
            lines.append(f"{indent}{marker} {item_text}")

        # Without a parent line an indented child would read as a code block,
        # so keep children of a text-less item at the current level.
        child_depth = depth + 1 if item_text else depth
        for nested in nested_lists:
            lines.extend(_render_list(nested, child_depth))

    return lines