Europe PMC Literature Search MCP Server

html_to_markdown.py•10.8 KiB

#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ HTML转Markdown功能模块使用markdownify库实现专业的HTML到Markdown转换 """ from typing import Optional, Dict, Any from markdownify import markdownify as md import logging def html_to_markdown(html_content: str, **options) -> Optional[str]: """将HTML内容转换为Markdown格式功能说明： - 使用markdownify库将HTML内容转换为Markdown格式 - 支持自定义转换选项 - 处理复杂的HTML结构（表格、列表、链接等）参数说明： - html_content: 必需，HTML内容字符串 - **options: 可选，markdownify转换选项返回值说明： - 转换后的Markdown内容 - 如果输入为空或转换失败则返回None 使用场景： - 将PMC XML内容转换为Markdown格式 - 提取网页内容并保存为Markdown - 学术文献的格式转换技术特点： - 基于markdownify库，转换质量高 - 支持自定义转换规则 - 处理复杂的HTML结构 """ if not html_content: return None try: # 默认配置选项 default_options = { 'heading_style': 'ATX', # 使用#风格的标题 'convert': ['a', 'b', 'strong', 'i', 'em', 'code', 'pre', 'p', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'blockquote', 'table', 'thead', 'tbody', 'tr', 'th', 'td'], 'default_title': False, # 不添加默认标题 } # 合并用户自定义选项 final_options = {**default_options, **options} # 执行转换 markdown_content = md(html_content, **final_options) # 清理多余的空行 markdown_content = '\n'.join(line.rstrip() for line in markdown_content.split('\n')) markdown_content = markdown_content.strip() return markdown_content except Exception as e: logging.error(f"HTML转Markdown时发生错误: {e}") return None def html_to_text(html_content: str) -> Optional[str]: """将HTML内容转换为纯文本（兼容原有函数）功能说明： - 使用markdownify的strip功能将HTML转换为纯文本 - 保持与原有函数的接口兼容性参数说明： - html_content: 必需，HTML内容字符串返回值说明： - 转换后的纯文本内容 - 如果输入为空则返回None """ if not html_content: return None try: # 使用markdownify的纯文本模式 text_content = md(html_content, strip=['a', 'b', 'strong', 'i', 'em', 'code', 'pre', 'p', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'blockquote', 'table', 'thead', 'tbody', 'tr', 'th', 'td', 'script', 'style'], heading_style='ATX') # 清理多余的空行和空格 lines = [line.strip() for line in text_content.split('\n') if line.strip()] text_content = '\n'.join(lines) return text_content except Exception as e: logging.error(f"HTML转文本时发生错误: {e}") return None def extract_structured_content(html_content: str, output_format: str = 'markdown') -> Dict[str, Any]: """从HTML中提取结构化内容功能说明： - 从HTML内容中提取标题、正文等结构化信息 - 支持输出为Markdown或纯文本格式参数说明： - html_content: 必需，HTML内容字符串 - output_format: 可选，输出格式 ('markdown' 或 'text') 返回值说明： - 包含提取内容的字典 """ if not html_content: return { "title": None, "content": None, "format": output_format, "error": "HTML内容为空" } try: # 提取标题（从title标签或h1标签） import re title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL) if not title_match: title_match = re.search(r'<h1[^>]*>(.*?)</h1>', html_content, re.IGNORECASE | re.DOTALL) title = title_match.group(1).strip() if title_match else "无标题" # 根据输出格式转换内容 if output_format == 'markdown': content = html_to_markdown(html_content) else: content = html_to_text(html_content) return { "title": title, "content": content, "format": output_format, "error": None } except Exception as e: return { "title": None, "content": None, "format": output_format, "error": f"提取结构化内容时发生错误: {str(e)}" } def convert_pmc_xml_to_markdown(xml_content: str) -> Optional[str]: """专门处理PMC XML内容的转换功能说明： - 针对PMC XML格式优化的转换函数 - 处理学术文献特有的标签结构 - 增强对front matter和section结构的解析参数说明： - xml_content: 必需，PMC XML内容字符串返回值说明： - 转换后的Markdown内容 """ if not xml_content: return None try: # 首先进行XML预处理，提取关键结构 import re # 提取文章标题 title_match = re.search(r'<article-title[^>]*>(.*?)</article-title>', xml_content, re.DOTALL) title = title_match.group(1).strip() if title_match else "" # 提取作者信息 authors = [] author_matches = re.findall(r'<contrib[^>]*contrib-type="author"[^>]*>(.*?)</contrib>', xml_content, re.DOTALL) for author_match in author_matches: surname = re.search(r'<surname>([^<]+)</surname>', author_match) given_names = re.search(r'<given-names>([^<]+)</given-names>', author_match) if surname and given_names: authors.append(f"{given_names.group(1)} {surname.group(1)}") # 提取摘要 abstract_match = re.search(r'<abstract[^>]*>(.*?)</abstract>', xml_content, re.DOTALL) abstract = abstract_match.group(1).strip() if abstract_match else "" # 提取关键词 keywords = re.findall(r'<kwd>([^<]+)</kwd>', xml_content) # 提取正文内容 body_match = re.search(r'<body[^>]*>(.*?)</body>', xml_content, re.DOTALL) body_content = body_match.group(1).strip() if body_match else xml_content # 处理section标题 def process_sections(content): # 将section标题转换为Markdown格式 content = re.sub(r'<sec[^>]*>.*?<title>([^<]+)</title>', r'\n\n## \1\n\n', content) # 处理子section content = re.sub(r'<sec[^>]*>.*?<title>([^<]+)</title>', r'\n\n### \1\n\n', content) return content body_content = process_sections(body_content) # 处理引用链接 def process_references(content): # 处理文献引用 content = re.sub(r'<xref[^>]*ref-type="bibr"[^>]*>([^<]+)</xref>', r'[\1]', content) # 处理图表引用 content = re.sub(r'<xref[^>]*ref-type="fig"[^>]*>([^<]+)</xref>', r'\1', content) return content body_content = process_references(body_content) # 构建增强的转换选项 pmc_options = { 'heading_style': 'ATX', 'convert': [ 'title', 'abstract', 'sec', 'p', 'bold', 'italic', 'underline', 'sc', 'monospace', 'sub', 'sup', 'table', 'thead', 'tbody', 'tr', 'th', 'td', 'list', 'list-item', 'email', 'ext-link' ], 'default_title': False, } # 转换正文内容 body_markdown = html_to_markdown(body_content, **pmc_options) # 构建完整的Markdown文档 markdown_parts = [] if title: markdown_parts.append(f"# {title}") markdown_parts.append("") if authors: markdown_parts.append("**Authors:** " + ", ".join(authors)) markdown_parts.append("") if keywords: markdown_parts.append("**Keywords:** " + ", ".join(keywords)) markdown_parts.append("") if abstract: # 转换摘要 abstract_markdown = html_to_markdown(abstract, **pmc_options) if abstract_markdown: markdown_parts.append("## Abstract") markdown_parts.append(abstract_markdown) markdown_parts.append("") if body_markdown: markdown_parts.append(body_markdown) return "\n".join(markdown_parts) except Exception as e: logging.error(f"PMC XML转Markdown时发生错误: {e}") # 如果增强转换失败，回退到基础转换 return html_to_markdown(xml_content) # 测试代码 if __name__ == "__main__": # 测试HTML test_html = """ <html> <head><title>测试学术文献</title></head> <body> <h1>研究背景</h1> <p>这是一个<strong>重要</strong>的学术研究，涉及<em>机器学习</em>和<a href="http://example.com">深度学习</a>技术。</p> <h2>实验方法</h2> <p>我们使用了以下方法：</p> <ul> <li>数据预处理</li> <li>模型训练</li> <li>结果评估</li> </ul> <h2>实验结果</h2> <table> <thead> <tr> <th>方法</th> <th>准确率</th> </tr> </thead> <tbody> <tr> <td>方法A</td> <td>95.2%</td> </tr> <tr> <td>方法B</td> <td>97.8%</td> </tr> </tbody> </table> <blockquote> <p>这是一段重要的引用内容。</p> </blockquote> </body> </html> """ print("测试HTML转Markdown功能:") print("=" * 50) markdown_result = html_to_markdown(test_html) print(markdown_result) print("\n\n测试HTML转文本功能:") print("=" * 50) text_result = html_to_text(test_html) print(text_result) print("\n\n测试结构化内容提取:") print("=" * 50) structured = extract_structured_content(test_html, output_format='markdown') print(f"标题: {structured['title']}") print(f"格式: {structured['format']}") print(f"内容长度: {len(structured['content']) if structured['content'] else 0} 字符") print(f"错误信息: {structured['error']}")

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/gqy20/article-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

html_to_markdown.py•10.8 KiB