get_similar_articles
Find similar academic articles based on a given DOI using PubMed's related articles algorithm. Filters results from the last 5 years and retrieves detailed metadata for research reviews, related studies, and scholarly analysis.
Instructions
根据DOI获取相似文章(基于PubMed相关文章算法)
功能说明:
基于PubMed的相关文章算法查找与给定DOI相似的文献
使用NCBI eLink服务查找相关文章
自动过滤最近5年内的文献
批量获取相关文章的详细信息
参数说明:
doi: 必需,数字对象标识符(如:"10.1126/science.adf6218")
email: 可选,联系邮箱,用于获得更高的API访问限制
max_results: 可选,返回的最大相似文章数量,默认20篇
返回值说明:
original_article: 原始文章信息
title: 文章标题
authors: 作者列表
journal: 期刊名称
publication_date: 发表日期
pmid: PubMed ID
pmcid: PMC ID(如果有)
abstract: 摘要
similar_articles: 相似文章列表(格式同原始文章)
total_similar_count: 总相似文章数量
retrieved_count: 实际获取的文章数量
message: 处理信息
error: 错误信息(如果有)
使用场景:
文献综述研究
寻找相关研究
学术调研
相关工作分析
技术特点:
基于PubMed官方相关文章算法
自动日期过滤(最近5年)
批量获取详细信息
完整的错误处理
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| doi | Yes | Digital Object Identifier, e.g. "10.1126/science.adf6218" | |
| email | No | Contact email used to obtain higher NCBI API rate limits | |
| max_results | No | Maximum number of similar articles to return | 20 |
Implementation Reference
- Core handler function that implements the logic to fetch similar articles by DOI using PubMed APIs (ESearch, ELink, EFetch). Parses XML responses and returns structured article data.def get_similar_articles_by_doi( doi: str, email: str = None, max_results: int = 20 ) -> dict[str, Any]: """根据DOI获取相似文章""" try: # 验证DOI if not doi or not doi.strip(): return { "original_article": None, "similar_articles": [], "total_similar_count": 0, "retrieved_count": 0, "error": "DOI不能为空", } if not email: email = "user@example.com" headers = {"User-Agent": f"{TOOL_NAME}/1.0 ({email})"} # 步骤1:通过DOI获取初始文章的PMID logger.info(f"正在为 DOI {doi} 搜索 PMID") esearch_params = { "db": "pubmed", "term": doi, "retmax": 1, "retmode": "xml", "email": email, "tool": TOOL_NAME, } response = requests.get( f"{NCBI_BASE_URL}esearch.fcgi", params=esearch_params, headers=headers ) response.raise_for_status() esearch_xml = ET.fromstring(response.content) ids = esearch_xml.findall(".//Id") if not ids: return { "original_article": None, "similar_articles": [], "total_similar_count": 0, "message": f"未找到 DOI: {doi} 对应的 PubMed 记录", } initial_pmid = ids[0].text logger.info(f"找到初始文章 PMID: {initial_pmid}") # 步骤2:获取初始文章详情 efetch_params = { "db": "pubmed", "id": initial_pmid, "rettype": "xml", "retmode": "xml", "email": email, "tool": TOOL_NAME, } response = requests.get( f"{NCBI_BASE_URL}efetch.fcgi", params=efetch_params, headers=headers ) response.raise_for_status() efetch_xml = ET.fromstring(response.content) original_article_xml = efetch_xml.find(".//PubmedArticle") original_article = parse_pubmed_article(original_article_xml) if not original_article: return { "original_article": None, "similar_articles": [], "total_similar_count": 0, "error": f"无法解析初始 PMID: {initial_pmid} 的文章信息", } # 步骤3:使用elink查找相关文章 elink_params = { "dbfrom": "pubmed", "db": "pubmed", "id": initial_pmid, "linkname": "pubmed_pubmed", "cmd": "neighbor_history", "email": email, "tool": TOOL_NAME, } response = 
requests.get(f"{NCBI_BASE_URL}elink.fcgi", params=elink_params, headers=headers) response.raise_for_status() elink_xml = ET.fromstring(response.content) webenv_elink = elink_xml.findtext(".//WebEnv") query_key_elink = elink_xml.findtext(".//LinkSetDbHistory/QueryKey") if not webenv_elink or not query_key_elink: return { "original_article": original_article, "similar_articles": [], "total_similar_count": 0, "message": "找到了原始文章,但未找到相关文章", } # 步骤4:使用日期过滤获取相关文章 today = datetime.now() five_years_ago = today - timedelta(days=5 * 365.25) min_date = five_years_ago.strftime("%Y/%m/%d") max_date = today.strftime("%Y/%m/%d") esearch_params2 = { "db": "pubmed", "query_key": query_key_elink, "WebEnv": webenv_elink, "retmax": str(max_results), "retmode": "xml", "datetype": "pdat", "mindate": min_date, "maxdate": max_date, "email": email, "tool": TOOL_NAME, "usehistory": "y", } response = requests.get( f"{NCBI_BASE_URL}esearch.fcgi", params=esearch_params2, headers=headers ) response.raise_for_status() esearch_xml2 = ET.fromstring(response.content) total_count = int(esearch_xml2.findtext(".//Count", "0")) webenv_filtered = esearch_xml2.findtext(".//WebEnv") query_key_filtered = esearch_xml2.findtext(".//QueryKey") if total_count == 0: return { "original_article": original_article, "similar_articles": [], "total_similar_count": 0, "message": "在最近5年内未找到相关文章", } # 步骤5:批量获取相关文章详情 similar_articles = [] actual_fetch_count = min(total_count, max_results) efetch_params_batch = { "db": "pubmed", "query_key": query_key_filtered, "WebEnv": webenv_filtered, "retstart": "0", "retmax": str(actual_fetch_count), "rettype": "xml", "retmode": "xml", "email": email, "tool": TOOL_NAME, } response = requests.get( f"{NCBI_BASE_URL}efetch.fcgi", params=efetch_params_batch, headers=headers ) response.raise_for_status() efetch_xml_batch = ET.fromstring(response.content) article_elements = efetch_xml_batch.findall(".//PubmedArticle") for article_xml in article_elements: article_details = 
parse_pubmed_article(article_xml) if article_details: similar_articles.append(article_details) logger.info(f"成功获取了 {len(similar_articles)} 篇相关文章") return { "original_article": original_article, "similar_articles": similar_articles, "total_similar_count": total_count, "retrieved_count": len(similar_articles), "message": f"成功找到并获取了 {len(similar_articles)} 篇相关文章", } except requests.exceptions.RequestException as e: logger.error(f"网络请求错误: {e}") return {"error": f"网络请求错误: {e}"} except ET.ParseError as e: logger.error(f"XML解析错误: {e}") return {"error": f"XML解析错误: {e}"} except Exception as e: logger.error(f"获取相似文章时出错: {e}") return {"error": f"获取相似文章时出错: {e}"}
- Helper function in relation tools that handles identifier conversion to DOI and invokes the similar articles service, used within get_literature_relations tool.def _get_similar_articles( identifier: str, id_type: str, max_results: int, sources: list[str], logger ) -> list[dict[str, Any]]: """获取相似文献""" try: # 确保有DOI标识符 if id_type != "doi": doi = _convert_to_doi(identifier, id_type, logger) if not doi: logger.warning(f"无法将 {id_type}:{identifier} 转换为DOI,无法获取相似文献") return [] else: doi = identifier logger.info(f"获取DOI {doi} 的相似文献") similar_articles = [] for source in sources: try: if source == "pubmed" and "pubmed" in _relation_services: # 使用现有的相似文献服务(基于PubMed E-utilities) logger.info(f"使用PubMed服务获取 {doi} 的相似文献") try: from src.article_mcp.services.similar_articles import get_similar_articles_by_doi result = get_similar_articles_by_doi(doi, max_results=max_results) if result.get("similar_articles"): pubmed_similar = result.get("similar_articles", []) similar_articles.extend(pubmed_similar) logger.info(f"PubMed返回 {len(pubmed_similar)} 篇相似文献") else: logger.warning(f"PubMed相似文献查询无结果") except ImportError: logger.error("无法导入similar_articles模块") except Exception as e: logger.warning(f"PubMed相似文献查询失败: {e}") elif source == "openalex" and "openalex" in _relation_services: # OpenAlex相似文献查询(第二阶段实现) service = _relation_services["openalex"] logger.info(f"使用OpenAlex查询 {doi} 的相似文献") # TODO: 实现OpenAlex相似文献API集成 logger.debug("OpenAlex相似文献功能待实现") elif source == "europe_pmc" and "europe_pmc" in _relation_services: # Europe PMC相似文献查询(第二阶段实现) service = _relation_services["europe_pmc"] logger.info(f"使用Europe PMC查询 {doi} 的相似文献") # TODO: 实现Europe PMC相似文献API集成 logger.debug("Europe PMC相似文献功能待实现") except Exception as e: logger.error(f"从 {source} 获取相似文献失败: {e}") # 去重和限制数量 unique_similar = _deduplicate_references(similar_articles, max_results) logger.info(f"相似文献去重后共 {len(unique_similar)} 篇") return unique_similar except Exception as e: logger.error(f"获取相似文献失败: {e}") return []
- Supporting parser for PubMed article XML data used in similar articles retrieval.def parse_pubmed_article(article_xml: ET.Element) -> dict[str, Any] | None: """解析PubMed文章XML元素""" if article_xml is None: return None pmid = None try: medline_citation = article_xml.find("./MedlineCitation") pubmed_data = article_xml.find("./PubmedData") if medline_citation is None: return None pmid = medline_citation.findtext("./PMID") article = medline_citation.find("./Article") if article is None or pmid is None: return None # 提取标题 title_element = article.find("./ArticleTitle") title = ( "".join(title_element.itertext()).strip() if title_element is not None else "未找到标题" ) # 提取作者 author_list = [] author_elements = article.findall("./AuthorList/Author") for author in author_elements: last_name = author.findtext("LastName") fore_name = author.findtext("ForeName") collective_name = author.findtext("CollectiveName") if collective_name: author_list.append(collective_name.strip()) elif last_name: name_parts = [] if fore_name: name_parts.append(fore_name.strip()) name_parts.append(last_name.strip()) author_list.append(" ".join(name_parts)) # 提取摘要 abstract_parts = [] abstract_elements = article.findall("./Abstract/AbstractText") if abstract_elements: for part in abstract_elements: label = part.get("Label") text = "".join(part.itertext()).strip() if label and text: abstract_parts.append(f"{label.upper()}: {text}") elif text: abstract_parts.append(text) abstract = "\n".join(abstract_parts) if abstract_parts else None # 提取PMCID pmcid = None pmcid_link = None if pubmed_data is not None: pmc_element = pubmed_data.find("./ArticleIdList/ArticleId[@IdType='pmc']") if pmc_element is not None and pmc_element.text: pmcid_raw = pmc_element.text.strip().upper() if pmcid_raw.startswith("PMC"): pmcid = pmcid_raw pmcid_link = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/" # 提取期刊名称 journal_title_raw = article.findtext("./Journal/Title") journal_name = None if journal_title_raw: journal_name = 
re.sub(r"\s*\(.*?\)\s*", "", journal_title_raw).strip() if not journal_name: journal_name = journal_title_raw.strip() # 提取发表日期 pub_date_element = article.find("./Journal/JournalIssue/PubDate") publication_date = None if pub_date_element is not None: year = pub_date_element.findtext("Year") if year and year.isdigit(): month = pub_date_element.findtext("Month", "01") day = pub_date_element.findtext("Day", "01") # 处理月份名称 if month in MONTH_MAP: month = MONTH_MAP[month] elif month.isdigit(): month = month.zfill(2) else: month = "01" day = day.zfill(2) if day.isdigit() else "01" publication_date = f"{year}-{month}-{day}" return { "title": title, "authors": author_list if author_list else None, "journal": journal_name, "publication_date": publication_date, "pmid": pmid, "pmid_link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/", "pmcid": pmcid, "pmcid_link": pmcid_link, "abstract": abstract, } except Exception as e: logger.error(f"解析文章 PMID {pmid or 'UNKNOWN'} 时出错: {e}") return None
- src/article_mcp/services/__init__.py:19-37 (registration)Exports the get_similar_articles_by_doi function for use across the codebase.from .similar_articles import get_similar_articles_by_doi __all__ = [ # 核心服务类 "EuropePMCService", "CrossRefService", "OpenAlexService", "UnifiedReferenceService", "LiteratureRelationService", # 服务创建函数 "create_europe_pmc_service", "create_pubmed_service", "create_reference_service", "create_literature_relation_service", "create_arxiv_service", # 工具函数 "search_arxiv", "get_similar_articles_by_doi", ]