get_similar_articles
Find similar academic articles based on a given DOI using PubMed's related articles algorithm. Filters results from the last 5 years and retrieves detailed metadata for research reviews, related studies, and scholarly analysis.
Instructions
根据DOI获取相似文章(基于PubMed相关文章算法)
功能说明:
基于PubMed的相关文章算法查找与给定DOI相似的文献
使用NCBI eLink服务查找相关文章
自动过滤最近5年内的文献
批量获取相关文章的详细信息
参数说明:
doi: 必需,数字对象标识符(如:"10.1126/science.adf6218")
email: 可选,联系邮箱,用于获得更高的API访问限制
max_results: 可选,返回的最大相似文章数量,默认20篇
返回值说明:
original_article: 原始文章信息
title: 文章标题
authors: 作者列表
journal: 期刊名称
publication_date: 发表日期
pmid: PubMed ID
pmcid: PMC ID(如果有)
abstract: 摘要
similar_articles: 相似文章列表(格式同原始文章)
total_similar_count: 总相似文章数量
retrieved_count: 实际获取的文章数量
message: 处理信息
error: 错误信息(如果有)
使用场景:
文献综述研究
寻找相关研究
学术调研
相关工作分析
技术特点:
基于PubMed官方相关文章算法
自动日期过滤(最近5年)
批量获取详细信息
完整的错误处理
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| doi | Yes | Digital Object Identifier, e.g. "10.1126/science.adf6218" | |
| email | No | Contact email used to obtain higher NCBI API rate limits | |
| max_results | No | Maximum number of similar articles to return | 20 |
Implementation Reference
- Core handler function that implements the logic to fetch similar articles by DOI using PubMed APIs (ESearch, ELink, EFetch). Parses XML responses and returns structured article data.def get_similar_articles_by_doi( doi: str, email: str = None, max_results: int = 20 ) -> dict[str, Any]: """根据DOI获取相似文章""" try: # 验证DOI if not doi or not doi.strip(): return { "original_article": None, "similar_articles": [], "total_similar_count": 0, "retrieved_count": 0, "error": "DOI不能为空", } if not email: email = "user@example.com" headers = {"User-Agent": f"{TOOL_NAME}/1.0 ({email})"} # 步骤1:通过DOI获取初始文章的PMID logger.info(f"正在为 DOI {doi} 搜索 PMID") esearch_params = { "db": "pubmed", "term": doi, "retmax": 1, "retmode": "xml", "email": email, "tool": TOOL_NAME, } response = requests.get( f"{NCBI_BASE_URL}esearch.fcgi", params=esearch_params, headers=headers ) response.raise_for_status() esearch_xml = ET.fromstring(response.content) ids = esearch_xml.findall(".//Id") if not ids: return { "original_article": None, "similar_articles": [], "total_similar_count": 0, "message": f"未找到 DOI: {doi} 对应的 PubMed 记录", } initial_pmid = ids[0].text logger.info(f"找到初始文章 PMID: {initial_pmid}") # 步骤2:获取初始文章详情 efetch_params = { "db": "pubmed", "id": initial_pmid, "rettype": "xml", "retmode": "xml", "email": email, "tool": TOOL_NAME, } response = requests.get( f"{NCBI_BASE_URL}efetch.fcgi", params=efetch_params, headers=headers ) response.raise_for_status() efetch_xml = ET.fromstring(response.content) original_article_xml = efetch_xml.find(".//PubmedArticle") original_article = parse_pubmed_article(original_article_xml) if not original_article: return { "original_article": None, "similar_articles": [], "total_similar_count": 0, "error": f"无法解析初始 PMID: {initial_pmid} 的文章信息", } # 步骤3:使用elink查找相关文章 elink_params = { "dbfrom": "pubmed", "db": "pubmed", "id": initial_pmid, "linkname": "pubmed_pubmed", "cmd": "neighbor_history", "email": email, "tool": TOOL_NAME, } response = 
requests.get(f"{NCBI_BASE_URL}elink.fcgi", params=elink_params, headers=headers) response.raise_for_status() elink_xml = ET.fromstring(response.content) webenv_elink = elink_xml.findtext(".//WebEnv") query_key_elink = elink_xml.findtext(".//LinkSetDbHistory/QueryKey") if not webenv_elink or not query_key_elink: return { "original_article": original_article, "similar_articles": [], "total_similar_count": 0, "message": "找到了原始文章,但未找到相关文章", } # 步骤4:使用日期过滤获取相关文章 today = datetime.now() five_years_ago = today - timedelta(days=5 * 365.25) min_date = five_years_ago.strftime("%Y/%m/%d") max_date = today.strftime("%Y/%m/%d") esearch_params2 = { "db": "pubmed", "query_key": query_key_elink, "WebEnv": webenv_elink, "retmax": str(max_results), "retmode": "xml", "datetype": "pdat", "mindate": min_date, "maxdate": max_date, "email": email, "tool": TOOL_NAME, "usehistory": "y", } response = requests.get( f"{NCBI_BASE_URL}esearch.fcgi", params=esearch_params2, headers=headers ) response.raise_for_status() esearch_xml2 = ET.fromstring(response.content) total_count = int(esearch_xml2.findtext(".//Count", "0")) webenv_filtered = esearch_xml2.findtext(".//WebEnv") query_key_filtered = esearch_xml2.findtext(".//QueryKey") if total_count == 0: return { "original_article": original_article, "similar_articles": [], "total_similar_count": 0, "message": "在最近5年内未找到相关文章", } # 步骤5:批量获取相关文章详情 similar_articles = [] actual_fetch_count = min(total_count, max_results) efetch_params_batch = { "db": "pubmed", "query_key": query_key_filtered, "WebEnv": webenv_filtered, "retstart": "0", "retmax": str(actual_fetch_count), "rettype": "xml", "retmode": "xml", "email": email, "tool": TOOL_NAME, } response = requests.get( f"{NCBI_BASE_URL}efetch.fcgi", params=efetch_params_batch, headers=headers ) response.raise_for_status() efetch_xml_batch = ET.fromstring(response.content) article_elements = efetch_xml_batch.findall(".//PubmedArticle") for article_xml in article_elements: article_details = 
parse_pubmed_article(article_xml) if article_details: similar_articles.append(article_details) logger.info(f"成功获取了 {len(similar_articles)} 篇相关文章") return { "original_article": original_article, "similar_articles": similar_articles, "total_similar_count": total_count, "retrieved_count": len(similar_articles), "message": f"成功找到并获取了 {len(similar_articles)} 篇相关文章", } except requests.exceptions.RequestException as e: logger.error(f"网络请求错误: {e}") return {"error": f"网络请求错误: {e}"} except ET.ParseError as e: logger.error(f"XML解析错误: {e}") return {"error": f"XML解析错误: {e}"} except Exception as e: logger.error(f"获取相似文章时出错: {e}") return {"error": f"获取相似文章时出错: {e}"}
- Helper function in relation tools that handles identifier conversion to DOI and invokes the similar articles service, used within get_literature_relations tool.def _get_similar_articles( identifier: str, id_type: str, max_results: int, sources: list[str], logger ) -> list[dict[str, Any]]: """获取相似文献""" try: # 确保有DOI标识符 if id_type != "doi": doi = _convert_to_doi(identifier, id_type, logger) if not doi: logger.warning(f"无法将 {id_type}:{identifier} 转换为DOI,无法获取相似文献") return [] else: doi = identifier logger.info(f"获取DOI {doi} 的相似文献") similar_articles = [] for source in sources: try: if source == "pubmed" and "pubmed" in _relation_services: # 使用现有的相似文献服务(基于PubMed E-utilities) logger.info(f"使用PubMed服务获取 {doi} 的相似文献") try: from src.article_mcp.services.similar_articles import get_similar_articles_by_doi result = get_similar_articles_by_doi(doi, max_results=max_results) if result.get("similar_articles"): pubmed_similar = result.get("similar_articles", []) similar_articles.extend(pubmed_similar) logger.info(f"PubMed返回 {len(pubmed_similar)} 篇相似文献") else: logger.warning(f"PubMed相似文献查询无结果") except ImportError: logger.error("无法导入similar_articles模块") except Exception as e: logger.warning(f"PubMed相似文献查询失败: {e}") elif source == "openalex" and "openalex" in _relation_services: # OpenAlex相似文献查询(第二阶段实现) service = _relation_services["openalex"] logger.info(f"使用OpenAlex查询 {doi} 的相似文献") # TODO: 实现OpenAlex相似文献API集成 logger.debug("OpenAlex相似文献功能待实现") elif source == "europe_pmc" and "europe_pmc" in _relation_services: # Europe PMC相似文献查询(第二阶段实现) service = _relation_services["europe_pmc"] logger.info(f"使用Europe PMC查询 {doi} 的相似文献") # TODO: 实现Europe PMC相似文献API集成 logger.debug("Europe PMC相似文献功能待实现") except Exception as e: logger.error(f"从 {source} 获取相似文献失败: {e}") # 去重和限制数量 unique_similar = _deduplicate_references(similar_articles, max_results) logger.info(f"相似文献去重后共 {len(unique_similar)} 篇") return unique_similar except Exception as e: logger.error(f"获取相似文献失败: {e}") return []
- Supporting parser for PubMed article XML data used in similar articles retrieval.def parse_pubmed_article(article_xml: ET.Element) -> dict[str, Any] | None: """解析PubMed文章XML元素""" if article_xml is None: return None pmid = None try: medline_citation = article_xml.find("./MedlineCitation") pubmed_data = article_xml.find("./PubmedData") if medline_citation is None: return None pmid = medline_citation.findtext("./PMID") article = medline_citation.find("./Article") if article is None or pmid is None: return None # 提取标题 title_element = article.find("./ArticleTitle") title = ( "".join(title_element.itertext()).strip() if title_element is not None else "未找到标题" ) # 提取作者 author_list = [] author_elements = article.findall("./AuthorList/Author") for author in author_elements: last_name = author.findtext("LastName") fore_name = author.findtext("ForeName") collective_name = author.findtext("CollectiveName") if collective_name: author_list.append(collective_name.strip()) elif last_name: name_parts = [] if fore_name: name_parts.append(fore_name.strip()) name_parts.append(last_name.strip()) author_list.append(" ".join(name_parts)) # 提取摘要 abstract_parts = [] abstract_elements = article.findall("./Abstract/AbstractText") if abstract_elements: for part in abstract_elements: label = part.get("Label") text = "".join(part.itertext()).strip() if label and text: abstract_parts.append(f"{label.upper()}: {text}") elif text: abstract_parts.append(text) abstract = "\n".join(abstract_parts) if abstract_parts else None # 提取PMCID pmcid = None pmcid_link = None if pubmed_data is not None: pmc_element = pubmed_data.find("./ArticleIdList/ArticleId[@IdType='pmc']") if pmc_element is not None and pmc_element.text: pmcid_raw = pmc_element.text.strip().upper() if pmcid_raw.startswith("PMC"): pmcid = pmcid_raw pmcid_link = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/" # 提取期刊名称 journal_title_raw = article.findtext("./Journal/Title") journal_name = None if journal_title_raw: journal_name = 
re.sub(r"\s*\(.*?\)\s*", "", journal_title_raw).strip() if not journal_name: journal_name = journal_title_raw.strip() # 提取发表日期 pub_date_element = article.find("./Journal/JournalIssue/PubDate") publication_date = None if pub_date_element is not None: year = pub_date_element.findtext("Year") if year and year.isdigit(): month = pub_date_element.findtext("Month", "01") day = pub_date_element.findtext("Day", "01") # 处理月份名称 if month in MONTH_MAP: month = MONTH_MAP[month] elif month.isdigit(): month = month.zfill(2) else: month = "01" day = day.zfill(2) if day.isdigit() else "01" publication_date = f"{year}-{month}-{day}" return { "title": title, "authors": author_list if author_list else None, "journal": journal_name, "publication_date": publication_date, "pmid": pmid, "pmid_link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/", "pmcid": pmcid, "pmcid_link": pmcid_link, "abstract": abstract, } except Exception as e: logger.error(f"解析文章 PMID {pmid or 'UNKNOWN'} 时出错: {e}") return None
- src/article_mcp/services/__init__.py:19-37 (registration)Exports the get_similar_articles_by_doi function for use across the codebase.from .similar_articles import get_similar_articles_by_doi __all__ = [ # 核心服务类 "EuropePMCService", "CrossRefService", "OpenAlexService", "UnifiedReferenceService", "LiteratureRelationService", # 服务创建函数 "create_europe_pmc_service", "create_pubmed_service", "create_reference_service", "create_literature_relation_service", "create_arxiv_service", # 工具函数 "search_arxiv", "get_similar_articles_by_doi", ]