"""文献综述生成工具"""
import json
from typing import Any

from pydantic import BaseModel
from fastmcp import FastMCP

from paperlib_mcp.tools.search import hybrid_search
from paperlib_mcp.db import query_one, query_all, get_db
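# The SQL in this module assumes Postgres tables roughly shaped as below. This
# is a sketch inferred from the queries here, not the authoritative schema;
# actual types, keys, and defaults may differ:
#
#   evidence_packs(pack_id serial, query text, params_json jsonb, created_at timestamptz)
#   evidence_pack_items(pack_id int, doc_id text, chunk_id int, rank int)
#   documents(doc_id, title, authors, year)
#   chunks(chunk_id, text, page_start, page_end)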
# Standard literature-review outlines for economics and finance
OUTLINE_TEMPLATES = {
    "econ_finance_canonical": {
        "name": "Canonical structure for economics and finance",
        "sections": [
            {
                "id": "research_question",
                "title": "Research Question and Theoretical Framework",
                "description": "Core research question, theoretical foundations, and main hypotheses",
                "keywords": ["theory", "hypothesis", "framework", "model", "prediction"],
            },
            {
                "id": "methodology",
                "title": "Methods and Identification Strategy",
                "description": "Empirical methods, causal identification, econometric models",
                "keywords": ["method", "identification", "strategy", "estimation", "regression", "instrumental", "difference-in-differences", "RDD"],
            },
            {
                "id": "data",
                "title": "Data and Variable Measurement",
                "description": "Data sources, sample selection, definitions of key variables",
                "keywords": ["data", "sample", "variable", "measure", "proxy", "definition"],
            },
            {
                "id": "findings",
                "title": "Main Findings",
                "description": "Core conclusions, robustness checks, heterogeneity analyses",
                "keywords": ["result", "finding", "evidence", "show", "demonstrate", "coefficient", "significant"],
            },
            {
                "id": "debates",
                "title": "Debates and Inconsistent Findings",
                "description": "Disagreements in the literature and methodological debates",
                "keywords": ["debate", "controversy", "inconsistent", "contrast", "however", "limitation"],
            },
            {
                "id": "gaps",
                "title": "Research Gaps and Future Directions",
                "description": "Open questions and potential research opportunities",
                "keywords": ["gap", "future", "direction", "unexplored", "opportunity", "need"],
            },
        ],
    },
    "general": {
        "name": "General literature review structure",
        "sections": [
            {
                "id": "background",
                "title": "Background and Motivation",
                "description": "Overview of the research area and why it matters",
                "keywords": ["background", "motivation", "importance", "context"],
            },
            {
                "id": "theory",
                "title": "Theoretical Foundations",
                "description": "Relevant theories and conceptual frameworks",
                "keywords": ["theory", "framework", "concept", "model"],
            },
            {
                "id": "methods",
                "title": "Research Methods",
                "description": "Main research methods and technical approaches",
                "keywords": ["method", "approach", "technique", "design"],
            },
            {
                "id": "findings",
                "title": "Main Findings",
                "description": "Key conclusions and supporting evidence",
                "keywords": ["result", "finding", "evidence", "conclusion"],
            },
            {
                "id": "future",
                "title": "Future Research Directions",
                "description": "Research gaps and potential opportunities",
                "keywords": ["future", "direction", "gap", "opportunity"],
            },
        ],
    },
}
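# Evidence chunks are routed to the section whose keyword list they match most
# often (see draft_lit_review_v1 below): e.g. a chunk mentioning
# "identification" and "difference-in-differences" scores 2 for "methodology"
# and is assigned there, while a chunk matching no keywords falls back to the
# template's first section.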
class LitReviewSection(BaseModel):
"""综述章节"""
section_id: str
title: str
content: str
citations: list[dict[str, Any]]
class LitReviewDraft(BaseModel):
"""综述草稿"""
topic: str
outline_style: str
pack_id: int | None
total_sources: int
unique_documents: int
sections: list[LitReviewSection]
all_citations: list[dict[str, Any]]
class EvidencePackItem(BaseModel):
"""证据包条目"""
doc_id: str
chunk_id: int
page_start: int
page_end: int
text: str
score: float
class EvidencePack(BaseModel):
"""证据包"""
pack_id: int
query: str
params: dict[str, Any]
items: list[EvidencePackItem]
stats: dict[str, Any]
def get_evidence_pack(pack_id: int) -> EvidencePack | None:
"""获取证据包内容
Args:
pack_id: 证据包 ID
Returns:
证据包对象,如果不存在返回 None
"""
    # Fetch pack metadata
pack = query_one(
"""
SELECT pack_id, query, params_json, created_at::text
FROM evidence_packs
WHERE pack_id = %s
""",
(pack_id,)
)
if not pack:
return None
    # Fetch pack items
items = query_all(
"""
SELECT
epi.doc_id,
epi.chunk_id,
epi.rank,
c.page_start,
c.page_end,
c.text
FROM evidence_pack_items epi
JOIN chunks c ON epi.chunk_id = c.chunk_id
WHERE epi.pack_id = %s
ORDER BY epi.rank
""",
(pack_id,)
)
    # Stats
unique_docs = len(set(item["doc_id"] for item in items))
return EvidencePack(
pack_id=pack["pack_id"],
query=pack["query"],
params=pack["params_json"] or {},
items=[
EvidencePackItem(
doc_id=item["doc_id"],
chunk_id=item["chunk_id"],
page_start=item["page_start"],
page_end=item["page_end"],
text=item["text"],
                score=1.0 / (item["rank"] + 1) if item["rank"] is not None else 0.5,  # rank-based pseudo-score: 1.0, 0.5, 0.33, ...
)
for item in items
],
stats={
"total_chunks": len(items),
"unique_docs": unique_docs,
}
)
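# Illustrative usage (the pack_id value is hypothetical):
#
#     pack = get_evidence_pack(42)
#     if pack is not None:
#         best = pack.items[0]  # rank 0 -> score 1.0, rank 1 -> 0.5, rank 2 -> 0.33, ...
#         print(pack.query, best.doc_id, best.page_start, best.page_end)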
def register_writing_tools(mcp: FastMCP) -> None:
"""注册写作工具"""
@mcp.tool()
async def build_evidence_pack(
query: str,
k: int = 40,
per_doc_limit: int = 3,
alpha: float = 0.6,
) -> dict[str, Any]:
"""构建证据包
搜索与主题相关的文献片段,并保存为可复用的证据包。
证据包可用于多次迭代综述写作,避免每次重新检索导致结果漂移。
Args:
query: 搜索主题/研究问题
k: 检索数量,默认 40
per_doc_limit: 每篇文档最多返回的 chunk 数量,默认 3
alpha: 向量搜索权重,默认 0.6
Returns:
证据包信息,包含 pack_id 和检索到的条目
"""
try:
            # Run the hybrid search
search_result = await hybrid_search(
query=query,
k=k,
alpha=alpha,
per_doc_limit=per_doc_limit,
)
if not search_result.results:
return {
"error": "No relevant literature found",
"query": query,
"pack_id": None,
}
            # Persist the evidence pack
params = {
"k": k,
"per_doc_limit": per_doc_limit,
"alpha": alpha,
}
with get_db() as conn:
with conn.cursor() as cur:
                    # Create the pack record
cur.execute(
"""
INSERT INTO evidence_packs (query, params_json)
VALUES (%s, %s)
RETURNING pack_id
""",
(query, json.dumps(params))
)
pack_result = cur.fetchone()
pack_id = pack_result["pack_id"]
                    # Insert pack items
for rank, result in enumerate(search_result.results):
cur.execute(
"""
INSERT INTO evidence_pack_items (pack_id, doc_id, chunk_id, rank)
VALUES (%s, %s, %s, %s)
""",
(pack_id, result.doc_id, result.chunk_id, rank)
)
            # Fetch document metadata
doc_ids = list(set(r.doc_id for r in search_result.results))
doc_metadata = {}
for doc_id in doc_ids:
doc = query_one(
"SELECT title, authors, year FROM documents WHERE doc_id = %s",
(doc_id,)
)
if doc:
doc_metadata[doc_id] = doc
            # Build the response
items = []
for result in search_result.results:
meta = doc_metadata.get(result.doc_id, {})
items.append({
"doc_id": result.doc_id,
"chunk_id": result.chunk_id,
"page_start": result.page_start,
"page_end": result.page_end,
"text": result.snippet,
"score": result.score_total,
"title": meta.get("title"),
"authors": meta.get("authors"),
"year": meta.get("year"),
})
return {
"pack_id": pack_id,
"query": query,
"params": params,
"items": items,
"stats": {
"total_chunks": len(items),
"unique_docs": len(doc_ids),
},
}
except Exception as e:
return {
"error": str(e),
"query": query,
"pack_id": None,
}
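    # Typical workflow sketch (tool calls as an MCP client might issue them;
    # the returned pack_id value here is hypothetical):
    #
    #     build_evidence_pack(query="CEO overconfidence and investment", k=40)
    #         -> {"pack_id": 7, "items": [...], "stats": {...}}
    #     draft_lit_review_v1(pack_id=7, outline_style="econ_finance_canonical")
    #         -> draft built from the saved pack, with no fresh retrieval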
@mcp.tool()
def get_evidence_pack_info(pack_id: int) -> dict[str, Any]:
"""获取证据包详情
查看已保存的证据包内容和统计信息。
Args:
pack_id: 证据包 ID
Returns:
证据包详情
"""
try:
pack = get_evidence_pack(pack_id)
if not pack:
return {
"error": f"Evidence pack not found: {pack_id}",
"pack_id": pack_id,
}
            # Fetch document metadata
doc_ids = list(set(item.doc_id for item in pack.items))
doc_metadata = {}
for doc_id in doc_ids:
doc = query_one(
"SELECT title, authors, year FROM documents WHERE doc_id = %s",
(doc_id,)
)
if doc:
doc_metadata[doc_id] = doc
items_with_meta = []
for item in pack.items:
meta = doc_metadata.get(item.doc_id, {})
text = item.text
snippet = text[:200] + "..." if len(text) > 200 else text
items_with_meta.append({
"doc_id": item.doc_id,
"chunk_id": item.chunk_id,
"page_start": item.page_start,
"page_end": item.page_end,
"snippet": snippet,
"title": meta.get("title"),
"authors": meta.get("authors"),
"year": meta.get("year"),
})
return {
"pack_id": pack.pack_id,
"query": pack.query,
"params": pack.params,
"items": items_with_meta,
"stats": pack.stats,
}
except Exception as e:
return {
"error": str(e),
"pack_id": pack_id,
}
@mcp.tool()
def list_evidence_packs(limit: int = 20, offset: int = 0) -> dict[str, Any]:
"""列出所有证据包
查看已保存的证据包列表。
Args:
limit: 返回数量限制,默认 20
offset: 分页偏移量,默认 0
Returns:
证据包列表
"""
try:
packs = query_all(
"""
SELECT
ep.pack_id,
ep.query,
ep.created_at::text,
                    COUNT(epi.chunk_id) as item_count,
COUNT(DISTINCT epi.doc_id) as doc_count
FROM evidence_packs ep
LEFT JOIN evidence_pack_items epi ON ep.pack_id = epi.pack_id
GROUP BY ep.pack_id
ORDER BY ep.created_at DESC
LIMIT %s OFFSET %s
""",
(limit, offset)
)
total = query_one("SELECT COUNT(*) as count FROM evidence_packs")
return {
"total": total["count"] if total else 0,
"limit": limit,
"offset": offset,
"packs": [
{
"pack_id": p["pack_id"],
"query": p["query"],
"created_at": p["created_at"],
"item_count": p["item_count"],
"doc_count": p["doc_count"],
}
for p in packs
],
}
except Exception as e:
return {
"error": str(e),
"total": 0,
"packs": [],
}
@mcp.tool()
async def draft_lit_review_v1(
topic: str | None = None,
pack_id: int | None = None,
k: int = 30,
outline_style: str = "econ_finance_canonical",
) -> dict[str, Any]:
"""生成文献综述草稿
基于指定主题或已有证据包,按照学术标准结构组织成综述草稿。
Args:
topic: 综述主题/研究问题(如果提供 pack_id 则可选)
pack_id: 已有证据包 ID(如果提供则直接使用,不重新检索)
k: 检索的相关 chunk 数量(仅当未提供 pack_id 时使用),默认 30
outline_style: 大纲样式,可选 "econ_finance_canonical"(经济金融)或 "general"(通用)
Returns:
综述草稿,包含:
- sections: 按结构组织的章节列表
- all_citations: 所有引用的文献信息
- total_sources: 引用的文献总数
"""
try:
            # 1. Determine the evidence source
evidence_items = []
used_pack_id = None
actual_topic = topic
if pack_id:
                # Use the existing evidence pack
pack = get_evidence_pack(pack_id)
if not pack:
return {
"error": f"Evidence pack not found: {pack_id}",
"pack_id": pack_id,
}
used_pack_id = pack_id
actual_topic = topic or pack.query
                # Convert to a uniform format
for item in pack.items:
evidence_items.append({
"doc_id": item.doc_id,
"chunk_id": item.chunk_id,
"page_start": item.page_start,
"page_end": item.page_end,
"text": item.text,
"score": item.score,
})
else:
if not topic:
return {
"error": "Must provide either topic or pack_id",
}
                # Run a fresh search
search_result = await hybrid_search(topic, k=k, alpha=0.6, per_doc_limit=3)
if not search_result.results:
return {
"error": "No relevant literature found for the topic",
"topic": topic,
"sections": [],
"all_citations": [],
}
for result in search_result.results:
                    # Fetch the full chunk text
chunk = query_one(
"SELECT text FROM chunks WHERE chunk_id = %s",
(result.chunk_id,)
)
evidence_items.append({
"doc_id": result.doc_id,
"chunk_id": result.chunk_id,
"page_start": result.page_start,
"page_end": result.page_end,
"text": chunk["text"] if chunk else result.snippet,
"score": result.score_total,
})
            # 2. Pick the outline template
template = OUTLINE_TEMPLATES.get(outline_style, OUTLINE_TEMPLATES["general"])
            # 3. Fetch document metadata
doc_ids = list(set(item["doc_id"] for item in evidence_items))
doc_metadata: dict[str, dict] = {}
for doc_id in doc_ids:
doc = query_one(
"SELECT doc_id, title, authors, year FROM documents WHERE doc_id = %s",
(doc_id,)
)
if doc:
doc_metadata[doc_id] = {
"doc_id": doc["doc_id"],
"title": doc["title"] or "Untitled",
"authors": doc["authors"] or "Unknown",
"year": doc["year"],
}
            # 4. Assign evidence to sections (by keyword matching)
section_evidence: dict[str, list] = {s["id"]: [] for s in template["sections"]}
for item in evidence_items:
text_lower = item["text"].lower()
best_section = None
best_score = 0
for section in template["sections"]:
                    # Keyword match score
keywords = section.get("keywords", [])
match_count = sum(1 for kw in keywords if kw.lower() in text_lower)
if match_count > best_score:
best_score = match_count
best_section = section["id"]
                # If nothing matches, fall back to the first section
if not best_section:
best_section = template["sections"][0]["id"]
section_evidence[best_section].append(item)
            # 5. Generate content for each section
sections = []
all_citations = []
for section_template in template["sections"]:
section_id = section_template["id"]
section_items = section_evidence.get(section_id, [])
                # Sort by score
section_items.sort(key=lambda x: x["score"], reverse=True)
                # Build section content
content_parts = []
section_citations = []
content_parts.append(f"**{section_template['description']}**\n")
                for item in section_items[:10]:  # at most 10 items per section
doc_id = item["doc_id"]
meta = doc_metadata.get(doc_id, {"title": "Unknown", "authors": "Unknown", "year": None})
                    # Record citation info
citation = {
"doc_id": doc_id,
"title": meta["title"],
"authors": meta["authors"],
"year": meta["year"],
"page_start": item["page_start"],
"page_end": item["page_end"],
"chunk_id": item["chunk_id"],
}
section_citations.append(citation)
                    # Format the inline citation marker, e.g. [Smith et al., 2020: p.3-5]
year_str = str(meta["year"]) if meta["year"] else "n.d."
cite_key = f"[{meta['authors']}, {year_str}: p.{item['page_start']}-{item['page_end']}]"
                    # Build a snippet
text = item["text"]
snippet = text[:300] + "..." if len(text) > 300 else text
content_parts.append(f"- {snippet} {cite_key}")
if not section_items:
content_parts.append("(暂无相关内容)")
sections.append(LitReviewSection(
section_id=section_id,
title=section_template["title"],
content="\n\n".join(content_parts),
citations=section_citations,
))
all_citations.extend(section_citations)
            # 6. Deduplicate the citation list
unique_citations = []
seen_docs = set()
for cite in all_citations:
if cite["doc_id"] not in seen_docs:
seen_docs.add(cite["doc_id"])
unique_citations.append({
"doc_id": cite["doc_id"],
"title": cite["title"],
"authors": cite["authors"],
"year": cite["year"],
})
return LitReviewDraft(
topic=actual_topic,
outline_style=outline_style,
pack_id=used_pack_id,
total_sources=len(evidence_items),
unique_documents=len(unique_citations),
sections=sections,
all_citations=unique_citations,
).model_dump()
except Exception as e:
return {
"error": str(e),
"topic": topic,
"sections": [],
"all_citations": [],
}
@mcp.tool()
def draft_section(
pack_id: int,
section: str,
outline_style: str = "econ_finance_canonical",
) -> dict[str, Any]:
"""生成综述特定章节
基于证据包,只生成指定章节的内容。适合迭代写作某个特定部分。
Args:
pack_id: 证据包 ID
section: 章节类型,如 "methodology"、"findings"、"gaps" 等
outline_style: 大纲样式,默认 "econ_finance_canonical"
Returns:
章节内容和引用列表
"""
try:
            # Fetch the evidence pack
pack = get_evidence_pack(pack_id)
if not pack:
return {
"error": f"Evidence pack not found: {pack_id}",
"pack_id": pack_id,
}
            # Pick the outline template
template = OUTLINE_TEMPLATES.get(outline_style, OUTLINE_TEMPLATES["general"])
            # Find the requested section
section_template = None
for s in template["sections"]:
if s["id"] == section:
section_template = s
break
if not section_template:
available_sections = [s["id"] for s in template["sections"]]
return {
"error": f"Section '{section}' not found. Available: {available_sections}",
"pack_id": pack_id,
"section": section,
}
            # Fetch document metadata
doc_ids = list(set(item.doc_id for item in pack.items))
doc_metadata: dict[str, dict] = {}
for doc_id in doc_ids:
doc = query_one(
"SELECT doc_id, title, authors, year FROM documents WHERE doc_id = %s",
(doc_id,)
)
if doc:
doc_metadata[doc_id] = {
"doc_id": doc["doc_id"],
"title": doc["title"] or "Untitled",
"authors": doc["authors"] or "Unknown",
"year": doc["year"],
}
            # Filter evidence relevant to this section
keywords = section_template.get("keywords", [])
relevant_items = []
for item in pack.items:
text_lower = item.text.lower()
match_count = sum(1 for kw in keywords if kw.lower() in text_lower)
if match_count > 0:
relevant_items.append((item, match_count))
            # Sort by match count
relevant_items.sort(key=lambda x: x[1], reverse=True)
            # Build section content
content_parts = []
citations = []
content_parts.append(f"# {section_template['title']}\n")
content_parts.append(f"**{section_template['description']}**\n")
            for item, match_count in relevant_items[:15]:  # at most 15 items
meta = doc_metadata.get(item.doc_id, {"title": "Unknown", "authors": "Unknown", "year": None})
citation = {
"doc_id": item.doc_id,
"title": meta["title"],
"authors": meta["authors"],
"year": meta["year"],
"page_start": item.page_start,
"page_end": item.page_end,
"chunk_id": item.chunk_id,
"relevance": match_count,
}
citations.append(citation)
year_str = str(meta["year"]) if meta["year"] else "n.d."
cite_key = f"[{meta['authors']}, {year_str}: p.{item.page_start}-{item.page_end}]"
text = item.text
snippet = text[:400] + "..." if len(text) > 400 else text
content_parts.append(f"- {snippet} {cite_key}")
if not relevant_items:
content_parts.append("(该章节暂无匹配的相关内容)")
            # Deduplicate citations
unique_citations = []
seen_docs = set()
for cite in citations:
if cite["doc_id"] not in seen_docs:
seen_docs.add(cite["doc_id"])
unique_citations.append(cite)
return {
"pack_id": pack_id,
"section_id": section,
"title": section_template["title"],
"content": "\n\n".join(content_parts),
"citations": citations,
"unique_documents": len(unique_citations),
"total_evidence": len(relevant_items),
}
except Exception as e:
return {
"error": str(e),
"pack_id": pack_id,
"section": section,
}
@mcp.tool()
def get_outline_templates() -> dict[str, Any]:
"""获取可用的综述大纲模板
返回所有支持的文献综述结构模板。
Returns:
模板列表,每个包含名称和章节结构
"""
return {
"templates": [
{
"id": key,
"name": template["name"],
"sections": [
{
"id": s["id"],
"title": s["title"],
"description": s["description"],
}
for s in template["sections"]
],
}
for key, template in OUTLINE_TEMPLATES.items()
]
}
@mcp.tool()
async def collect_evidence(
topic: str,
section_focus: str | None = None,
k: int = 20,
) -> dict[str, Any]:
"""收集特定主题的文献证据
搜索与主题相关的文献片段,可选择聚焦于特定章节类型。
Args:
topic: 搜索主题
section_focus: 聚焦的章节类型(如 "methodology", "findings")
k: 返回结果数量
Returns:
按文献聚合的证据列表
"""
try:
            # If a section focus is given, expand the query
query = topic
if section_focus:
focus_keywords = {
"methodology": "method approach model estimation identification",
"findings": "result finding evidence show demonstrate",
"theory": "theory framework hypothesis prediction",
"data": "data sample variable measure",
}
if section_focus in focus_keywords:
query = f"{topic} {focus_keywords[section_focus]}"
            # Search
search_result = await hybrid_search(query, k=k, alpha=0.6, per_doc_limit=5)
            # Group by document
evidence_by_doc: dict[str, dict] = {}
for result in search_result.results:
doc_id = result.doc_id
if doc_id not in evidence_by_doc:
                    # Fetch document info
doc = query_one(
"SELECT title, authors, year FROM documents WHERE doc_id = %s",
(doc_id,)
)
evidence_by_doc[doc_id] = {
"doc_id": doc_id,
"title": doc["title"] if doc else "Unknown",
"authors": doc["authors"] if doc else "Unknown",
"year": doc["year"] if doc else None,
"evidence": [],
}
evidence_by_doc[doc_id]["evidence"].append({
"chunk_id": result.chunk_id,
"page_start": result.page_start,
"page_end": result.page_end,
"text": result.snippet,
"relevance_score": result.score_total,
})
            # Sort documents by amount of evidence
sorted_evidence = sorted(
evidence_by_doc.values(),
key=lambda x: len(x["evidence"]),
reverse=True
)
return {
"topic": topic,
"section_focus": section_focus,
"total_chunks": len(search_result.results),
"unique_documents": len(sorted_evidence),
"evidence": sorted_evidence,
}
except Exception as e:
return {
"error": str(e),
"topic": topic,
"evidence": [],
}
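# Minimal wiring sketch. It assumes the package exposes a FastMCP entry point
# elsewhere; the server name and module path are illustrative, not confirmed:
#
#     from fastmcp import FastMCP
#     from paperlib_mcp.tools.writing import register_writing_tools
#
#     mcp = FastMCP("paperlib")
#     register_writing_tools(mcp)
#     mcp.run()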