#!/usr/bin/env python3
"""MCP Server for harcx - BibTeX citation verification."""
import os
import sys
import tempfile
import logging
from contextlib import redirect_stdout, redirect_stderr
from io import StringIO
# Quiet the "httpx" and "reference_checker" (harcx) loggers: with the level
# raised to CRITICAL, only CRITICAL records from them are still emitted.
# These calls sit ahead of the third-party imports below.
logging.getLogger("httpx").setLevel(logging.CRITICAL)
logging.getLogger("reference_checker").setLevel(logging.CRITICAL)
from mcp.server.fastmcp import FastMCP
from reference_checker import check_citations, check_web_citations
# Server instance named "harcx"; the @mcp.tool() decorators further down
# attach the verification functions to it.
mcp = FastMCP("harcx")
def _is_file_path(content: str) -> bool:
    """Return True when *content* should be treated as a path to a .bib file.

    Surrounding whitespace is ignored. Multi-line text is never a path.
    A single-line value counts as a path when it names something that
    exists on disk, or when it ends in ".bib" and contains no "@" (the
    character that introduces a BibTeX entry).

    Args:
        content: The raw ``bib_content`` value supplied by the caller.

    Returns:
        True if the value looks like a file path, False if it should be
        handled as inline BibTeX content.
    """
    candidate = content.strip()
    if "\n" in candidate:
        return False
    # Consult the filesystem before applying the "@" heuristic: a real path
    # may legitimately contain "@" (e.g. ".../user@host/refs.bib") and must
    # not be mistaken for inline BibTeX. os.path.exists() returns False
    # rather than raising for names the OS rejects.
    if os.path.exists(candidate):
        return True
    return "@" not in candidate and candidate.endswith(".bib")
def _get_bib_path(bib_content: str) -> tuple[str, bool]:
    """Resolve *bib_content* to a path on disk.

    When the value looks like a file path (per ``_is_file_path``) it is
    returned with surrounding whitespace removed. Otherwise the value is
    treated as inline BibTeX and written to a new temporary ".bib" file.

    Args:
        bib_content: Either a path to a .bib file or inline BibTeX text.

    Returns:
        tuple: (file_path, is_temp_file). When ``is_temp_file`` is True the
        caller is responsible for deleting ``file_path``.

    Raises:
        Exception: Whatever the temp-file write raises; the temporary file
            is removed before the exception propagates.
    """
    if _is_file_path(bib_content):
        # _is_file_path() judges the stripped value, so hand back that same
        # stripped value; otherwise "refs.bib\n" would be accepted as a path
        # here and then fail the caller's os.path.exists() check.
        return bib_content.strip(), False

    # Write inline content to a temp file.
    fd, temp_path = tempfile.mkstemp(suffix=".bib")
    try:
        with os.fdopen(fd, "w") as f:
            f.write(bib_content)
    except Exception:
        # The caller only learns about the temp file from the return value,
        # so it cannot clean up after a failed write; do it here.
        try:
            os.remove(temp_path)
        except OSError:
            pass  # best-effort cleanup; the original error is what matters
        raise
    return temp_path, True
def _format_citation_result(result) -> dict:
    """Format a single CheckResult into a clean dict.

    Args:
        result: Object exposing ``entry`` (with ``key``, ``title``,
            ``authors``, ``year``), ``found``, ``message``,
            ``author_match_score`` and ``matched_paper`` (a dict or a
            falsy value).

    Returns:
        Dict with "key", "title", "authors", "year", "found" and "message".
        "author_match_score" (rounded to 2 decimals) is added only when the
        score is positive. "matched_paper" (title, up to three author names,
        year) is added only when a matched paper is present.
    """
    entry = result.entry
    formatted = {
        "key": entry.key,
        "title": entry.title,
        "authors": entry.authors,
        "year": entry.year,
        "found": result.found,
        "message": result.message,
    }
    if result.author_match_score > 0:
        formatted["author_match_score"] = round(result.author_match_score, 2)

    paper = result.matched_paper
    if paper:
        # "authors" can be present but None; `or []` keeps the slice below
        # from raising TypeError in that case.
        paper_authors = paper.get("authors") or []
        formatted["matched_paper"] = {
            "title": paper.get("title"),
            # Authors may be dicts or plain values. .get() yields None for a
            # dict without "name" instead of raising KeyError, which would
            # otherwise abort formatting of the whole result set.
            "authors": [
                a.get("name") if isinstance(a, dict) else a
                for a in paper_authors[:3]
            ],
            "year": paper.get("year"),
        }
    return formatted
def _format_url_result(result) -> dict:
    """Flatten one WebCheckResult into a plain dict.

    The entry's key and title come first, followed by the URL-check fields
    read straight off *result*: "url", "reachable", "status_code" and
    "message".
    """
    entry = result.entry
    formatted = {"key": entry.key, "title": entry.title}
    # Remaining fields share their name between the result and the output.
    for field in ("url", "reachable", "status_code", "message"):
        formatted[field] = getattr(result, field)
    return formatted
def _format_results(results: list, is_url_check: bool = False) -> dict:
    """Build the structured response for a list of harcx results.

    Args:
        results: Results returned by harcx; a falsy value yields the
            "success" response with no issues.
        is_url_check: When true, each item is formatted with
            ``_format_url_result``; otherwise with
            ``_format_citation_result``.

    Returns:
        Dict with "status", "message", "issues" (one formatted dict per
        item) and "issues_found" (the number of formatted items).
    """
    if not results:
        return {
            "status": "success",
            "message": "All citations verified successfully",
            "issues": [],
            "issues_found": 0,
        }

    # Pick the per-item formatter once, then apply it to every result.
    format_one = _format_url_result if is_url_check else _format_citation_result
    issues = [format_one(item) for item in results]
    count = len(issues)

    if issues:
        status = "issues_found"
        message = f"Found {count} issue(s)"
    else:
        status = "success"
        message = "All citations verified"

    return {
        "status": status,
        "message": message,
        "issues": issues,
        "issues_found": count,
    }
@mcp.tool()
def verify_citations(
    bib_content: str,
    author_threshold: float = 0.6,
    check_urls: bool = False,
    api_key: str | None = None,
) -> dict:
    """
    Verify BibTeX citations for academic papers and books.

    Checks citations against Semantic Scholar and other sources to verify
    accuracy of titles, authors, publication years, and other metadata.

    Args:
        bib_content: Either a path to a .bib file OR inline BibTeX content
        author_threshold: Match tolerance for author names (0.0-1.0, default 0.6)
        check_urls: Also verify URLs in the citations (default False)
        api_key: Optional Semantic Scholar API key for higher rate limits

    Returns:
        Dictionary with verification results including any issues found
    """
    # NOTE: the docstring wording above is deliberately left as-is; FastMCP
    # uses a tool function's docstring as the description shown to clients.

    # Set only when this call wrote inline content to a temp file, so the
    # `finally` clause never deletes a file the caller supplied.
    cleanup_path = None
    try:
        path, created_here = _get_bib_path(bib_content)
        cleanup_path = path if created_here else None

        if not os.path.exists(path):
            return {
                "status": "error",
                "message": f"File not found: {path}",
                "issues": [],
            }

        # The API key is forwarded only when one was actually provided.
        options = {"author_threshold": author_threshold}
        if api_key:
            options["api_key"] = api_key

        # Anything harcx prints is captured and discarded: stray output on
        # stdout/stderr interferes with the MCP protocol.
        with redirect_stdout(StringIO()), redirect_stderr(StringIO()):
            citation_results = check_citations(path, **options)
        report = _format_results(citation_results)

        if not check_urls:
            return report

        # Optional second pass: URL reachability, merged into the same report.
        with redirect_stdout(StringIO()), redirect_stderr(StringIO()):
            web_results = check_web_citations(path)
        url_report = _format_results(web_results, is_url_check=True)
        url_issues = url_report["issues"]
        url_count = url_report["issues_found"]

        report["url_issues"] = url_issues
        report["url_issues_found"] = url_count
        if url_issues:
            report["status"] = "issues_found"
            report["message"] += f"; {url_count} URL issue(s)"
        return report
    except Exception as exc:
        # Tool boundary: report the failure as a structured error response
        # instead of letting the exception escape.
        return {
            "status": "error",
            "message": f"Error verifying citations: {str(exc)}",
            "issues": [],
        }
    finally:
        if cleanup_path and os.path.exists(cleanup_path):
            os.remove(cleanup_path)
@mcp.tool()
def verify_urls(
    bib_content: str,
) -> dict:
    """
    Verify URL citations in a BibTeX file.

    Checks that URLs in citations are accessible and return valid responses.

    Args:
        bib_content: Either a path to a .bib file OR inline BibTeX content

    Returns:
        Dictionary with URL verification results including any broken links
    """
    # NOTE: the docstring wording above is deliberately left as-is; FastMCP
    # uses a tool function's docstring as the description shown to clients.

    # Set only when this call wrote inline content to a temp file, so the
    # `finally` clause never deletes a file the caller supplied.
    cleanup_path = None
    try:
        path, created_here = _get_bib_path(bib_content)
        cleanup_path = path if created_here else None

        if not os.path.exists(path):
            return {
                "status": "error",
                "message": f"File not found: {path}",
                "issues": [],
            }

        # Anything harcx prints is captured and discarded: stray output on
        # stdout/stderr interferes with the MCP protocol.
        out_sink, err_sink = StringIO(), StringIO()
        with redirect_stdout(out_sink), redirect_stderr(err_sink):
            web_results = check_web_citations(path)

        return _format_results(web_results, is_url_check=True)
    except Exception as exc:
        # Tool boundary: report the failure as a structured error response
        # instead of letting the exception escape.
        return {
            "status": "error",
            "message": f"Error verifying URLs: {str(exc)}",
            "issues": [],
        }
    finally:
        if cleanup_path and os.path.exists(cleanup_path):
            os.remove(cleanup_path)
# Start the MCP server only when this file is executed as a script;
# importing the module does not call run().
if __name__ == "__main__":
    mcp.run()