Skip to main content
Glama

MCP-DBLP

dblp_client.py (21.5 kB)
"""Client for the public DBLP API.

Provides publication search (with boolean-OR support), author- and
title-based fuzzy matching, venue lookup, BibTeX retrieval, and simple
result statistics.
"""

import contextlib
import difflib
import logging
import re
from collections import Counter
from typing import Any

import requests

logger = logging.getLogger("dblp_client")

# Default timeout for all HTTP requests
REQUEST_TIMEOUT = 10  # seconds

# Headers for DBLP API requests
# DBLP recommends using an identifying User-Agent to avoid rate-limiting
# See: https://dblp.org/faq/1474706.html
HEADERS = {
    "User-Agent": "mcp-dblp/1.1.1 (https://github.com/szeider/mcp-dblp)",
    "Accept": "application/json",
}


def _fetch_publications(single_query: str, max_results: int) -> list[dict[str, Any]]:
    """Fetch publications for a single query string from the DBLP publ API.

    On timeout or any other request error, returns a one-element list
    containing an error placeholder record (venue == "Error") instead of
    raising, so callers can surface the failure in-band.
    """
    results: list[dict[str, Any]] = []
    try:
        url = "https://dblp.org/search/publ/api"
        params = {"q": single_query, "format": "json", "h": max_results}
        response = requests.get(url, params=params, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()
        hits = data.get("result", {}).get("hits", {})
        total = int(hits.get("@total", "0"))
        logger.info(f"Found {total} results for query: {single_query}")
        if total > 0:
            publications = hits.get("hit", [])
            # DBLP returns a bare dict (not a list) when there is a single hit.
            if not isinstance(publications, list):
                publications = [publications]
            for pub in publications:
                info = pub.get("info", {})
                authors = []
                authors_data = info.get("authors", {}).get("author", [])
                # Same single-item quirk applies to the author list.
                if not isinstance(authors_data, list):
                    authors_data = [authors_data]
                for author in authors_data:
                    if isinstance(author, dict):
                        authors.append(author.get("text", ""))
                    else:
                        authors.append(str(author))
                # Extract the proper DBLP URL or ID for BibTeX retrieval
                dblp_url = info.get("url", "")
                dblp_key = ""
                if dblp_url:
                    # Extract the key from the URL
                    # (e.g., https://dblp.org/rec/journals/jmlr/Chowdhery23 -> journals/jmlr/Chowdhery23)
                    dblp_key = dblp_url.replace("https://dblp.org/rec/", "")
                elif "key" in pub:
                    dblp_key = pub.get("key", "").replace("dblp:", "")
                else:
                    dblp_key = pub.get("@id", "").replace("dblp:", "")
                result = {
                    "title": info.get("title", ""),
                    "authors": authors,
                    "venue": info.get("venue", ""),
                    "year": int(info.get("year", 0)) if info.get("year") else None,
                    "type": info.get("type", ""),
                    "doi": info.get("doi", ""),
                    "ee": info.get("ee", ""),
                    "url": info.get("url", ""),
                    "dblp_key": dblp_key,  # Use more specific name for the DBLP key
                }
                results.append(result)
    except requests.exceptions.Timeout:
        logger.error(f"Timeout error searching DBLP after {REQUEST_TIMEOUT} seconds")
        # Provide timeout error information
        timeout_msg = f"ERROR: Query '{single_query}' timed out after {REQUEST_TIMEOUT} seconds"
        results.append(
            {
                "title": timeout_msg,
                "authors": [],
                "venue": "Error",
                "year": None,
                "error": f"Timeout after {REQUEST_TIMEOUT} seconds",
            }
        )
    except Exception as e:
        logger.error(f"Error searching DBLP: {e}")
        # Return error result instead of mock data
        error_msg = f"ERROR: DBLP API error for query '{single_query}': {str(e)}"
        results.append(
            {
                "title": error_msg,
                "authors": [],
                "venue": "Error",
                "year": None,
                "error": str(e),
            }
        )
    return results


def search(
    query: str,
    max_results: int = 10,
    year_from: int | None = None,
    year_to: int | None = None,
    venue_filter: str | None = None,
    include_bibtex: bool = False,
) -> list[dict[str, Any]]:
    """
    Search DBLP using their public API.

    Parameters:
      query (str): The search query string. A case-insensitive " or "
          splits the query into independent subqueries whose results
          are merged (deduplicated by title/year).
      max_results (int, optional): Maximum number of results to return. Default is 10.
      year_from (int, optional): Lower bound for publication year.
      year_to (int, optional): Upper bound for publication year.
      venue_filter (str, optional): Case-insensitive substring filter for publication venues.
      include_bibtex (bool, optional): Whether to include BibTeX entries in the results.
          Default is False.

    Returns:
      List[Dict[str, Any]]: A list of publication dictionaries.
    """
    if "(" in query or ")" in query:
        logger.warning(
            "Parentheses are not supported in boolean queries. "
            "They will be treated as literal characters."
        )
    results = []
    # FIX: split on the "or" connective case-insensitively while preserving
    # the caller's original casing in each subquery (previously the whole
    # query was lowercased before splitting).
    or_connective = re.compile(r"\s+or\s+", flags=re.IGNORECASE)
    subqueries = [q.strip() for q in or_connective.split(query) if q.strip()]
    if len(subqueries) > 1:
        seen = set()
        for q in subqueries:
            for pub in _fetch_publications(q, max_results):
                # Deduplicate across subqueries by (title, year).
                identifier = (pub.get("title"), pub.get("year"))
                if identifier not in seen:
                    results.append(pub)
                    seen.add(identifier)
    else:
        results = _fetch_publications(query, max_results)
    filtered_results = []
    for result in results:
        if year_from or year_to:
            year = result.get("year")
            if year:
                try:
                    year = int(year)
                    if (year_from and year < year_from) or (year_to and year > year_to):
                        continue
                except (ValueError, TypeError):
                    # Unparseable year: keep the record rather than drop it.
                    pass
        if venue_filter:
            venue = result.get("venue", "")
            if venue_filter.lower() not in venue.lower():
                continue
        filtered_results.append(result)
    if not filtered_results:
        logger.info("No results found. Consider revising your query syntax.")
    filtered_results = filtered_results[:max_results]
    # Fetch BibTeX entries if requested
    if include_bibtex:
        for result in filtered_results:
            if result.get("dblp_key"):
                result["bibtex"] = fetch_bibtex_entry(result["dblp_key"])
    return filtered_results


def get_author_publications(
    author_name: str,
    similarity_threshold: float,
    max_results: int = 20,
    include_bibtex: bool = False,
) -> dict[str, Any]:
    """
    Get publication information for a specific author with fuzzy matching.

    Parameters:
      author_name (str): Author name to search for.
      similarity_threshold (float): Threshold for fuzzy matching (0-1).
      max_results (int, optional): Maximum number of results to return. Default is 20.
      include_bibtex (bool, optional): Whether to include BibTeX entries. Default is False.

    Returns:
      Dict[str, Any]: Dictionary with author publication information, including
      per-venue/year/type counts under "stats".
    """
    logger.info(
        f"Getting publications for author: {author_name} with similarity threshold {similarity_threshold}"
    )
    author_query = f"author:{author_name}"
    # Over-fetch so fuzzy filtering still leaves enough candidates.
    publications = search(author_query, max_results=max_results * 2)
    filtered_publications = []
    for pub in publications:
        # Keep the publication if ANY listed author is similar enough.
        best_ratio = 0.0
        for candidate in pub.get("authors", []):
            ratio = difflib.SequenceMatcher(None, author_name.lower(), candidate.lower()).ratio()
            if ratio > best_ratio:
                best_ratio = ratio
        if best_ratio >= similarity_threshold:
            filtered_publications.append(pub)
    filtered_publications = filtered_publications[:max_results]
    # Fetch BibTeX entries if requested
    if include_bibtex:
        for pub in filtered_publications:
            if pub.get("dblp_key"):
                pub["bibtex"] = fetch_bibtex_entry(pub["dblp_key"])
    venues = Counter([p.get("venue", "") for p in filtered_publications])
    years = Counter([p.get("year", "") for p in filtered_publications])
    types = Counter([p.get("type", "") for p in filtered_publications])
    return {
        "name": author_name,
        "publication_count": len(filtered_publications),
        "publications": filtered_publications,
        "stats": {
            "venues": venues.most_common(5),
            "years": years.most_common(5),
            "types": dict(types),
        },
    }


def get_title_publications(
    title_query: str, similarity_threshold: float, max_results: int = 20
) -> list[dict[str, Any]]:
    """
    Retrieve publications whose titles fuzzy-match the given title_query.

    Minor variations or misspellings in publication titles therefore do not
    prevent relevant results from being returned.

    Parameters:
      title_query (str): The title string to search for.
      similarity_threshold (float): A compulsory threshold (0 <= threshold <= 1),
          where 1.0 means an exact match.
      max_results (int): Maximum number of matching publications to return (default is 20).

    Returns:
      List[Dict[str, Any]]: Publication dictionaries with a title similarity ratio
      greater than or equal to the threshold, sorted by similarity (highest first);
      each result carries its score in "title_similarity".
    """
    # Over-fetch so fuzzy filtering still leaves enough candidates.
    candidates = search(title_query, max_results=max_results * 2)
    filtered = []
    for pub in candidates:
        pub_title = pub.get("title", "")
        ratio = difflib.SequenceMatcher(None, title_query.lower(), pub_title.lower()).ratio()
        if ratio >= similarity_threshold:
            pub["title_similarity"] = ratio
            filtered.append(pub)
    filtered = sorted(filtered, key=lambda x: x["title_similarity"], reverse=True)
    return filtered[:max_results]


def fuzzy_title_search(
    title: str,
    similarity_threshold: float,
    max_results: int = 10,
    year_from: int | None = None,
    year_to: int | None = None,
    venue_filter: str | None = None,
    include_bibtex: bool = False,
) -> list[dict[str, Any]]:
    """
    Search DBLP for publications with fuzzy title matching.

    Uses multiple search strategies to improve recall:
    1. Search with "title:" prefix
    2. Search without prefix (broader matching)
    3. Calculate similarity scores and rank by best match

    Note: DBLP's search ranking may not prioritize the exact paper you're
    looking for. For best results, include author name or year in the title
    parameter (e.g., "Attention is All You Need Vaswani" or use the regular
    search() function).

    Parameters:
      title (str): Full or partial title of the publication (case-insensitive).
      similarity_threshold (float): A float between 0 and 1 where 1.0 means an exact match.
      max_results (int, optional): Maximum number of publications to return. Default is 10.
      year_from (int, optional): Lower bound for publication year.
      year_to (int, optional): Upper bound for publication year.
      venue_filter (str, optional): Case-insensitive substring filter for publication venues.
      include_bibtex (bool, optional): Whether to include BibTeX entries. Default is False.

    Returns:
      List[Dict[str, Any]]: Publication objects sorted by title similarity score
      (stored in each result under "similarity").
    """
    logger.info(f"Searching for title: '{title}' with similarity threshold {similarity_threshold}")
    candidates = []
    seen_titles = set()

    # Strategy 1: Search with title prefix
    title_query = f"title:{title}"
    results = search(
        title_query,
        max_results=max_results * 3,
        year_from=year_from,
        year_to=year_to,
        venue_filter=venue_filter,
    )
    for pub in results:
        t = pub.get("title", "")
        if t not in seen_titles:
            candidates.append(pub)
            seen_titles.add(t)

    # Strategy 2: Search without prefix
    results = search(
        title,
        max_results=max_results * 2,
        year_from=year_from,
        year_to=year_to,
        venue_filter=venue_filter,
    )
    for pub in results:
        t = pub.get("title", "")
        if t not in seen_titles:
            candidates.append(pub)
            seen_titles.add(t)

    # Calculate similarity scores
    filtered = []
    for pub in candidates:
        pub_title = pub.get("title", "")
        ratio = difflib.SequenceMatcher(None, title.lower(), pub_title.lower()).ratio()
        if ratio >= similarity_threshold:
            pub["similarity"] = ratio
            filtered.append(pub)

    # Sort by similarity score (highest first)
    filtered = sorted(filtered, key=lambda x: x.get("similarity", 0), reverse=True)
    filtered = filtered[:max_results]

    # Fetch BibTeX entries if requested
    if include_bibtex:
        for pub in filtered:
            if pub.get("dblp_key"):
                bibtex = fetch_bibtex_entry(pub["dblp_key"])
                if bibtex:
                    pub["bibtex"] = bibtex
    return filtered


def fetch_and_process_bibtex(url: str, new_key: str) -> str:
    """
    Fetch BibTeX from URL and replace the key with new_key.

    Parameters:
      url (str): URL to the BibTeX file
      new_key (str): New citation key to replace the original one

    Returns:
      str: BibTeX content with replaced citation key, or a "%"-commented
      error message on failure (never raises).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        bibtex = response.text
        # Replace the key in format @TYPE{KEY, ... -> @TYPE{new_key, ...
        bibtex = re.sub(r"@(\w+){([^,]+),", r"@\1{" + new_key + ",", bibtex, count=1)
        return bibtex
    except requests.exceptions.Timeout:
        logger.error(f"Timeout fetching {url} after {REQUEST_TIMEOUT} seconds")
        return f"% Error: Timeout fetching {url} after {REQUEST_TIMEOUT} seconds"
    except Exception as e:
        logger.error(f"Error fetching {url}: {str(e)}", exc_info=True)
        return f"% Error fetching {url}: {str(e)}"


def fetch_bibtex_entry(dblp_key: str) -> str:
    """
    Fetch BibTeX entry from DBLP by key.

    The entry's citation key is rewritten to an "AuthorYYYY"-style key when
    one can be derived from the DBLP key, otherwise to the key's last path
    segment.

    Parameters:
      dblp_key (str): DBLP publication key.

    Returns:
      str: BibTeX entry, or empty string if not found; a "%"-commented error
      message on timeout or unexpected failure.
    """
    try:
        # Make sure we have a valid key
        if not dblp_key or dblp_key.isspace():
            logger.warning("Empty or invalid DBLP key provided")
            return ""

        # Try multiple URL formats to increase chances of success.
        # FIX: the original appended the direct-key URL twice (its "format 2"
        # for keys containing "/" was identical to format 1), causing a
        # redundant HTTP request when the first attempt failed.
        urls_to_try = [f"https://dblp.org/rec/{dblp_key}.bib"]
        # If the key has a colon, it might be a DBLP-style key
        if ":" in dblp_key:
            clean_key = dblp_key.replace(":", "/")
            urls_to_try.append(f"https://dblp.org/rec/{clean_key}.bib")

        # Try each URL until one works
        for url in urls_to_try:
            logger.info(f"Fetching BibTeX from: {url}")
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            logger.info(f"Response status: {response.status_code}")
            if response.status_code == 200:
                bibtex = response.text
                if not bibtex or bibtex.isspace():
                    logger.warning(f"Received empty BibTeX content for URL: {url}")
                    continue
                logger.info(f"BibTeX content (first 100 chars): {bibtex[:100]}")
                # Extract the citation type and key
                # (e.g., @article{DBLP:journals/jmlr/Chowdhery23,)
                citation_key_match = re.match(r"@(\w+){([^,]+),", bibtex)
                if citation_key_match:
                    citation_type = citation_key_match.group(1)
                    old_key = citation_key_match.group(2)
                    logger.info(f"Found citation type: {citation_type}, key: {old_key}")
                    # Create a new key based on the first author's last name and year
                    # Try to extract author and year from the DBLP key or from the BibTeX content
                    author_year_match = re.search(r"([A-Z][a-z]+).*?(\d{2,4})", dblp_key)
                    if author_year_match:
                        author = author_year_match.group(1)
                        year = author_year_match.group(2)
                        if len(year) == 2:
                            # Convert 2-digit year to 4-digit (pivot at 50)
                            year = "20" + year if int(year) < 50 else "19" + year
                        new_key = f"{author}{year}"
                        logger.info(f"Generated new key: {new_key}")
                    else:
                        # If we can't extract from key, create a simpler key from the DBLP key
                        parts = dblp_key.split("/")
                        new_key = parts[-1] if parts else dblp_key
                        logger.info(f"Using fallback key: {new_key}")
                    # Replace the old key with the new key
                    bibtex = bibtex.replace(f"{{{old_key},", f"{{{new_key},", 1)
                    logger.info("Replaced old key with new key")
                    return bibtex
                else:
                    logger.warning(
                        f"Could not parse citation key pattern from BibTeX: {bibtex[:100]}..."
                    )
                    return bibtex  # Return the original if we couldn't parse it

        # If we've tried all URLs and none worked
        logger.warning(
            f"Failed to fetch BibTeX for key: {dblp_key} after trying multiple URL formats"
        )
        return ""
    except requests.exceptions.Timeout:
        logger.error(f"Timeout fetching BibTeX for {dblp_key} after {REQUEST_TIMEOUT} seconds")
        return f"% Error: Timeout fetching BibTeX for {dblp_key} after {REQUEST_TIMEOUT} seconds"
    except Exception as e:
        logger.error(f"Error fetching BibTeX for {dblp_key}: {str(e)}", exc_info=True)
        return (
            f"% Error: An unexpected error occurred while fetching BibTeX for {dblp_key}: {str(e)}"
        )


def get_venue_info(venue_name: str) -> dict[str, Any]:
    """
    Get information about a publication venue using DBLP venue search API.

    Returns venue name, acronym, type, and DBLP URL; all fields are empty
    strings when the venue is not found or the request fails.
    """
    logger.info(f"Getting information for venue: {venue_name}")
    try:
        url = "https://dblp.org/search/venue/api"
        params = {"q": venue_name, "format": "json", "h": 1}
        response = requests.get(url, params=params, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()
        hits = data.get("result", {}).get("hits", {})
        total = int(hits.get("@total", "0"))
        if total > 0:
            hit = hits.get("hit", [])
            # Single-hit responses may arrive as a bare dict.
            if isinstance(hit, list):
                hit = hit[0]
            info = hit.get("info", {})
            return {
                "venue": info.get("venue", ""),
                "acronym": info.get("acronym", ""),
                "type": info.get("type", ""),
                "url": info.get("url", ""),
            }
        else:
            logger.warning(f"No venue found for: {venue_name}")
            return {
                "venue": "",
                "acronym": "",
                "type": "",
                "url": "",
            }
    except Exception as e:
        logger.error(f"Error fetching venue info for {venue_name}: {str(e)}")
        return {
            "venue": "",
            "acronym": "",
            "type": "",
            "url": "",
        }


def calculate_statistics(results: list[dict[str, Any]]) -> dict[str, Any]:
    """
    Calculate statistics from publication results.

    Parameters:
      results: Publication dictionaries as returned by search().

    Returns:
      Dict with total_publications, min/max year under "time_range", and
      author/venue frequency lists sorted by count (descending).
    """
    logger.info(f"Calculating statistics for {len(results)} results")
    authors: Counter = Counter()
    venues: Counter = Counter()
    years: list[int] = []
    for result in results:
        for author in result.get("authors", []):
            authors[author] += 1
        venue = result.get("venue", "")
        # Handle venue as list or string
        if isinstance(venue, list):
            venue = ", ".join(venue) if venue else ""
        if venue:
            venues[venue] += 1
        else:
            venues["(empty)"] += 1
        year = result.get("year")
        if year:
            # Skip years that cannot be coerced to int.
            with contextlib.suppress(ValueError, TypeError):
                years.append(int(year))
    stats = {
        "total_publications": len(results),
        "time_range": {"min": min(years) if years else None, "max": max(years) if years else None},
        "top_authors": sorted(authors.items(), key=lambda x: x[1], reverse=True),
        "top_venues": sorted(venues.items(), key=lambda x: x[1], reverse=True),
    }
    return stats

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/szeider/mcp-dblp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.