"""
Research Tracker MCP Server - Core Inference Functions
Core inference logic for extracting research metadata from various inputs.
"""
import logging
from typing import List, Dict, Any, Optional
from urllib.parse import urlparse
import requests
import feedparser
from bs4 import BeautifulSoup
from config import ARXIV_API_BASE, HUGGINGFACE_API_BASE, GITHUB_AUTH
from utils import (
get_arxiv_id, is_valid_paper_url, select_best_github_repo,
extract_links_from_soup, scrape_huggingface_paper_page,
make_github_request, cached_request
)
logger = logging.getLogger(__name__)
def create_row_data(input_data: str) -> Dict[str, Any]:
"""Create standardized row data structure from input."""
row_data = {
"Name": None,
"Authors": [],
"Paper": None,
"Code": None,
"Project": None,
"Space": None,
"Model": None,
"Dataset": None,
"Orgs": [],
"License": None,
"Date": None,
}
# Classify input based on URL patterns
if input_data.startswith(("http://", "https://")):
if "arxiv.org" in input_data or "huggingface.co/papers" in input_data:
row_data["Paper"] = input_data
elif "github.com" in input_data:
row_data["Code"] = input_data
elif "github.io" in input_data:
row_data["Project"] = input_data
elif "huggingface.co/spaces" in input_data:
row_data["Space"] = input_data
elif "huggingface.co/datasets" in input_data:
row_data["Dataset"] = input_data
elif "huggingface.co/" in input_data:
row_data["Model"] = input_data
else:
row_data["Paper"] = input_data
else:
row_data["Name"] = input_data
return row_data
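
# Illustrative classification examples (hypothetical URLs, not taken from this
# project), restating the branches above:
#   create_row_data("https://arxiv.org/abs/2301.00001")            -> fills "Paper"
#   create_row_data("https://github.com/example-org/example-repo") -> fills "Code"
#   create_row_data("https://huggingface.co/spaces/org/demo")      -> fills "Space"
#   create_row_data("Some Research Title")                         -> fills "Name"
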
# Core inference functions
def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
"""Infer paper URL from row data"""
if row_data.get("Paper") is not None:
try:
url = urlparse(row_data["Paper"])
if url.scheme in ["http", "https"]:
# Convert arXiv PDF to abs format
if "arxiv.org/pdf/" in row_data["Paper"]:
new_url = row_data["Paper"].replace("/pdf/", "/abs/").replace(".pdf", "")
logger.info(f"Paper {new_url} inferred from {row_data['Paper']}")
return new_url
# If this is an arXiv URL, try HuggingFace papers first for better resource discovery
if "arxiv.org/abs/" in row_data["Paper"]:
arxiv_id = row_data["Paper"].split("arxiv.org/abs/")[1]
hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
try:
# Test if HuggingFace paper page exists and has content
response = cached_request(hf_paper_url)
if response and len(response.text) > 1000: # Basic check for content
logger.info(f"Paper {hf_paper_url} inferred from arXiv (HuggingFace preferred)")
return hf_paper_url
except Exception:
pass # Fall back to original arXiv URL
return row_data["Paper"]
except Exception:
pass
# Check if paper is in other fields
for field in ["Project", "Code", "Model", "Space", "Dataset", "Name"]:
if row_data.get(field) is not None:
if "arxiv" in row_data[field] or "huggingface.co/papers" in row_data[field]:
logger.info(f"Paper {row_data[field]} inferred from {field}")
return row_data[field]
# Try following project link and look for paper
if row_data.get("Project") is not None:
try:
response = cached_request(row_data["Project"])
if response:
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("a"):
href = link.get("href")
if href and is_valid_paper_url(href):
logger.info(f"Paper {href} inferred from Project")
return href
except Exception as e:
logger.debug(f"Failed to scrape project page: {e}")
# Try GitHub README parsing
if row_data.get("Code") is not None and "github.com" in row_data["Code"]:
try:
repo = row_data["Code"].split("github.com/")[1]
# First try with GitHub API if available
if GITHUB_AUTH:
readme_response = make_github_request(f"/repos/{repo}/readme")
if readme_response:
readme = readme_response.json()
if readme.get("type") == "file" and readme.get("download_url"):
response = cached_request(readme["download_url"])
if response:
soup = BeautifulSoup(response.text, "html.parser")
links = extract_links_from_soup(soup, response.text)
for link in links:
if link and is_valid_paper_url(link):
logger.info(f"Paper {link} inferred from Code (via GitHub API)")
return link
# Fallback: try scraping the GitHub page directly
try:
github_url = row_data["Code"]
response = cached_request(github_url)
if response:
soup = BeautifulSoup(response.text, "html.parser")
links = extract_links_from_soup(soup, response.text)
for link in links:
if link and is_valid_paper_url(link):
logger.info(f"Paper {link} inferred from Code (via GitHub scraping)")
return link
except Exception:
pass
except Exception:
pass
return None
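
# Example usage (hypothetical URL; requires network access):
#   row = create_row_data("https://arxiv.org/abs/2301.00001")
#   infer_paper_from_row(row)
#   # -> "https://huggingface.co/papers/2301.00001" when that page exists and has
#   #    content, otherwise the original arXiv URL.
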
def infer_name_from_row(row_data: Dict[str, Any]) -> Optional[str]:
"""Infer research name from row data"""
if row_data.get("Name") is not None:
return row_data["Name"]
    # Try to get the name from the arXiv API
if row_data.get("Paper") is not None:
arxiv_id = get_arxiv_id(row_data["Paper"])
if arxiv_id is not None:
try:
search_params = "id_list=" + arxiv_id
response = feedparser.parse(f"{ARXIV_API_BASE}?" + search_params)
if response.entries and len(response.entries) > 0:
entry = response.entries[0]
if hasattr(entry, "title"):
name = entry.title.strip()
logger.info(f"Name {name} inferred from Paper")
return name
except Exception:
pass
# Try to get from code repo
if row_data.get("Code") is not None and "github.com" in row_data["Code"]:
try:
repo = row_data["Code"].split("github.com/")[1]
name = repo.split("/")[1]
logger.info(f"Name {name} inferred from Code")
return name
except Exception:
pass
# Try to get from project page
if row_data.get("Project") is not None:
try:
r = requests.get(row_data["Project"], timeout=30)
soup = BeautifulSoup(r.text, "html.parser")
if soup.title is not None:
name = soup.title.string.strip()
logger.info(f"Name {name} inferred from Project")
return name
except Exception:
pass
return None
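
# Example (hypothetical): with Code="https://github.com/example-org/awesome-method"
# and no Paper, the repository segment "awesome-method" is returned as the name.
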
def infer_code_from_row(row_data: Dict[str, Any]) -> Optional[str]:
"""Infer code repository URL from row data"""
if row_data.get("Code") is not None:
try:
url = urlparse(row_data["Code"])
if url.scheme in ["http", "https"] and "github" in url.netloc:
return row_data["Code"]
except Exception:
pass
# Check if code is in other fields
for field in ["Project", "Paper", "Model", "Space", "Dataset", "Name"]:
if row_data.get(field) is not None:
try:
url = urlparse(row_data[field])
if url.scheme in ["http", "https"] and "github.com" in url.netloc:
logger.info(f"Code {row_data[field]} inferred from {field}")
return row_data[field]
except Exception:
pass
# Try to infer code from project page
if row_data.get("Project") is not None:
try:
r = requests.get(row_data["Project"], timeout=30)
soup = BeautifulSoup(r.text, "html.parser")
links = extract_links_from_soup(soup, r.text)
# Filter GitHub links
github_links = []
for link in links:
if link:
try:
url = urlparse(link)
if url.scheme in ["http", "https"] and "github.com" in url.netloc:
github_links.append(link)
except Exception:
pass
if github_links:
# Extract context keywords from the project page
context_keywords = []
if soup.title:
context_keywords.extend(soup.title.get_text().split())
# Use URL parts as context
project_url_parts = row_data["Project"].split('/')
context_keywords.extend([part for part in project_url_parts if part and len(part) > 2])
best_repo = select_best_github_repo(github_links, context_keywords)
if best_repo:
logger.info(f"Code {best_repo} inferred from Project")
return best_repo
except Exception:
pass
# Try scraping HuggingFace paper page for code links
if row_data.get("Paper") is not None:
arxiv_id = get_arxiv_id(row_data["Paper"])
# Try scraping HuggingFace paper page
if "huggingface.co/papers" in row_data["Paper"]:
resources = scrape_huggingface_paper_page(row_data["Paper"])
if resources["code"]:
code_url = resources["code"][0] # Take first code repo found
logger.info(f"Code {code_url} inferred from HuggingFace paper page")
return code_url
# If we have arXiv URL, try the HuggingFace version first
elif "arxiv.org/abs/" in row_data["Paper"] and arxiv_id:
hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
resources = scrape_huggingface_paper_page(hf_paper_url)
if resources["code"]:
code_url = resources["code"][0]
logger.info(f"Code {code_url} inferred from HuggingFace paper page (via arXiv)")
return code_url
# Fallback: Try GitHub search for papers
if row_data.get("Paper") is not None and GITHUB_AUTH:
arxiv_id = get_arxiv_id(row_data["Paper"])
if arxiv_id:
try:
search_endpoint = f"/search/repositories?q={arxiv_id}&sort=stars&order=desc"
search_response = make_github_request(search_endpoint)
if search_response:
search_results = search_response.json()
if "items" in search_results and len(search_results["items"]) > 0:
repo = search_results["items"][0]
repo_url = repo["html_url"]
logger.info(f"Code {repo_url} inferred from Paper (GitHub search)")
return repo_url
except Exception as e:
logger.warning(f"Failed to infer code from paper: {e}")
return None
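
# Example usage (hypothetical URL; requires network access, and GITHUB_AUTH for
# the search fallback):
#   row = create_row_data("https://huggingface.co/papers/2301.00001")
#   infer_code_from_row(row)
#   # -> first repository linked from the paper page, else the top-starred GitHub
#   #    search hit for the arXiv id, else None.
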
def infer_authors_from_row(row_data: Dict[str, Any]) -> List[str]:
"""Infer authors from row data"""
authors = row_data.get("Authors", [])
if not isinstance(authors, list):
authors = []
if row_data.get("Paper") is not None:
arxiv_id = get_arxiv_id(row_data["Paper"])
if arxiv_id is not None:
try:
search_params = "id_list=" + arxiv_id
response = feedparser.parse(f"{ARXIV_API_BASE}?" + search_params)
if response.entries and len(response.entries) > 0:
entry = response.entries[0]
if hasattr(entry, 'authors'):
api_authors = entry.authors
for author in api_authors:
if author is None or not hasattr(author, "name"):
continue
if author.name not in authors and author.name != "arXiv api core":
authors.append(author.name)
logger.info(f"Author {author.name} inferred from Paper")
except Exception as e:
logger.warning(f"Failed to fetch authors from arXiv: {e}")
return authors
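
# Example (hypothetical): for Paper="https://arxiv.org/abs/2301.00001" the result
# is a flat list of author name strings from the arXiv Atom feed, e.g.
# ["Jane Doe", "John Smith"], skipping duplicates and the "arXiv api core" entry.
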
def infer_date_from_row(row_data: Dict[str, Any]) -> Optional[str]:
"""Infer publication date from row data"""
if row_data.get("Paper") is not None:
arxiv_id = get_arxiv_id(row_data["Paper"])
if arxiv_id is not None:
try:
search_params = "id_list=" + arxiv_id
response = feedparser.parse(f"{ARXIV_API_BASE}?" + search_params)
if response.entries and len(response.entries) > 0:
entry = response.entries[0]
date = getattr(entry, "published", None) or getattr(entry, "updated", None)
if date is not None:
logger.info(f"Date {date} inferred from Paper")
return date
except Exception as e:
logger.warning(f"Failed to fetch date from arXiv: {e}")
return None
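
# Note: the returned value is passed through from the arXiv Atom feed, so it is
# typically an ISO-8601 timestamp such as "2023-01-02T18:00:00Z" (assumption
# about the feed format; no normalization is applied here).
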
def infer_model_from_row(row_data: Dict[str, Any]) -> Optional[str]:
"""Infer HuggingFace model from row data by scraping paper page"""
if row_data.get("Paper") is not None:
# Try scraping HuggingFace paper page
if "huggingface.co/papers" in row_data["Paper"]:
resources = scrape_huggingface_paper_page(row_data["Paper"])
if resources["models"]:
model_url = resources["models"][0] # Take first model found
logger.info(f"Model {model_url} inferred from HuggingFace paper page")
return model_url
# If we have arXiv URL, try the HuggingFace version
elif "arxiv.org/abs/" in row_data["Paper"]:
arxiv_id = get_arxiv_id(row_data["Paper"])
if arxiv_id:
hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
resources = scrape_huggingface_paper_page(hf_paper_url)
if resources["models"]:
model_url = resources["models"][0]
logger.info(f"Model {model_url} inferred from HuggingFace paper page (via arXiv)")
return model_url
return None
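
# infer_model_from_row, infer_dataset_from_row, and infer_space_from_row follow
# the same pattern: scrape_huggingface_paper_page returns lists under "models",
# "datasets", "spaces" (and "code"), and the first matching entry is returned.
# Example (hypothetical):
#   infer_model_from_row({"Paper": "https://huggingface.co/papers/2301.00001"})
#   # -> "https://huggingface.co/example-org/example-model" or None
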
def infer_dataset_from_row(row_data: Dict[str, Any]) -> Optional[str]:
"""Infer HuggingFace dataset from row data by scraping paper page"""
if row_data.get("Paper") is not None:
# Try scraping HuggingFace paper page
if "huggingface.co/papers" in row_data["Paper"]:
resources = scrape_huggingface_paper_page(row_data["Paper"])
if resources["datasets"]:
dataset_url = resources["datasets"][0] # Take first dataset found
logger.info(f"Dataset {dataset_url} inferred from HuggingFace paper page")
return dataset_url
# If we have arXiv URL, try the HuggingFace version
elif "arxiv.org/abs/" in row_data["Paper"]:
arxiv_id = get_arxiv_id(row_data["Paper"])
if arxiv_id:
hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
resources = scrape_huggingface_paper_page(hf_paper_url)
if resources["datasets"]:
dataset_url = resources["datasets"][0]
logger.info(f"Dataset {dataset_url} inferred from HuggingFace paper page (via arXiv)")
return dataset_url
return None
def infer_space_from_row(row_data: Dict[str, Any]) -> Optional[str]:
"""Infer HuggingFace space from row data by scraping paper page"""
if row_data.get("Paper") is not None:
# Try scraping HuggingFace paper page
if "huggingface.co/papers" in row_data["Paper"]:
resources = scrape_huggingface_paper_page(row_data["Paper"])
if resources["spaces"]:
space_url = resources["spaces"][0] # Take first space found
logger.info(f"Space {space_url} inferred from HuggingFace paper page")
return space_url
# If we have arXiv URL, try the HuggingFace version
elif "arxiv.org/abs/" in row_data["Paper"]:
arxiv_id = get_arxiv_id(row_data["Paper"])
if arxiv_id:
hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
resources = scrape_huggingface_paper_page(hf_paper_url)
if resources["spaces"]:
space_url = resources["spaces"][0]
logger.info(f"Space {space_url} inferred from HuggingFace paper page (via arXiv)")
return space_url
# Fallback: try to infer from model using HF API
if row_data.get("Model") is not None:
try:
model_id = row_data["Model"].split("huggingface.co/")[1]
url = f"{HUGGINGFACE_API_BASE}/spaces?models=" + model_id
r = requests.get(url, timeout=30)
if r.status_code == 200:
spaces = r.json()
if len(spaces) > 0:
space = spaces[0]["id"]
space_url = "https://huggingface.co/spaces/" + space
logger.info(f"Space {space} inferred from Model")
return space_url
except Exception as e:
logger.warning(f"Failed to infer space from model: {e}")
return None
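
# The model fallback queries the public Hub endpoint
# GET {HUGGINGFACE_API_BASE}/spaces?models=<model_id> (e.g.
# models=example-org/example-model, a hypothetical id), which lists Spaces that
# reference the model; the first hit is returned as a full URL.
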
def infer_license_from_row(row_data: Dict[str, Any]) -> Optional[str]:
"""Infer license information from row data"""
if row_data.get("Code") is not None and GITHUB_AUTH and "github.com" in row_data["Code"]:
try:
repo = row_data["Code"].split("github.com/")[1]
r = make_github_request(f"/repos/{repo}/license")
if r:
license_data = r.json()
if "license" in license_data and license_data["license"] is not None:
license_name = license_data["license"]["name"]
logger.info(f"License {license_name} inferred from Code")
return license_name
except Exception as e:
logger.warning(f"Failed to infer license from code: {e}")
return None
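
# Example (hypothetical): for Code="https://github.com/example-org/example-repo",
# GET /repos/example-org/example-repo/license yields the repository license
# object, and license["name"] is a human-readable name such as "MIT License".
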
def infer_field_type(value: str) -> str:
"""Classify the type of research-related URL or input"""
if value is None:
return "Unknown"
if "arxiv.org/" in value or "huggingface.co/papers" in value or ".pdf" in value:
return "Paper"
if "github.com" in value:
return "Code"
if "huggingface.co/spaces" in value:
return "Space"
if "huggingface.co/datasets" in value:
return "Dataset"
if "github.io" in value:
return "Project"
if "huggingface.co/" in value:
try:
path = value.split("huggingface.co/")[1]
path_parts = path.strip("/").split("/")
if len(path_parts) >= 2 and not path.startswith(("spaces/", "datasets/", "papers/")):
return "Model"
except (IndexError, AttributeError):
pass
return "Unknown"