openalex.py
import requests
from typing import Any, Dict, Optional
from urllib.parse import urlencode

from utils import sanitize_api_queries


def fetch_openalex_papers(
    query: Optional[str] = None,
    author: Optional[str] = None,
    title: Optional[str] = None,
    publisher: Optional[str] = None,
    institution: Optional[str] = None,
    concepts: Optional[str] = None,
    date_published_gte: Optional[str] = None,
    max_results: int = 20,
    page: int = 1,
) -> Dict[str, Any]:
    """
    Fetch papers from the OpenAlex API using various search parameters.

    Args:
        query: General search query (full-text search)
        author: Author name to search for
        title: Title keywords to search for
        publisher: Publisher name to search for
        institution: Institution name to search for
        concepts: Concepts to filter by (e.g., 'computer science', 'artificial intelligence')
        date_published_gte: Published date greater than or equal to (YYYY-MM-DD)
        max_results: Maximum number of results to return (default 20, max 200)
        page: Page number for pagination (default 1)

    Returns:
        Dictionary containing papers data from the OpenAlex API
    """
    base_url = "https://api.openalex.org/works"

    filters = {}
    if query:
        filters["search"] = sanitize_api_queries(query, max_length=500)

    # Build the comma-separated key:value clauses for OpenAlex's "filter" parameter
    filter_clauses = []
    if author:
        filter_clauses.append(
            f"authors.author_name.search:{sanitize_api_queries(author, max_length=200)}"
        )
    if title:
        filter_clauses.append(
            f"title.search:{sanitize_api_queries(title, max_length=500)}"
        )
    if publisher:
        filter_clauses.append(
            f"publisher.search:{sanitize_api_queries(publisher, max_length=200)}"
        )
    if institution:
        filter_clauses.append(
            f"institutions.institution_name.search:"
            f"{sanitize_api_queries(institution, max_length=200)}"
        )
    if concepts:
        # OpenAlex concepts can be tricky; a simple display-name search works for now
        filter_clauses.append(
            f"concepts.display_name.search:{sanitize_api_queries(concepts, max_length=200)}"
        )
    if date_published_gte:
        # from_publication_date is inclusive (>=), matching the parameter name
        filter_clauses.append(f"from_publication_date:{date_published_gte}")
    if filter_clauses:
        filters["filter"] = ",".join(filter_clauses)

    # Add pagination and results limit; OpenAlex caps results at 200 per page
    # and expects the hyphenated "per-page" query parameter
    per_page = min(max_results, 200)
    filters["per-page"] = per_page
    filters["page"] = page

    try:
        # Allow colons and commas through unescaped so the key:value,key:value
        # filter syntax survives URL encoding
        query_string = urlencode(filters, safe=":,")
        url = f"{base_url}?{query_string}"

        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        papers = []
        for result in data.get("results", []):
            paper = _parse_openalex_work(result)
            papers.append(paper)

        return {
            "data": papers,
            "meta": {
                "total_results": data.get("meta", {}).get("count", 0),
                "page": page,
                "per_page": per_page,
                "search_query": query,  # Only include the general query for simplicity
            },
            "links": data.get("meta", {}).get("next_page", ""),
        }
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Request failed: {e}") from e


def _parse_openalex_work(work_data: Dict[str, Any]) -> Dict[str, Any]:
    """Parse a single OpenAlex work entry."""
    # Extract authors
    authors = []
    for authorship in work_data.get("authorships", []):
        author = authorship.get("author", {})
        if author and author.get("display_name"):
            authors.append(author["display_name"])

    # Extract concepts
    concepts = []
    for concept in work_data.get("concepts", []):
        if concept.get("display_name"):
            concepts.append(concept["display_name"])

    # Extract a PDF URL from the primary location, falling back to the landing
    # page, then to any other location that exposes a PDF
    pdf_url = ""
    primary_location = work_data.get("primary_location") or {}
    if primary_location and primary_location.get("pdf_url"):
        pdf_url = primary_location["pdf_url"]
    elif primary_location and primary_location.get("landing_page_url"):
        pdf_url = primary_location.get("landing_page_url", "")
    else:
        # Check all locations for a PDF URL if the primary doesn't have one
        for location in work_data.get("locations", []):
            if location.get("pdf_url"):
                pdf_url = location["pdf_url"]
                break

    # Extract the abstract from the inverted index
    abstract = ""
    abstract_inverted_index = work_data.get("abstract_inverted_index", {})
    if abstract_inverted_index:
        abstract = _reconstruct_abstract_from_inverted_index(abstract_inverted_index)

    # Extract the short OpenAlex ID from its URL form
    openalex_id = work_data.get("id", "")
    if openalex_id.startswith("https://openalex.org/"):
        openalex_id = openalex_id.replace("https://openalex.org/", "")

    # Get primary location source info
    primary_source = ""
    if primary_location and primary_location.get("source"):
        source = primary_location.get("source") or {}
        primary_source = source.get("display_name", "")

    return {
        "id": openalex_id,
        "doi": work_data.get("doi", ""),
        "title": work_data.get("title", "") or work_data.get("display_name", ""),
        "abstract": abstract,
        "authors": authors,
        "publication_date": work_data.get("publication_date", ""),
        "publication_year": work_data.get("publication_year"),
        "cited_by_count": work_data.get("cited_by_count", 0),
        "concepts": concepts,
        "primary_location_url": primary_location.get("landing_page_url", ""),
        "primary_source": primary_source,
        "pdf_url": pdf_url,
        "open_access_status": (work_data.get("open_access") or {}).get("oa_status", "closed"),
        "is_open_access": primary_location.get("is_oa", False),
        "type": work_data.get("type", ""),
        "relevance_score": work_data.get("relevance_score", 0),
    }


def _reconstruct_abstract_from_inverted_index(inverted_index: Dict[str, Any]) -> str:
    """Reconstruct abstract text from OpenAlex's inverted index format."""
    if not inverted_index:
        return ""

    try:
        # Pair each word with every position it occupies
        word_positions = []
        for word, positions in inverted_index.items():
            if isinstance(positions, list):
                for position in positions:
                    word_positions.append((position, word))

        # Sort by position and reconstruct the text
        word_positions.sort(key=lambda x: x[0])
        abstract_words = [word for _, word in word_positions]
        return " ".join(abstract_words)
    except Exception:
        # If reconstruction fails, return an empty string
        return ""


def fetch_single_openalex_paper_metadata(paper_id: str) -> Dict[str, Any]:
    """
    Fetch metadata for a single OpenAlex paper by ID.

    Args:
        paper_id: OpenAlex paper ID (e.g., 'W2741809809')

    Returns:
        Dictionary containing paper metadata
    """
    base_url = "https://api.openalex.org/works"
    url = f"{base_url}/{paper_id}"

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        work_data = response.json()

        if not work_data.get("id"):
            raise ValueError(f"No metadata found for paper: {paper_id}")

        return _parse_openalex_work(work_data)
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Failed to fetch paper metadata: {e}") from e
