osf.py (10.1 kB)
from typing import Any, Dict, Optional
from urllib.parse import quote, quote_plus, urlencode

import requests

from utils import sanitize_api_queries

from .providers import fetch_osf_providers, validate_provider


def fetch_osf_preprints(
    provider_id: Optional[str] = None,
    subjects: Optional[str] = None,
    date_published_gte: Optional[str] = None,
    query: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Fetch preprints from the OSF API, optionally filtered by provider,
    subject, and publication date.

    NOTE: The OSF API only supports a limited set of filters. Many common
    filters like title, DOI, creator, etc. are NOT supported by the OSF API.
    When `query` is provided, the trove search endpoint is used instead,
    which supports full-text search.

    Args:
        provider_id: The provider ID (e.g., 'psyarxiv', 'socarxiv')
        subjects: Subject filter (e.g., 'psychology', 'neuroscience')
        date_published_gte: Published date greater than or equal to (YYYY-MM-DD)
        query: Text search query for title, author, content (uses trove endpoint)

    Returns:
        Dictionary containing preprints data from the OSF API or trove search
    """
    # If a query is provided, use the trove full-text search endpoint
    if query:
        return fetch_osf_preprints_via_trove(query, provider_id)

    # Build query parameters (only using OSF API supported filters)
    filters = {}
    if provider_id:
        filters["filter[provider]"] = sanitize_api_queries(provider_id, max_length=50)
    if subjects:
        filters["filter[subjects]"] = sanitize_api_queries(subjects, max_length=100)
    if date_published_gte:
        filters["filter[date_published][gte]"] = date_published_gte  # Dates don't need cleaning

    # Build URL with filters
    base_url = "https://api.osf.io/v2/preprints/"
    if filters:
        query_string = urlencode(filters, safe="", quote_via=quote)
        url = f"{base_url}?{query_string}"
    else:
        url = base_url

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as e:
        if response.status_code == 400:
            # The combined filters were rejected; retry with the provider
            # filter alone and flag that the results were not fully filtered.
            if len(filters) > 1:
                simple_filters = {}
                if provider_id:
                    simple_filters["filter[provider]"] = sanitize_api_queries(provider_id, max_length=50)
                simple_query = urlencode(simple_filters, safe="", quote_via=quote)
                simple_url = f"{base_url}?{simple_query}"
                try:
                    simple_response = requests.get(simple_url, timeout=30)
                    simple_response.raise_for_status()
                    result = simple_response.json()
                    # Add a note about the simplified search
                    if "meta" not in result:
                        result["meta"] = {}
                    result["meta"]["search_note"] = (
                        f"Original search failed (400 error), showing all results for "
                        f"provider '{provider_id}'. You may need to filter results manually."
                    )
                    return result
                except requests.exceptions.RequestException:
                    pass
            raise ValueError(f"Bad request (400) - The search parameters may be invalid. Original error: {str(e)}")
        raise
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Request failed: {str(e)}")
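
# Illustrative usage sketch (not part of the original module): calling the
# supported OSF API filters directly. Assumes this package's dependencies
# (utils, .providers) are importable and network access is available.
#
#     results = fetch_osf_preprints(
#         provider_id="psyarxiv",
#         subjects="psychology",
#         date_published_gte="2024-01-01",
#     )
#     for preprint in results.get("data", []):
#         print(preprint["id"], preprint["attributes"].get("title", ""))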


def fetch_osf_preprints_via_trove(query: str, provider_id: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch preprints using the trove search endpoint and transform the
    results into the standard OSF API format.
    """
    # Build trove search URL
    base_url = "https://share.osf.io/trove/index-card-search"
    params = {
        "cardSearchFilter[resourceType]": "Preprint",
        "cardSearchText[*,creator.name,isContainedBy.creator.name]": sanitize_api_queries(query, max_length=200),
        "page[size]": "20",  # Match our default page size
        "sort": "-relevance",
    }

    # Validate the provider if specified (results are filtered later)
    if provider_id:
        if not validate_provider(provider_id):
            osf_providers = fetch_osf_providers()
            valid_ids = [p["id"] for p in osf_providers]
            raise ValueError(f"Invalid OSF provider: {provider_id}. Valid OSF providers: {valid_ids}")

    # Build the query string manually to handle complex parameter names
    query_parts = []
    for key, value in params.items():
        query_parts.append(f"{quote_plus(key)}={quote_plus(str(value))}")
    query_string = "&".join(query_parts)
    url = f"{base_url}?{query_string}"

    try:
        headers = {"Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        trove_data = response.json()

        # Transform trove format to standard OSF API format
        transformed_data = []
        for item in trove_data.get("data", []):
            # Extract the OSF ID from the @id field
            osf_id = ""
            if "@id" in item and "osf.io/" in item["@id"]:
                osf_id = item["@id"].split("/")[-1]

            # Filter by provider if specified
            if provider_id:
                # Check whether this item is from the specified provider
                publisher_info = item.get("publisher", [])
                if isinstance(publisher_info, list) and len(publisher_info) > 0:
                    # Extract the provider ID from the publisher URL
                    # (e.g., "https://osf.io/preprints/psyarxiv" -> "psyarxiv")
                    publisher_id = publisher_info[0].get("@id", "")
                    if provider_id not in publisher_id:
                        continue  # Skip items that don't match the provider
                else:
                    continue  # Skip items with no publisher info

            # Transform to the standard format
            transformed_item = {
                "id": osf_id,
                "type": "preprints",
                "attributes": {
                    "title": extract_first_value(item.get("title", [])),
                    "description": extract_first_value(item.get("description", [])),
                    "date_created": extract_first_value(item.get("dateCreated", [])),
                    "date_published": extract_first_value(item.get("dateAccepted", [])),
                    "date_modified": extract_first_value(item.get("dateModified", [])),
                    "doi": extract_doi_from_identifiers(item.get("identifier", [])),
                    "tags": [kw.get("@value", "") for kw in item.get("keyword", []) if isinstance(kw, dict)],
                    # Use the helper here too, so an empty prefLabel list
                    # doesn't raise an IndexError
                    "subjects": [extract_first_value(subj.get("prefLabel", [])) for subj in item.get("subject", [])],
                },
                "relationships": {},
                "links": {"self": item.get("@id", "")},
            }
            transformed_data.append(transformed_item)

        # Return in standard OSF API format
        return {
            "data": transformed_data,
            "meta": {
                "version": "2.0",  # Match the OSF API version
                "total": trove_data.get("meta", {}).get("total", len(transformed_data)),
                "search_note": f"Results from trove search for query: '{query}'",
            },
            "links": {
                "first": trove_data.get("links", {}).get("first", ""),
                "next": trove_data.get("links", {}).get("next", ""),
                "last": "",
                "prev": "",
                "meta": "",
            },
        }
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Trove search failed: {str(e)}")


def extract_first_value(field_list):
    """Extract the first @value from a JSON-LD field list."""
    if isinstance(field_list, list) and len(field_list) > 0:
        if isinstance(field_list[0], dict) and "@value" in field_list[0]:
            return field_list[0]["@value"]
        elif isinstance(field_list[0], str):
            return field_list[0]
    return ""


def extract_doi_from_identifiers(identifiers):
    """Extract a DOI from a JSON-LD identifier list."""
    for identifier in identifiers:
        if isinstance(identifier, dict) and "@value" in identifier:
            value = identifier["@value"]
            if "doi.org" in value or value.startswith("10."):
                return value
    return ""
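
# Illustrative sketch (not part of the original module): the JSON-LD shapes
# these helpers expect. The sample values below are hypothetical.
#
#     extract_first_value([{"@value": "A Study of X"}])
#     # -> "A Study of X"
#
#     extract_doi_from_identifiers([
#         {"@value": "https://osf.io/abcde"},
#         {"@value": "https://doi.org/10.31234/osf.io/abcde"},
#     ])
#     # -> "https://doi.org/10.31234/osf.io/abcde"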


def fetch_single_osf_preprint_metadata(preprint_id: str) -> Dict[str, Any]:
    """Fetch metadata, including the primary file's download URL, for a single preprint."""
    try:
        preprint_url = f"https://api.osf.io/v2/preprints/{preprint_id}"
        response = requests.get(preprint_url, timeout=30)
        response.raise_for_status()
        preprint_data = response.json()

        # Resolve the primary file relationship to reach its download link
        primary_file_url = preprint_data["data"]["relationships"]["primary_file"]["links"]["related"]["href"]
        file_response = requests.get(primary_file_url, timeout=30)
        file_response.raise_for_status()
        file_data = file_response.json()

        # Get the download URL
        download_url = file_data["data"]["links"].get("download", "")

        # Prepare the metadata first, so it can still be returned when the
        # download URL is unavailable
        attributes = preprint_data["data"]["attributes"]
        metadata = {
            "id": preprint_id,
            "title": attributes.get("title", ""),
            "description": attributes.get("description", ""),
            "date_created": attributes.get("date_created", ""),
            "date_published": attributes.get("date_published", ""),
            "date_modified": attributes.get("date_modified", ""),
            "is_published": attributes.get("is_published", False),
            "is_preprint_orphan": attributes.get("is_preprint_orphan", False),
            "license_record": attributes.get("license_record", {}),
            "doi": attributes.get("doi", ""),
            "tags": attributes.get("tags", []),
            "subjects": attributes.get("subjects", []),
            "download_url": download_url,
        }

        if not download_url:
            return {"status": "error", "message": "Download URL not available", "metadata": metadata}

        return metadata
    except KeyError as e:
        raise ValueError(f"Unexpected response structure, missing key: {str(e)}")
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Failed to fetch preprint metadata: {str(e)}")
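
A minimal end-to-end sketch of how these functions might be combined, assuming the module is importable from the paperclip server's package and network access is available; the query and provider below are only examples:

    results = fetch_osf_preprints(query="working memory", provider_id="psyarxiv")
    for preprint in results.get("data", []):
        if not preprint["id"]:
            continue  # skip trove items without a resolvable osf.io ID
        metadata = fetch_single_osf_preprint_metadata(preprint["id"])
        if metadata.get("status") != "error":
            print(metadata["title"], "->", metadata["download_url"])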

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/matsjfunke/paperclip'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.