BnF API Server

by Kryzo
Verified
"""
Gallica BnF API Client
---------------------
Client for the Gallica BnF SRU API.
Provides methods to search for documents and retrieve metadata.
"""

import logging
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Dict, Any, List, Optional

import requests

# Set up logging
logger = logging.getLogger(__name__)

# Constants
DEFAULT_MAX_RECORDS = 10
DEFAULT_START_RECORD = 1
# Seconds to wait for the SRU endpoint before giving up; without a timeout
# requests waits indefinitely on a stalled connection.
DEFAULT_TIMEOUT = 30
BNF_SRU_URL = "https://gallica.bnf.fr/SRU"

# XML namespaces used when querying the SRU response tree.
_NAMESPACES = {
    'srw': 'http://www.loc.gov/zing/srw/',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
}

# Dublin Core fields copied from each record, in output order.
_DC_FIELDS = (
    'title', 'creator', 'contributor', 'publisher', 'date',
    'description', 'type', 'format', 'identifier', 'source',
    'language', 'relation', 'coverage', 'rights', 'subject',
)

# Substring that marks an identifier as a Gallica ARK URL.
_GALLICA_ARK_MARKER = 'gallica.bnf.fr/ark:'


class GallicaAPI:
    """
    Client for the Gallica BnF SRU API.
    Provides methods to search for documents and retrieve metadata.
    """

    def __init__(self, timeout: float = DEFAULT_TIMEOUT,
                 session: Optional[requests.Session] = None):
        """
        Initialize the Gallica API client.

        Args:
            timeout: Seconds to wait for the SRU endpoint on each request
            session: Optional object exposing a requests-compatible
                ``get(url, params=..., timeout=...)`` method (typically a
                ``requests.Session``). When omitted, the module-level
                ``requests`` API is used.
        """
        self.base_url = BNF_SRU_URL
        self.timeout = timeout
        # The requests module itself exposes ``get``, so it serves as the
        # default transport when no session is injected.
        self._http = session if session is not None else requests
        logger.info("Gallica API client initialized")

    def search(self, query: str, start_record: int = DEFAULT_START_RECORD,
               max_records: int = DEFAULT_MAX_RECORDS) -> Dict[str, Any]:
        """
        Search for documents in the Gallica digital library.

        Args:
            query: Search query in CQL format
            start_record: Starting record number for pagination
            max_records: Maximum number of records to return

        Returns:
            Dictionary containing search results and metadata. On success it
            has a "metadata" dict (query, total_records as the text reported
            by the server, records_returned, date_retrieved) and a "records"
            list. On failure it has an "error" message and the "query"
            (plus "parameters" for HTTP/transport failures). No exception is
            propagated to the caller.
        """
        params = {
            'version': '1.2',
            'operation': 'searchRetrieve',
            'query': query,
            'startRecord': start_record,
            'maximumRecords': max_records
        }

        try:
            response = self._http.get(
                self.base_url, params=params, timeout=self.timeout
            )
            response.raise_for_status()

            # Parse the raw bytes so the XML parser applies the encoding
            # declared in the document instead of the charset requests
            # guessed from the HTTP headers.
            root = ET.fromstring(response.content)
            return self._build_results(root, query)

        except requests.exceptions.RequestException as e:
            logger.error("Error during Gallica API request: %s", e)
            return {
                "error": str(e),
                "query": query,
                "parameters": params
            }
        except ET.ParseError as e:
            logger.error("Error parsing XML response: %s", e)
            return {
                "error": f"XML parsing error: {str(e)}",
                "query": query
            }
        except Exception as e:
            # Boundary handler: callers expect an error dict, never a raise.
            # logger.exception keeps the traceback for diagnosis.
            logger.exception("Unexpected error: %s", e)
            return {
                "error": str(e),
                "query": query
            }

    def _build_results(self, root: ET.Element, query: str) -> Dict[str, Any]:
        """Convert a parsed SRU response tree into the result dictionary."""
        count_elem = root.find('.//srw:numberOfRecords', _NAMESPACES)
        if count_elem is None:
            message = "SRU response is missing the numberOfRecords element"
            logger.error("Error parsing XML response: %s", message)
            return {
                "error": message,
                "query": query
            }

        record_elems = root.findall('.//srw:record', _NAMESPACES)

        records: List[Dict[str, Any]] = []
        for record in record_elems:
            # Only records carrying a Dublin Core payload are converted.
            record_data = record.find(
                './/srw:recordData/oai_dc:dc', _NAMESPACES
            )
            if record_data is not None:
                records.append(self._extract_record(record_data))

        return {
            "metadata": {
                "query": query,
                "total_records": count_elem.text,
                # Counts every record element in the response, including
                # any that were skipped for lacking Dublin Core data.
                "records_returned": len(record_elems),
                "date_retrieved": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            },
            "records": records
        }

    @staticmethod
    def _extract_record(record_data: ET.Element) -> Dict[str, Any]:
        """Extract the Dublin Core fields of one record into a dictionary."""
        record_dict: Dict[str, Any] = {}

        for field in _DC_FIELDS:
            elements = record_data.findall(f'./dc:{field}', _NAMESPACES)
            values = [
                elem.text.strip()
                for elem in elements
                if elem.text and elem.text.strip()
            ]
            # Fields with no usable text are omitted entirely, whether the
            # element appeared once or several times.
            if not values:
                continue
            # A repeated element is stored as a list, a single one as a str.
            record_dict[field] = values if len(elements) > 1 else values[0]

        # Extract Gallica URL from identifiers (first matching one wins)
        identifiers = record_dict.get('identifier')
        if identifiers is not None:
            candidates = (
                identifiers if isinstance(identifiers, list) else [identifiers]
            )
            gallica_url = next(
                (c for c in candidates if _GALLICA_ARK_MARKER in c), None
            )
            if gallica_url is not None:
                record_dict['gallica_url'] = gallica_url

        return record_dict