Skip to main content
Glama
oai_pmh_client.py7.87 kB
""" OAI-PMH client for Riksarkivet. """ from typing import Dict, Optional, Union, List from lxml import etree from ..config import OAI_BASE_URL, NAMESPACES from ..utils.http_client import HTTPClient class OAIPMHClient: """Client for interacting with OAI-PMH repositories.""" def __init__(self, http_client: HTTPClient, base_url: str = OAI_BASE_URL): self.http_client = http_client self.base_url = base_url def get_record(self, identifier: str, metadata_prefix: str = "oai_ape_ead") -> Dict[str, Union[str, List, Dict]]: """Get a specific record with full metadata.""" oai_request_parameters = self._build_oai_request_parameters(identifier, metadata_prefix) xml_response_root = self._make_request(oai_request_parameters) oai_record_element = self._extract_record_from_response(xml_response_root) extracted_record_data = self._build_basic_record_result(oai_record_element, metadata_prefix) if metadata_prefix == "oai_ape_ead": ead_metadata = self._extract_ead_metadata(oai_record_element) extracted_record_data.update(ead_metadata) return extracted_record_data def _build_oai_request_parameters(self, record_identifier: str, metadata_format: str) -> Dict[str, str]: """Build OAI-PMH request parameters.""" return { "verb": "GetRecord", "identifier": record_identifier, "metadataPrefix": metadata_format, } def _extract_record_from_response(self, xml_root: etree.Element) -> etree.Element: """Extract record element from OAI-PMH response.""" record_elements = xml_root.xpath("//oai:record", namespaces=NAMESPACES) if not record_elements: raise Exception("No record found in OAI-PMH response") return record_elements[0] def _build_basic_record_result(self, record_element: etree.Element, metadata_format: str) -> Dict[str, Union[str, List, Dict]]: """Build basic record result from header information.""" record_header = self._parse_header_information(record_element) return { "identifier": record_header.get("identifier", ""), "datestamp": record_header.get("datestamp", ""), "metadata_format": metadata_format, } def _parse_header_information(self, record_element: etree.Element) -> Dict[str, str]: """Parse header information from record element.""" header_elements = record_element.xpath("oai:header", namespaces=NAMESPACES) if not header_elements: return {"identifier": "", "datestamp": ""} header_element = header_elements[0] return { "identifier": self._get_text(header_element, "oai:identifier") or "", "datestamp": self._get_text(header_element, "oai:datestamp") or "", } def extract_manifset_id(self, identifier: str) -> Optional[str]: """Extract PID from a record for IIIF access.""" try: retrieved_record = self.get_record(identifier, "oai_ape_ead") nad_link = retrieved_record.get("nad_link") if nad_link: return self._extract_manifset_id_from_nad_link(nad_link) return None except Exception: return None def _extract_manifset_id_from_nad_link(self, nad_link_url: str) -> str: """Extract Manifset ID from NAD link URL.""" url_segments = nad_link_url.rstrip("/").split("/") if url_segments: # Remove query parameters if present manifset_id = url_segments[-1].split("?")[0] return manifset_id return "" def _make_request(self, request_parameters: Dict[str, str]) -> etree.Element: """Make an OAI-PMH request and return parsed XML using centralized HTTP client.""" try: xml_content = self.http_client.get_xml(self.base_url, params=request_parameters, timeout=30) xml_response_root = self._parse_xml_response(xml_content) self._check_oai_response_errors(xml_response_root) return xml_response_root except Exception as e: raise Exception(f"OAI-PMH request failed: {e}") from e def _parse_xml_response(self, xml_data: bytes) -> etree.Element: """Parse XML response content.""" try: return etree.fromstring(xml_data) except Exception as parse_error: raise Exception(f"Failed to parse XML response: {parse_error}") from parse_error def _check_oai_response_errors(self, xml_root: etree.Element) -> None: """Check for OAI-PMH errors in the response.""" error_elements = xml_root.xpath("//oai:error", namespaces=NAMESPACES) if error_elements: error_code = error_elements[0].get("code", "unknown") error_message = error_elements[0].text or "No error message" raise Exception(f"OAI-PMH Error [{error_code}]: {error_message}") def _extract_ead_metadata(self, record_element: etree.Element) -> Dict[str, Union[str, List, Dict]]: """Extract metadata from EAD format.""" ead_metadata_element = self._extract_ead_element_from_record(record_element) if ead_metadata_element is None: return {} extracted_metadata = {} document_title = self._extract_title_from_ead(ead_metadata_element) if document_title: extracted_metadata["title"] = document_title unitid_value = self._extract_unitid_from_ead(ead_metadata_element) if unitid_value: extracted_metadata["unitid"] = unitid_value repository_info = self._extract_repository_from_ead(ead_metadata_element) if repository_info: extracted_metadata["repository"] = repository_info nad_link_url = self._extract_nad_link_from_ead(ead_metadata_element) if nad_link_url: extracted_metadata["nad_link"] = nad_link_url return extracted_metadata def _extract_ead_element_from_record(self, record_element: etree.Element) -> Optional[etree.Element]: """Extract EAD element from record.""" ead_elements = record_element.xpath(".//ead:ead", namespaces={"ead": NAMESPACES["ead"]}) return ead_elements[0] if ead_elements else None def _extract_title_from_ead(self, ead_element: etree.Element) -> str: """Extract title from EAD element.""" return self._get_text(ead_element, ".//ead:unittitle", {"ead": NAMESPACES["ead"]}) or "" def _extract_unitid_from_ead(self, ead_element: etree.Element) -> str: """Extract unit ID from EAD element.""" return self._get_text(ead_element, ".//ead:unitid", {"ead": NAMESPACES["ead"]}) or "" def _extract_repository_from_ead(self, ead_element: etree.Element) -> str: """Extract repository information from EAD element.""" return self._get_text(ead_element, ".//ead:repository", {"ead": NAMESPACES["ead"]}) or "" def _extract_nad_link_from_ead(self, ead_element: etree.Element) -> str: """Extract NAD link from EAD element.""" dao_elements = ead_element.xpath(".//ead:dao", namespaces={"ead": NAMESPACES["ead"]}) if dao_elements: return dao_elements[0].get("{http://www.w3.org/1999/xlink}href", "") return "" def _get_text( self, element: etree.Element, xpath: str, namespaces: Optional[Dict[str, str]] = None, ) -> Optional[str]: """Get text from element using XPath.""" if namespaces is None: namespaces = NAMESPACES matches = element.xpath(xpath, namespaces=namespaces) if matches: if hasattr(matches[0], "text"): return matches[0].text return str(matches[0]) return None

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/AI-Riksarkivet/oxenstierna'

If you have feedback or need assistance with the MCP directory API, please join our Discord server