
Riksarkivet MCP Server

ra.py (53.1 kB)
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "requests>=2.31.0",
#     "click>=8.1.0",
#     "rich>=13.0.0",
#     "lxml>=4.9.0",
# ]
# ///
"""
Transcribed Search & Browser for Riksarkivet

Fast keyword search in transcribed materials with context display and page browsing.
"""

from __future__ import annotations

import json
import sys
import xml.etree.ElementTree as ET
import urllib.parse
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional

import click
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn
from lxml import etree

# ============================================================================
# Constants and Configuration
# ============================================================================

SEARCH_API_BASE_URL = "https://data.riksarkivet.se/api/records"
COLLECTION_API_BASE_URL = "https://lbiiif.riksarkivet.se/collection/arkiv"
IIIF_BASE_URL = "https://lbiiif.riksarkivet.se"
OAI_BASE_URL = "https://oai-pmh.riksarkivet.se/OAI"

REQUEST_TIMEOUT = 60
DEFAULT_MAX_RESULTS = 50
DEFAULT_MAX_DISPLAY = 20
DEFAULT_MAX_PAGES = 10

NAMESPACES = {
    'oai': 'http://www.openarchives.org/OAI/2.0/',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'ead': 'urn:isbn:1-931666-22-9',
    'xlink': 'http://www.w3.org/1999/xlink'
}

console = Console()

# ============================================================================
# Data Models
# ============================================================================

@dataclass
class SearchHit:
    """A search hit with context."""
    pid: str
    title: str
    reference_code: str
    page_number: str
    snippet_text: str
    full_page_text: Optional[str] = None
    alto_url: Optional[str] = None
    image_url: Optional[str] = None
    bildvisning_url: Optional[str] = None
    score: float = 0.0
    hierarchy: Optional[List[Dict[str, str]]] = None
    note: Optional[str] = None
    collection_url: Optional[str] = None
    manifest_url: Optional[str] = None
    archival_institution: Optional[List[Dict[str, str]]] = None
    date: Optional[str] = None


@dataclass
class PageContext:
    """Full page context around a search hit."""
    page_number: int
    page_id: str
    reference_code: str
    full_text: str
    alto_url: str
    image_url: str
    bildvisning_url: str = ""

# ============================================================================
# HTTP and API Utilities
# ============================================================================

class HTTPClient:
    """HTTP client with optimized settings for Riksarkivet APIs."""

    @staticmethod
    def create_session() -> requests.Session:
        """Create HTTP session optimized for Riksarkivet APIs."""
        session = requests.Session()
        session.headers.update({
            'Connection': 'close',
            'User-Agent': 'Transcribed-Search-Browser/1.0'
        })
        retry_strategy = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retry_strategy, pool_connections=1, pool_maxsize=1)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        return session


class URLGenerator:
    """Generate various URLs for Riksarkivet resources."""

    @staticmethod
    def clean_id(pid: str) -> str:
        """Extract clean ID from PID."""
        return pid[6:] if pid.startswith('arkis!') else pid

    @staticmethod
    def format_page_number(page_number: str) -> str:
        """Format page number with proper padding."""
        clean_page = page_number.lstrip('_')
        if clean_page.isdigit():
            return f"{int(clean_page):05d}"
        return clean_page.zfill(5)

    @classmethod
    def alto_url(cls, pid: str, page_number: str) -> Optional[str]:
        """Generate ALTO URL from PID and page number."""
        try:
            manifest_id = cls.clean_id(pid)
            padded_page = cls.format_page_number(page_number)
            if len(manifest_id) >= 4:
                first_4_chars = manifest_id[:4]
                return f"https://sok.riksarkivet.se/dokument/alto/{first_4_chars}/{manifest_id}/{manifest_id}_{padded_page}.xml"
            return None
        except Exception:
            return None

    @classmethod
    def iiif_image_url(cls, pid: str, page_number: str) -> Optional[str]:
        """Generate IIIF image URL from PID and page number."""
        try:
            clean_id = cls.clean_id(pid)
            padded_page = cls.format_page_number(page_number)
            return f"https://lbiiif.riksarkivet.se/arkis!{clean_id}_{padded_page}/full/max/0/default.jpg"
        except Exception:
            return None

    @classmethod
    def bildvisning_url(cls, pid: str, page_number: str, search_term: Optional[str] = None) -> Optional[str]:
        """Generate bildvisning URL with optional search highlighting."""
        try:
            clean_id = cls.clean_id(pid)
            padded_page = cls.format_page_number(page_number)
            base_url = f"https://sok.riksarkivet.se/bildvisning/{clean_id}_{padded_page}"
            if search_term and search_term.strip():
                encoded_term = urllib.parse.quote(search_term.strip())
                return f"{base_url}#?q={encoded_term}"
            return base_url
        except Exception:
            return None

    @classmethod
    def collection_url(cls, pid: str) -> Optional[str]:
        """Generate IIIF collection URL from PID."""
        try:
            clean_id = cls.clean_id(pid)
            return f"{COLLECTION_API_BASE_URL}/{clean_id}"
        except Exception:
            return None

    @classmethod
    def manifest_url(cls, pid: str, manifest_id: Optional[str] = None) -> Optional[str]:
        """Generate IIIF manifest URL from PID and optional manifest ID."""
        try:
            clean_id = cls.clean_id(pid)
            if manifest_id:
                return f"{IIIF_BASE_URL}/{manifest_id}/manifest"
            return f"{IIIF_BASE_URL}/{clean_id}_001/manifest"
        except Exception:
            return None

# ============================================================================
# Core API Clients
# ============================================================================

class SearchAPI:
    """Client for Riksarkivet Search API."""

    def __init__(self):
        self.session = HTTPClient.create_session()

    def search_transcribed_text(self, keyword: str, max_results: int = DEFAULT_MAX_RESULTS) -> List[SearchHit]:
        """Fast search for keyword in transcribed materials."""
        console.print(f"[blue]Searching for '{keyword}' in transcribed materials...[/blue]")
        params = {
            'transcribed_text': keyword,
            'only_digitised_materials': 'true',
            'max': max_results,
            'sort': 'relevance'
        }
        try:
            response = self.session.get(SEARCH_API_BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
            hits = []
            for item in data.get('items', []):
                hits.extend(self._process_search_item(item))
            console.print(f"[green]Found {len(hits)} page-level hits in {data.get('totalHits', 0)} documents[/green]")
            return hits
        except Exception as e:
            console.print(f"[red]Search failed: {e}[/red]")
            return []

    def _process_search_item(self, item: Dict[str, Any]) -> List[SearchHit]:
        """Process a single search result item into SearchHit objects."""
        metadata = item.get('metadata', {})
        transcribed_data = item.get('transcribedText', {})

        # Extract basic info
        pid = item.get('id', 'Unknown')
        title = item.get('caption', '(No title)')
        reference_code = metadata.get('referenceCode', '')

        # Extract enhanced metadata
        hierarchy = metadata.get('hierarchy', [])
        note = metadata.get('note')
        archival_institution = metadata.get('archivalInstitution', [])
        date = metadata.get('date')

        # Generate URLs
        collection_url = URLGenerator.collection_url(pid) if pid else None
        manifest_url = URLGenerator.manifest_url(pid) if pid else None

        hits = []
        if transcribed_data and 'snippets' in transcribed_data:
            for snippet in transcribed_data['snippets']:
                pages = snippet.get('pages', [])
                for page in pages:
                    page_id = page.get('id', '').lstrip('_') if isinstance(page, dict) else str(page)
                    hit = SearchHit(
                        pid=pid,
                        title=title[:100] + '...' if len(title) > 100 else title,
                        reference_code=reference_code,
                        page_number=page_id,
                        snippet_text=self._clean_html(snippet.get('text', '')),
                        score=snippet.get('score', 0),
                        hierarchy=hierarchy,
                        note=note,
                        collection_url=collection_url,
                        manifest_url=manifest_url,
                        archival_institution=archival_institution,
                        date=date
                    )
                    hits.append(hit)
        return hits

    @staticmethod
    def _clean_html(text: str) -> str:
        """Remove HTML tags from text."""
        import re
        return re.sub(r'<[^>]+>', '', text)


class ALTOClient:
    """Client for fetching and parsing ALTO XML files."""

    def __init__(self):
        self.session = HTTPClient.create_session()

    def fetch_content(self, alto_url: str, timeout: int = 10) -> Optional[str]:
        """Fetch and parse ALTO XML file to extract full text content."""
        try:
            headers = {
                'User-Agent': 'transcribed_search_browser/1.0',
                'Accept': 'application/xml, text/xml, */*'
            }
            response = self.session.get(alto_url, headers=headers, timeout=timeout)
            if response.status_code != 200:
                return None
            root = ET.fromstring(response.content)
            return self._extract_text_from_alto(root)
        except Exception:
            return None

    def _extract_text_from_alto(self, root: ET.Element) -> Optional[str]:
        """Extract text content from ALTO XML root element."""
        # Try different ALTO namespaces
        namespaces = [
            {'alto': 'http://www.loc.gov/standards/alto/ns-v4#'},
            {'alto': 'http://www.loc.gov/standards/alto/ns-v2#'},
            {'alto': 'http://www.loc.gov/standards/alto/ns-v3#'},
        ]
        text_lines = []

        # Try with namespaces first
        for ns in namespaces:
            for string_elem in root.findall('.//alto:String', ns):
                content = string_elem.get('CONTENT', '')
                if content:
                    text_lines.append(content)
            if text_lines:
                break

        # If no namespace works, try without namespace
        if not text_lines:
            for string_elem in root.findall('.//String'):
                content = string_elem.get('CONTENT', '')
                if content:
                    text_lines.append(content)

        full_text = ' '.join(text_lines)
        return full_text.strip() if full_text else None


class OAIPMHClient:
    """Client for interacting with OAI-PMH repositories."""

    def __init__(self, base_url: str = OAI_BASE_URL):
        self.base_url = base_url
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Transcribed-Search-Browser/1.0'})

    def get_record(self, identifier: str, metadata_prefix: str = 'oai_ape_ead') -> Dict[str, Any]:
        """Get a specific record with full metadata."""
        params = {
            'verb': 'GetRecord',
            'identifier': identifier,
            'metadataPrefix': metadata_prefix
        }
        root = self._make_request(params)
        record = root.xpath('//oai:record', namespaces=NAMESPACES)[0]
        header = record.xpath('oai:header', namespaces=NAMESPACES)[0]
        result = {
            'identifier': self._get_text(header, 'oai:identifier') or '',
            'datestamp': self._get_text(header, 'oai:datestamp') or '',
            'metadata_format': metadata_prefix
        }
        if metadata_prefix == 'oai_ape_ead':
            result.update(self._extract_ead_metadata(record))
        return result

    def extract_pid(self, identifier: str) -> Optional[str]:
        """Extract PID from a record for IIIF access."""
        try:
            record = self.get_record(identifier, 'oai_ape_ead')
            if 'nad_link' in record:
                return record['nad_link'].split('/')[-1]
            return None
        except Exception:
            return None

    def _make_request(self, params: Dict[str, str]) -> etree.Element:
        """Make an OAI-PMH request and return parsed XML."""
        response = self.session.get(self.base_url, params=params)
        response.raise_for_status()
        parser = etree.XMLParser(remove_blank_text=True)
        root = etree.fromstring(response.content, parser)
        # Check for OAI-PMH errors
        errors = root.xpath('//oai:error', namespaces=NAMESPACES)
        if errors:
            error_code = errors[0].get('code', 'unknown')
            error_msg = errors[0].text or 'Unknown error'
            raise Exception(f"OAI-PMH Error [{error_code}]: {error_msg}")
        return root

    def _get_text(self, element, xpath: str) -> Optional[str]:
        """Safely extract text from an XML element."""
        result = element.xpath(xpath, namespaces=NAMESPACES)
        return result[0].text if result and result[0].text else None

    def _extract_ead_metadata(self, record) -> Dict[str, Any]:
        """Extract metadata from EAD format."""
        metadata = {}
        ead = record.xpath('.//ead:ead', namespaces=NAMESPACES)
        if not ead:
            return metadata
        ead = ead[0]

        # Extract title
        title = ead.xpath('.//ead:unittitle', namespaces=NAMESPACES)
        if title and title[0].text:
            metadata['title'] = title[0].text

        # Extract date
        date = ead.xpath('.//ead:unitdate', namespaces=NAMESPACES)
        if date and date[0].text:
            metadata['date'] = date[0].text

        # Extract NAD link for PID
        nad_links = ead.xpath('.//ead:extref/@xlink:href', namespaces=NAMESPACES)
        if nad_links:
            for link in nad_links:
                if 'sok.riksarkivet.se' in link or 'sok-acc.riksarkivet.se' in link:
                    metadata['nad_link'] = link
                    break
        return metadata


class IIIFClient:
    """Client for IIIF collections and manifests."""

    def __init__(self):
        self.session = HTTPClient.create_session()

    def explore_collection(self, pid: str, timeout: int = 10) -> Optional[Dict[str, Any]]:
        """Explore IIIF collection to get manifests."""
        collection_url = f"{COLLECTION_API_BASE_URL}/{pid}"
        try:
            response = self.session.get(collection_url, timeout=timeout)
            if response.status_code == 404:
                return None
            response.raise_for_status()
            data = response.json()
            title = self._extract_iiif_label(data.get('label', {}), "Unknown Collection")
            manifests = []
            for item in data.get('items', []):
                if item.get('type') == 'Manifest':
                    manifest_title = self._extract_iiif_label(item.get('label', {}), "Untitled")
                    manifest_url = item.get('id', '')
                    manifest_id = manifest_url.rstrip('/').split('/')[-2] if '/manifest' in manifest_url else manifest_url.split('/')[-1]
                    manifests.append({
                        'id': manifest_id or manifest_url,
                        'label': manifest_title,
                        'url': manifest_url
                    })
            return {
                'title': title,
                'manifests': manifests,
                'collection_url': collection_url
            }
        except Exception:
            return None

    @staticmethod
    def _extract_iiif_label(label_obj: Any, default: str = "Unknown") -> str:
        """Smart IIIF label extraction supporting all language map formats."""
        if not label_obj:
            return default
        if isinstance(label_obj, str):
            return label_obj
        if isinstance(label_obj, dict):
            for lang in ['sv', 'en', 'none']:
                if lang in label_obj:
                    value = label_obj[lang]
                    return value[0] if isinstance(value, list) and value else str(value)
            first_lang = next(iter(label_obj.keys()), None)
            if first_lang:
                value = label_obj[first_lang]
                return value[0] if isinstance(value, list) and value else str(value)
        return str(label_obj) if label_obj else default

# ============================================================================
# Core Business Logic
# ============================================================================

class PageContextService:
    """Service for getting full page context."""

    def __init__(self):
        self.alto_client = ALTOClient()

    def get_page_context(self, pid: str, page_number: str, reference_code: str = "", search_term: Optional[str] = None) -> Optional[PageContext]:
        """Get full page context for a specific page."""
        alto_url = URLGenerator.alto_url(pid, page_number)
        image_url = URLGenerator.iiif_image_url(pid, page_number)
        bildvisning_url = URLGenerator.bildvisning_url(pid, page_number, search_term)
        if not alto_url:
            return None
        full_text = self.alto_client.fetch_content(alto_url)
        if not full_text:
            return None
        return PageContext(
            page_number=int(page_number) if page_number.isdigit() else 0,
            page_id=page_number,
            reference_code=reference_code,
            full_text=full_text,
            alto_url=alto_url,
            image_url=image_url or "",
            bildvisning_url=bildvisning_url or ""
        )


class SearchEnrichmentService:
    """Service for enriching search hits with full page context."""

    def __init__(self):
        self.iiif_client = IIIFClient()
        self.alto_client = ALTOClient()

    def enrich_hits_with_context(self, hits: List[SearchHit], max_pages: int = DEFAULT_MAX_PAGES, search_term: Optional[str] = None) -> List[SearchHit]:
        """Enrich search hits with full page context by exploring IIIF collections."""
        console.print(f"[blue]Fetching full page context for up to {max_pages} pages...[/blue]")
        enriched_hits = []
        processed = 0
        failed = 0

        # Group hits by PID to avoid exploring the same collection multiple times
        hits_by_pid = defaultdict(list)
        for hit in hits[:max_pages]:
            hits_by_pid[hit.pid].append(hit)

        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
            task = progress.add_task("Loading page contexts...", total=min(len(hits), max_pages))
            for pid, pid_hits in hits_by_pid.items():
                if processed >= max_pages:
                    break
                # Explore the IIIF collection to get manifest IDs
                progress.update(task, description=f"Exploring collection {pid[-8:]}...")
                collection_info = self.iiif_client.explore_collection(pid, timeout=10)
                if collection_info and collection_info.get('manifests'):
                    manifest = collection_info['manifests'][0]
                    manifest_id = manifest['id']
                    progress.update(task, description=f"Found manifest {manifest_id[-8:]}...")
                    # Process each hit for this PID
                    for hit in pid_hits:
                        if processed >= max_pages:
                            break
                        page_ref = f"{hit.reference_code or hit.pid[-8:]}:p{hit.page_number}"
                        self._enrich_single_hit(hit, manifest_id, search_term)
                        progress.update(task, description=f"Fetching {page_ref}...")
                        # Try to get ALTO content
                        if hit.alto_url:
                            full_text = self.alto_client.fetch_content(hit.alto_url, timeout=8)
                            if full_text:
                                hit.full_page_text = full_text
                                progress.update(task, description=f"✓ Loaded {page_ref} ({len(full_text)//100}/10 chars)")
                            else:
                                hit.full_page_text = hit.snippet_text
                                progress.update(task, description=f"○ Using snippet {page_ref}")
                        else:
                            hit.full_page_text = hit.snippet_text
                        enriched_hits.append(hit)
                        processed += 1
                        progress.advance(task)
                else:
                    # No manifests found, fall back to snippet text
                    for hit in pid_hits:
                        if processed >= max_pages:
                            break
                        self._enrich_single_hit(hit, hit.pid, search_term)
                        hit.full_page_text = hit.snippet_text
                        enriched_hits.append(hit)
                        processed += 1
                        failed += 1
                        progress.advance(task)

        console.print(f"[green]Successfully loaded context for {len(enriched_hits)} pages[/green]")
        if failed > 0:
            console.print(f"[yellow]Used snippet text for {failed} pages (IIIF manifests not found)[/yellow]")
        return enriched_hits

    def _enrich_single_hit(self, hit: SearchHit, manifest_id: str, search_term: Optional[str]):
        """Enrich a single hit with generated URLs."""
        hit.alto_url = URLGenerator.alto_url(manifest_id, hit.page_number)
        hit.image_url = URLGenerator.iiif_image_url(manifest_id, hit.page_number)
        hit.bildvisning_url = URLGenerator.bildvisning_url(manifest_id, hit.page_number, search_term)

    def expand_hits_with_context_padding(self, hits: List[SearchHit], padding: int = 1) -> List[SearchHit]:
        """Expand search hits with context pages around each hit."""
        if padding <= 0:
            return hits

        # Group hits by PID and reference code
        hits_by_doc = defaultdict(list)
        for hit in hits:
            key = (hit.pid, hit.reference_code)
            hits_by_doc[key].append(hit)

        expanded_hits = []
        for (pid, ref_code), doc_hits in hits_by_doc.items():
            # Get all page numbers for this document
            hit_pages = set()
            for hit in doc_hits:
                try:
                    page_num = int(hit.page_number.lstrip('0')) if hit.page_number.lstrip('0').isdigit() else int(hit.page_number)
                    hit_pages.add(page_num)
                except (ValueError, AttributeError):
                    hit_pages.add(hit.page_number)

            # Generate context pages around each hit
            context_pages = set()
            for page in hit_pages:
                if isinstance(page, int):
                    for offset in range(-padding, padding + 1):
                        context_page = page + offset
                        if context_page > 0:
                            context_pages.add(context_page)
                else:
                    context_pages.add(page)

            # Create SearchHit objects for all context pages
            for page in context_pages:
                existing_hit = self._find_existing_hit(doc_hits, page)
                if existing_hit:
                    expanded_hits.append(existing_hit)
                else:
                    context_hit = self._create_context_hit(pid, ref_code, page, doc_hits)
                    expanded_hits.append(context_hit)

        # Sort by PID, reference code, and page number
        expanded_hits.sort(key=self._sort_key)
        return expanded_hits

    def _find_existing_hit(self, doc_hits: List[SearchHit], page) -> Optional[SearchHit]:
        """Find existing hit for a given page."""
        for hit in doc_hits:
            hit_page_num = hit.page_number.lstrip('0') if hit.page_number.lstrip('0').isdigit() else hit.page_number
            try:
                if isinstance(page, int) and int(hit_page_num) == page:
                    return hit
                elif str(page) == hit.page_number:
                    return hit
            except (ValueError, AttributeError):
                if str(page) == hit.page_number:
                    return hit
        return None

    def _create_context_hit(self, pid: str, ref_code: str, page, doc_hits: List[SearchHit]) -> SearchHit:
        """Create a context hit for a page that doesn't have search results."""
        page_str = f"{page:05d}" if isinstance(page, int) else str(page)
        template_hit = doc_hits[0] if doc_hits else None
        return SearchHit(
            pid=pid,
            title=template_hit.title if template_hit else "(No title)",
            reference_code=ref_code,
            page_number=page_str,
            snippet_text="[Context page - no search hit]",
            score=0.0,
            hierarchy=template_hit.hierarchy if template_hit else None,
            note=template_hit.note if template_hit else None,
            collection_url=template_hit.collection_url if template_hit else None,
            manifest_url=template_hit.manifest_url if template_hit else None,
            archival_institution=template_hit.archival_institution if template_hit else None,
            date=template_hit.date if template_hit else None
        )

    @staticmethod
    def _sort_key(hit: SearchHit):
        """Sort key for organizing hits."""
        try:
            page_num = int(hit.page_number.lstrip('0')) if hit.page_number.lstrip('0').isdigit() else 999999
        except (ValueError, AttributeError):
            page_num = 999999
        return (hit.pid, hit.reference_code, page_num)

# ============================================================================
# Display Services
# ============================================================================

class DisplayService:
    """Service for displaying search results and page contexts."""

    @staticmethod
    def keyword_highlight(text: str, keyword: str) -> str:
        """Highlight keyword in text."""
        if not keyword:
            return text
        import re
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
        return pattern.sub(lambda m: f"[bold yellow underline]{m.group()}[/bold yellow underline]", text)

    def display_search_hits(self, hits: List[SearchHit], keyword: str, max_display: int = DEFAULT_MAX_DISPLAY):
        """Display search hits in a table, grouped by reference code."""
        if not hits:
            console.print("[yellow]No search hits found.[/yellow]")
            return

        # Group hits by reference code
        from collections import OrderedDict
        grouped_hits = OrderedDict()
        for hit in hits:
            ref_code = hit.reference_code or hit.pid
            if ref_code not in grouped_hits:
                grouped_hits[ref_code] = []
            grouped_hits[ref_code].append(hit)

        console.print(f"\n✓ Found {len(hits)} page-level hits across {len(grouped_hits)} documents")
        console.print("[dim]💡 Tips: Use --context to see full page transcriptions | Use 'browse' command to view specific reference codes[/dim]")

        table = Table(
            "Institution & Reference",
            "Content",
            title=f"Search Results for '{keyword}'",
            show_lines=True,
            expand=True
        )

        # Display grouped results
        displayed_groups = 0
        for ref_code, ref_hits in grouped_hits.items():
            if displayed_groups >= max_display:
                break
            displayed_groups += 1

            # Take the first hit as representative for metadata
            first_hit = ref_hits[0]

            # Extract institution
            institution = ""
            if first_hit.archival_institution:
                institution = first_hit.archival_institution[0].get('caption', '') if first_hit.archival_institution else ""
            elif first_hit.hierarchy:
                institution = first_hit.hierarchy[0].get('caption', '') if first_hit.hierarchy else ""

            # Combine institution and reference with pages
            institution_and_ref = ""
            if institution:
                institution_and_ref = f"🏛️ {institution[:30] + '...' if len(institution) > 30 else institution}\n"

            # Format pages - list all pages, comma separated, with leading zeros trimmed
            pages = sorted(set(h.page_number for h in ref_hits))
            # Trim leading zeros from each page number
            pages_trimmed = []
            for p in pages:
                trimmed = p.lstrip('0') or '0'  # Keep at least one zero if all zeros
                pages_trimmed.append(trimmed)
            pages_str = ",".join(pages_trimmed)
            institution_and_ref += f"📚 \"{ref_code}\" --page \"{pages_str}\""
            if first_hit.date:
                institution_and_ref += f"\n📅 [dim]{first_hit.date}[/dim]"

            # Create content column with title and snippets from different pages
            title_text = first_hit.title[:50] + '...' if len(first_hit.title) > 50 else first_hit.title
            content_parts = []

            # Add title
            if title_text and title_text.strip():
                content_parts.append(f"[bold blue]{title_text}[/bold blue]")
            else:
                content_parts.append(f"[bright_black]No title[/bright_black]")

            # Add snippets with page numbers
            displayed_snippets = 0
            for hit in ref_hits[:3]:  # Show max 3 snippets per reference
                displayed_snippets += 1
                snippet = hit.snippet_text[:150] + '...' if len(hit.snippet_text) > 150 else hit.snippet_text
                snippet = self.keyword_highlight(snippet, keyword)
                content_parts.append(f"[dim]Page {hit.page_number}:[/dim] [italic]{snippet}[/italic]")

            if len(ref_hits) > 3:
                content_parts.append(f"[dim]...and {len(ref_hits) - 3} more pages with hits[/dim]")

            content = "\n".join(content_parts)
            table.add_row(institution_and_ref, content)

        console.print(table)

        # Show example browse command with actual reference from first group
        if grouped_hits:
            first_ref = next(iter(grouped_hits.keys()))
            first_group = grouped_hits[first_ref]
            pages = sorted(set(h.page_number for h in first_group))
            # Trim leading zeros from page numbers and limit to max 5 pages for example
            pages_trimmed = [p.lstrip('0') or '0' for p in pages[:5]]  # Take first 5 pages max
            console.print(f"\n[dim]💡 Example: To view these hits, run:[/dim]")
            if len(pages_trimmed) == 1:
                console.print(f"[cyan] browse \"{first_ref}\" --page {pages_trimmed[0]} --search-term \"{keyword}\"[/cyan]")
            else:
                pages_str = ",".join(pages_trimmed)
                if len(pages) > 5:
                    console.print(f"[cyan] browse \"{first_ref}\" --page \"{pages_str}\" --search-term \"{keyword}\"[/cyan]")
                    console.print(f"[dim] (Showing first 5 of {len(pages)} pages with hits)[/dim]")
                else:
                    console.print(f"[cyan] browse \"{first_ref}\" --page \"{pages_str}\" --search-term \"{keyword}\"[/cyan]")

        # Count remaining groups instead of hits
        total_groups = len(grouped_hits)
        if total_groups > displayed_groups:
            remaining_groups = total_groups - displayed_groups
            total_remaining_hits = sum(len(h) for _, h in list(grouped_hits.items())[displayed_groups:])
            console.print(f"\n[dim]... and {remaining_groups} more documents with {total_remaining_hits} hits[/dim]")
            console.print(f"[dim]Options: --max-display N to show more | --context for full pages | 'browse REFERENCE' to view specific documents[/dim]")

    def display_page_contexts(self, contexts: List[PageContext], keyword: str, reference_code: str = ""):
        """Display full page contexts with keyword highlighting."""
        if not contexts:
            console.print("[yellow]No page contexts found.[/yellow]")
            return
        console.print(f"\n[bold]Full Page Transcriptions ({len(contexts)} pages):[/bold]")
        for context in contexts:
            page_content = self._build_page_content(context, keyword, reference_code)
            panel_title = f"[cyan]Page {context.page_number}: {context.reference_code or 'Unknown Reference'}[/cyan]"
            console.print(Panel(
                "\n".join(page_content),
                title=panel_title,
                border_style="green",
                padding=(0, 1)
            ))

    def display_enriched_hits(self, hits: List[SearchHit], keyword: str, grouped: bool = False):
        """Display enriched search hits with full page context, optionally grouped by document."""
        if not hits:
            console.print("[yellow]No enriched hits found.[/yellow]")
            return
        if grouped:
            self._display_grouped_by_document(hits, keyword)
        else:
            self._display_individual_hits(hits, keyword)

    def _display_individual_hits(self, hits: List[SearchHit], keyword: str):
        """Display search hits individually."""
        console.print(f"\n[bold]Search Results with Full Page Context ({len(hits)} pages):[/bold]")
        for hit in hits:
            if not hit.full_page_text:
                continue
            page_content = self._build_hit_content(hit, keyword)
            panel_title = f"[cyan]Hit: {hit.reference_code} - Page {hit.page_number}[/cyan]"
            console.print(Panel(
                "\n".join(page_content),
                title=panel_title,
                border_style="blue",
                padding=(0, 1)
            ))

    def _display_grouped_by_document(self, hits: List[SearchHit], keyword: str):
        """Display search hits grouped by document/reference code."""
        # Group hits by reference code
        grouped_hits = defaultdict(list)
        for hit in hits:
            if hit.full_page_text:
                key = hit.reference_code or hit.pid
                grouped_hits[key].append(hit)
        console.print(f"\n[bold]Search Results Grouped by Document ({len(grouped_hits)} documents, {len(hits)} pages):[/bold]")
        for doc_ref, doc_hits in grouped_hits.items():
            # Sort pages by page number
            doc_hits.sort(key=lambda h: int(h.page_number) if h.page_number.isdigit() else 0)
            doc_content = self._build_document_content(doc_hits, keyword)
            panel_title = f"[cyan]Document: {doc_ref} ({len(doc_hits)} pages)[/cyan]"
            console.print(Panel(
                "\n".join(doc_content),
                title=panel_title,
                border_style="green",
                padding=(0, 1)
            ))

    def _build_hit_content(self, hit: SearchHit, keyword: str) -> List[str]:
        """Build content for an individual hit panel."""
        page_content = []

        # Header with metadata (reference and page info now in panel title)
        page_content.append(f"\n[bold blue]📄 Title:[/bold blue] {hit.title}")

        # Add date if available
        if hit.date:
            page_content.append(f"[bold blue]📅 Date:[/bold blue] {hit.date}")

        # Add hierarchy information
        if hit.hierarchy:
            page_content.append(f"\n[bold yellow]🏛️ Archive Hierarchy:[/bold yellow]")
            for level in hit.hierarchy:
                page_content.append(f" 📁 {level.get('caption', 'Unknown')}")

        # Add archival institution
        if hit.archival_institution:
            page_content.append(f"\n[bold yellow]🏛️ Institution:[/bold yellow]")
            for inst in hit.archival_institution:
                page_content.append(f" 🏛️ {inst.get('caption', 'Unknown')}")

        # Add note if available
        if hit.note:
            page_content.append(f"\n[bold magenta]📝 Note:[/bold magenta]")
            page_content.append(f"[italic]{hit.note}[/italic]")

        # Full transcribed text with keyword highlighting
        display_text = self.keyword_highlight(hit.full_page_text or "", keyword)
        page_content.append(f"\n[bold magenta]📜 Full Transcription:[/bold magenta]")
        page_content.append(f"[italic]{display_text}[/italic]")

        # Links section
        page_content.extend(self._build_links_section(hit))
        return page_content

    def _build_document_content(self, doc_hits: List[SearchHit], keyword: str) -> List[str]:
        """Build content for a document panel with multiple pages."""
        doc_content = []

        # Document header (reference code now in panel title)
        first_hit = doc_hits[0]
        doc_content.append(f"\n[bold blue]📄 Title:[/bold blue] {first_hit.title}")
        doc_content.append(f"[bold green]📄 Pages with hits:[/bold green] {', '.join(h.page_number for h in doc_hits)}")

        # Add document-level metadata from first hit
        if first_hit.date:
            doc_content.append(f"[bold blue]📅 Date:[/bold blue] {first_hit.date}")

        # Add hierarchy information
        if first_hit.hierarchy:
            doc_content.append(f"\n[bold yellow]🏛️ Archive Hierarchy:[/bold yellow]")
            for level in first_hit.hierarchy:
                doc_content.append(f" 📁 {level.get('caption', 'Unknown')}")

        # Add archival institution
        if first_hit.archival_institution:
            doc_content.append(f"\n[bold yellow]🏛️ Institution:[/bold yellow]")
            for inst in first_hit.archival_institution:
                doc_content.append(f" 🏛️ {inst.get('caption', 'Unknown')}")

        # Add note if available
        if first_hit.note:
            doc_content.append(f"\n[bold magenta]📝 Note:[/bold magenta]")
            doc_content.append(f"[italic]{first_hit.note}[/italic]")

        # Add document-level links
        doc_content.append(f"\n[bold cyan]🔗 Document Links:[/bold cyan]")
        if first_hit.collection_url:
            doc_content.append(f" [dim]📚 Collection:[/dim] [link]{first_hit.collection_url}[/link]")
        if first_hit.manifest_url:
            doc_content.append(f" [dim]📖 Manifest:[/dim] [link]{first_hit.manifest_url}[/link]")

        # Add each page's content
        for i, hit in enumerate(doc_hits, 1):
            # Mark search hits vs context pages
            is_search_hit = hit.snippet_text != "[Context page - no search hit]"
            page_marker = "🎯" if is_search_hit else "📄"
            page_type = "[bold yellow]SEARCH HIT[/bold yellow]" if is_search_hit else "[dim]context[/dim]"
            doc_content.append(f"\n[bold cyan]── {page_marker} Page {hit.page_number} ({page_type}) ──[/bold cyan]")

            # Full transcribed text with keyword highlighting
            display_text = hit.full_page_text or ""
            if keyword and is_search_hit and display_text:
                display_text = self.keyword_highlight(display_text, keyword)
            if display_text:
                doc_content.append(f"[italic]{display_text}[/italic]")
            else:
                doc_content.append(f"[dim italic]No text content available for this page[/dim italic]")

            # Links for this page
            doc_content.append(f"\n[dim]🔗 Page {hit.page_number} Links:[/dim]")
            if hit.alto_url:
                doc_content.append(f" [dim]📝 ALTO:[/dim] [link]{hit.alto_url}[/link]")
            if hit.image_url:
                doc_content.append(f" [dim]🖼️ Image:[/dim] [link]{hit.image_url}[/link]")
            if hit.bildvisning_url:
                doc_content.append(f" [dim]👁️ Bildvisning:[/dim] [link]{hit.bildvisning_url}[/link]")

            # Add separator between pages (except for last page)
            if i < len(doc_hits):
                doc_content.append("")
        return doc_content

    def _build_links_section(self, hit: SearchHit) -> List[str]:
        """Build links section for a hit."""
        links = [f"\n[bold cyan]🔗 Links:[/bold cyan]"]
        if hit.alto_url:
            links.append(f" [dim]📝 ALTO XML:[/dim] [link]{hit.alto_url}[/link]")
        if hit.image_url:
            links.append(f" [dim]🖼️ Image:[/dim] [link]{hit.image_url}[/link]")
        if hit.bildvisning_url:
            links.append(f" [dim]👁️ Bildvisning:[/dim] [link]{hit.bildvisning_url}[/link]")
        if hit.collection_url:
            links.append(f" [dim]📚 Collection:[/dim] [link]{hit.collection_url}[/link]")
        if hit.manifest_url:
            links.append(f" [dim]📖 Manifest:[/dim] [link]{hit.manifest_url}[/link]")
        return links

    def _build_page_content(self, context: PageContext, keyword: str, reference_code: str) -> List[str]:
        """Build content for a page panel."""
        page_content = []
        # Page header (page number and reference now in panel title)
        # Full transcribed text with keyword highlighting
        display_text = self.keyword_highlight(context.full_text, keyword)
        page_content.append(f"\n[bold magenta]📜 Full Transcription:[/bold magenta]")
        page_content.append(f"[italic]{display_text}[/italic]")

        # Links section
        page_content.append(f"\n[bold cyan]🔗 Links:[/bold cyan]")
        page_content.append(f" [dim]📝 ALTO XML:[/dim] [link]{context.alto_url}[/link]")
        if context.image_url:
            page_content.append(f" [dim]🖼️ Image:[/dim] [link]{context.image_url}[/link]")
        if context.bildvisning_url:
            page_content.append(f" [dim]👁️ Bildvisning:[/dim] [link]{context.bildvisning_url}[/link]")
        return page_content

# ============================================================================
# Utility Functions
# ============================================================================

def parse_page_range(page_range: Optional[str], total_pages: int = 1000) -> List[int]:
    """Parse page range string and return list of page numbers."""
    if not page_range:
        return list(range(1, min(total_pages + 1, 21)))  # Default to first 20 pages
    pages = []
    parts = page_range.split(',')
    for part in parts:
        part = part.strip()
        if '-' in part:
            start, end = part.split('-', 1)
            start = int(start.strip())
            end = int(end.strip())
            pages.extend(range(start, min(end + 1, total_pages + 1)))
        else:
            page_num = int(part.strip())
            if 1 <= page_num <= total_pages:
                pages.append(page_num)
    return sorted(list(set(pages)))

# ============================================================================
# CLI Interface
# ============================================================================

@click.group()
def cli():
    """Transcribed Search & Browser for Riksarkivet.

    Commands:
      search     - Search for keywords in transcribed materials
      browse     - Browse pages by reference code
      show-pages - Search and display exact pages with context

    Quick Start:
      search "keyword"            - Basic search
      search "keyword" --context  - Search with full page transcriptions
      browse "SE/RA/123" --page 5 - View specific page
    """
    pass


@cli.command()
@click.argument('keyword')
@click.option('--max', 'max_results', type=int, default=DEFAULT_MAX_RESULTS, help='Maximum search results')
@click.option('--max-display', type=int, default=DEFAULT_MAX_DISPLAY, help='Maximum results to display')
@click.option('--context', is_flag=True, help='Show full page context for search hits')
@click.option('--max-pages', type=int, default=DEFAULT_MAX_PAGES, help='Maximum pages to load context for')
@click.option('--no-grouping', is_flag=True, help='Show pages individually instead of grouped by document (only with --context)')
def search(keyword: str, max_results: int, max_display: int, context: bool, max_pages: int, no_grouping: bool):
    """Search for keyword in transcribed materials.

    Fast search across all transcribed documents in Riksarkivet.
    Returns reference codes and page numbers containing the keyword.

    Examples:
      uv run transcribed_search_browser.py search "Stockholm"
      uv run transcribed_search_browser.py search "trolldom" --context --max-pages 5
      uv run transcribed_search_browser.py search "vasa" --context --no-grouping --max-pages 3
    """
    search_api = SearchAPI()
    display_service = DisplayService()
    hits = search_api.search_transcribed_text(keyword, max_results)
    if context and hits:
        enrichment_service = SearchEnrichmentService()
        enriched_hits = enrichment_service.enrich_hits_with_context(hits, max_pages, keyword)
        # Display enriched hits with full transcriptions
        display_service.display_enriched_hits(enriched_hits, keyword, grouped=not no_grouping)
    else:
        display_service.display_search_hits(hits, keyword, max_display)


@cli.command()
@click.argument('keyword')
@click.option('--max-pages', type=int, default=DEFAULT_MAX_PAGES, help='Maximum pages to display with full context')
@click.option('--context-padding', type=int, default=1, help='Number of pages to include before and after each hit for context')
@click.option('--no-grouping', is_flag=True, help='Show pages individually instead of grouped by document')
def show_pages(keyword: str, max_pages: int, context_padding: int, no_grouping: bool):
    """Search for keyword and show the exact pages containing it with context.

    This combines search and browse - finds pages with the keyword then displays
    the full page transcriptions with the keyword highlighted, plus surrounding
    pages for context. Results are grouped by document by default for better context.

    Examples:
      uv run transcribed_search_browser.py show-pages "Stockholm" --max-pages 5
      uv run transcribed_search_browser.py show-pages "trolldom" --no-grouping
      uv run transcribed_search_browser.py show-pages "vasa" --context-padding 2
    """
    # Step 1: Search for the keyword
    search_limit = max(max_pages * 3, 50)  # Get more results to have options
    console.print(f"[blue]Searching for '{keyword}' to find exact pages...[/blue]")
    search_api = SearchAPI()
    hits = search_api.search_transcribed_text(keyword, search_limit)
    if not hits:
        console.print("[yellow]No pages found containing the keyword.[/yellow]")
        return

    # Step 2: Show basic search results first
    console.print(f"\n[green]Found {len(hits)} pages containing '{keyword}'[/green]")

    # Limit to max_pages for detailed display
    display_hits = hits[:max_pages]
    console.print(f"[blue]Showing full context for first {len(display_hits)} pages...[/blue]")

    # Step 3: Expand hits with context padding and enrich with full page context
    enrichment_service = SearchEnrichmentService()
    expanded_hits = enrichment_service.expand_hits_with_context_padding(display_hits, context_padding)
    console.print(f"[blue]Expanded to {len(expanded_hits)} pages including context padding (+/- {context_padding} pages)...[/blue]")
    enriched_hits = enrichment_service.enrich_hits_with_context(expanded_hits, len(expanded_hits), keyword)

    # Step 4: Display results (grouped by document by default)
    display_service = DisplayService()
    if enriched_hits:
        # Use grouped=True by default unless --no-grouping is specified
        display_service.display_enriched_hits(enriched_hits, keyword, grouped=not no_grouping)
    else:
        console.print("[red]No page transcriptions could be loaded.[/red]")


@cli.command()
@click.argument('reference_code')
@click.option('--pages', help='Page range to display (e.g., "1-10" or "5,7,9")')
@click.option('--page', help='Single page or page range to display (alias for --pages)')
@click.option('--search-term', help='Highlight this term in the text')
@click.option('--max-display', type=int, default=DEFAULT_MAX_DISPLAY, help='Maximum pages to display')
def browse(reference_code: str, pages: Optional[str], page: Optional[str], search_term: Optional[str], max_display: int):
    """Browse pages by reference code.

    You can specify pages using either --pages or --page (they work the same way).
    If both are provided, --page takes precedence.

    Examples:
      uv run transcribed_search_browser.py browse "SE/RA/123" --page 5
      uv run transcribed_search_browser.py browse "SE/RA/123" --pages "1-10"
      uv run transcribed_search_browser.py browse "SE/RA/123" --page "5,7,9"
    """
    console.print(f"[blue]Looking up reference code: {reference_code}[/blue]")

    # Handle both --pages and --page options (--page takes precedence if both are provided)
    page_range = page if page is not None else pages

    # Try search API first
    session = HTTPClient.create_session()
    pid = None
    try:
        params = {'reference_code': reference_code, 'only_digitised_materials': 'true', 'max': 1}
        response = session.get(SEARCH_API_BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()
        if data.get('items'):
            pid = data['items'][0].get('id')
            console.print(f"[green]Found PID via search: {pid}[/green]")
    except Exception as e:
        console.print(f"[yellow]Search API lookup failed: {e}[/yellow]")

    # Fall back to OAI-PMH
    if not pid:
        console.print(f"[blue]Trying OAI-PMH lookup...[/blue]")
        oai_client = OAIPMHClient()
        try:
            pid = oai_client.extract_pid(reference_code)
            if pid:
                console.print(f"[green]Found PID via OAI-PMH: {pid}[/green]")
            else:
                console.print(f"[red]Could not extract PID for {reference_code}[/red]")
                sys.exit(1)
        except Exception as e:
            console.print(f"[red]Error getting OAI-PMH record: {e}[/red]")
            sys.exit(1)

    # Get manifest and load pages
    iiif_client = IIIFClient()
    collection_info = iiif_client.explore_collection(pid)
    manifest_id = pid
    if collection_info and collection_info.get('manifests'):
        manifest_id = collection_info['manifests'][0]['id']
        console.print(f"[green]Found manifest: {manifest_id}[/green]")

    selected_pages = parse_page_range(page_range)
    console.print(f"[blue]Loading {len(selected_pages[:max_display])} pages...[/blue]")

    # Load page contexts
    page_service = PageContextService()
    display_service = DisplayService()
    contexts = []
    with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
        task = progress.add_task("Loading page contexts...", total=len(selected_pages[:max_display]))
        for page_num in selected_pages[:max_display]:
            progress.update(task, description=f"Loading page {page_num}...")
            context = page_service.get_page_context(manifest_id, str(page_num), reference_code, search_term)
            if context:
                contexts.append(context)
                progress.update(task, description=f"✓ Loaded page {page_num}")
            else:
                progress.update(task, description=f"✗ Failed to load page {page_num}")
            progress.advance(task)

    display_service.display_page_contexts(contexts, search_term or "", reference_code)
    console.print(f"\n[green]Successfully loaded {len(contexts)} pages[/green]")


if __name__ == '__main__':
    cli()

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/AI-Riksarkivet/ra-mcp'
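The same endpoint can be queried from Python. A minimal sketch; the response schema is not documented here, so this just pretty-prints whatever JSON the endpoint returns:

import json
import requests

# Hypothetical client snippet for the Glama MCP directory API endpoint above.
resp = requests.get("https://glama.ai/api/mcp/v1/servers/AI-Riksarkivet/ra-mcp", timeout=30)
resp.raise_for_status()
print(json.dumps(resp.json(), indent=2))  # inspect the structure interactively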

If you have feedback or need assistance with the MCP directory API, please join our Discord server.