"""
MCP tool definitions for RA-MCP server.
Provides search and browse functionality for Riksarkivet documents.
"""
import os
from typing import Optional
from fastmcp import FastMCP
from pydantic import Field
from ra_core import (
SearchAPI,
SearchEnrichmentService,
PageContextService,
IIIFClient,
HTTPClient,
OAIPMHClient,
SEARCH_API_BASE_URL,
REQUEST_TIMEOUT,
parse_page_range
)
from formatters import (
format_search_results,
format_page_contexts,
format_document_structure,
format_error
)
from cache import get_cache
# Initialize FastMCP instance
ra_mcp = FastMCP(
name="ra-mcp",
instructions="""
🏛️ Riksarkivet (RA) Search and Browse MCP Server
This server provides access to transcribed historical documents from the Swedish National Archives.
AVAILABLE TOOLS:
1. 🔍 search_transcribed - Search for keywords in transcribed materials
- Returns documents and pages containing the keyword
- Offset parameter required to encourage comprehensive discovery
- Context disabled by default for maximum hit coverage
- Provides direct links to images and ALTO XML
2. 📖 browse_document - Browse specific pages by reference code
- View full transcriptions of specific pages
- Supports page ranges and multiple pages
- Optional keyword highlighting
3. 📚 get_document_structure - Get document structure without content
- Quick overview of available manifests
- Document metadata and hierarchy
- Useful for understanding what's available
SEARCH STRATEGY FOR MAXIMUM DISCOVERY:
1. Start with search_transcribed(keyword, offset=0) for initial hits
2. Continue pagination with increasing offsets (50, 100, 150...) to find all matches
3. Use show_context=False (default) to see more results per query
4. Only enable show_context=True when you want full page text for specific hits
5. EXPLORE RELATED TERMS: Search for similar/related words to gather comprehensive context
- Historical variants and spellings (e.g., "trolldom" + "häxa" + "trollkona")
- Synonyms and related concepts (e.g., "satan" + "djävul" for devil-related terms)
- Different word forms (e.g., "trolleri" + "trollkonst" for witchcraft variants)
- Period-appropriate terminology and archaic spellings
6. Note reference codes and page numbers for detailed browsing
7. Use browse_document() to view full transcriptions of interesting pages
TYPICAL WORKFLOW:
1. Comprehensive search: search_transcribed(term, 0), then search_transcribed(term, 50), etc.
2. Search related terms in parallel to build complete context
3. Review hit summaries to identify most relevant documents across all searches
4. Use browse_document() for detailed examination of specific pages
5. Use get_document_structure() to understand document organization
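    EXAMPLE SESSION (illustrative sequence of tool calls):
    1. search_transcribed("trolldom", offset=0)
    2. search_transcribed("trolldom", offset=50)
    3. browse_document("SE/RA/420422/01", "5-7", highlight_term="trolldom")
    4. get_document_structure(reference_code="SE/RA/420422/01")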
All tools return rich, formatted text optimized for LLM understanding.
""",
)
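# A minimal in-process usage sketch (assuming FastMCP 2.x's in-memory Client;
# the "trolldom" query is illustrative):
#
#     import asyncio
#     from fastmcp import Client
#
#     async def demo():
#         async with Client(ra_mcp) as client:
#             result = await client.call_tool(
#                 "search_transcribed", {"keyword": "trolldom", "offset": 0}
#             )
#             print(result)
#
#     asyncio.run(demo())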
@ra_mcp.tool(
name="search_transcribed",
description="Search for keywords in transcribed historical documents from Riksarkivet"
)
async def search_transcribed(
keyword: str,
offset: int,
show_context: bool = False,
max_results: int = 10,
max_hits_per_document: int = 3,
max_pages_with_context: int = 0,
context_padding: int = 0,
max_response_tokens: int = 15000,
truncate_page_text: int = 800
) -> str:
"""
Search for keywords in transcribed materials from the Swedish National Archives.
    Returns rich formatted text with:
    - Document metadata and hierarchy
    - Direct links to images and transcriptions
    - When show_context=True: full page transcriptions with keyword
      highlighting and context pages around each hit for better understanding
Parameters:
- keyword: The search term
- offset: Start position in search results for pagination (required for more hits)
    - show_context: Include full page text (default False; has an effect only when max_pages_with_context > 0)
- max_results: Maximum number of documents to fetch (default 10)
- max_hits_per_document: Maximum page hits to return per document (default 3)
- max_pages_with_context: Maximum pages to enrich with full text (default 0)
- context_padding: Pages of context around each hit (default 0)
- max_response_tokens: Approximate max tokens in response to prevent overflow (default 15000)
- truncate_page_text: Max characters per page text to prevent huge responses (default 800)
Example:
- search_transcribed("häxor", offset=0) - Find documents about witches
- search_transcribed("Stockholm", offset=0, show_context=True, max_pages_with_context=10) - Find Stockholm references with context
- search_transcribed("näcken", offset=10, max_results=10) - Get results 11-20
- search_transcribed("näcken", offset=0, max_pages_with_context=3, context_padding=0) - Limit response size
"""
try:
cache = get_cache()
# Check cache for search results
cache_params = {'keyword': keyword, 'max_results': max_results, 'offset': offset, 'max_hits_per_document': max_hits_per_document}
cached_hits = cache.get('search', cache_params)
if cached_hits is None:
# Perform fresh search
search_api = SearchAPI()
hits, total_hits = search_api.search_transcribed_text(keyword, max_results, offset, max_hits_per_document)
# Cache the search results
cache.set('search', cache_params, (hits, total_hits))
else:
hits, total_hits = cached_hits
        if not hits:
            if offset > 0:
                return f"No more results found for '{keyword}' at offset {offset}. Total results: {total_hits}"
            return f"No results found for '{keyword}'. Try different search terms or variations."
# Enrich with context if requested
if show_context and hits:
enrichment_service = SearchEnrichmentService()
            # Limit enrichment to the first max_pages_with_context hits; the
            # default of 0 enriches nothing, so raise it to see any page text
            hits_to_enrich = hits[:max_pages_with_context]
# Expand with context padding if requested
if context_padding > 0:
hits_to_enrich = enrichment_service.expand_hits_with_context_padding(
hits_to_enrich, context_padding
)
# Enrich with full page text
enriched_hits = enrichment_service.enrich_hits_with_context(
hits_to_enrich,
len(hits_to_enrich),
keyword
)
# Apply text truncation if needed to prevent token overflow
for hit in enriched_hits:
if hasattr(hit, 'full_page_text') and hit.full_page_text:
if len(hit.full_page_text) > truncate_page_text:
hit.full_page_text = hit.full_page_text[:truncate_page_text] + "..."
# Format results with size limits
formatted = format_search_results(enriched_hits, keyword, show_context=True)
# Check approximate token count (rough estimate: 1 char ≈ 0.25 tokens)
estimated_tokens = len(formatted) // 4
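            # e.g. a 60,000-character response counts as ~15,000 tokens,
            # which is exactly the default max_response_tokens cap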
if estimated_tokens > max_response_tokens:
# Try reducing the response
if context_padding > 0:
return (
f"⚠️ Response too large ({estimated_tokens} tokens). "
f"Showing limited results.\n\n" +
format_search_results(
enriched_hits[:max(1, max_pages_with_context // 2)],
keyword,
show_context=True
) +
f"\n\n💡 **Tip**: Use context_padding=0 and lower max_pages_with_context to see more results."
)
else:
# Already minimal, just truncate
return formatted[:max_response_tokens * 4] + "\n\n[Response truncated due to size limits]"
        else:
            formatted = format_search_results(hits, keyword, show_context=False)

        # Add pagination info if relevant - count unique documents, not hits
        unique_docs = {hit.reference_code or hit.pid for hit in hits}
        # The search API can return more documents than requested, so assume more
        # are available whenever a full page of max_results documents came back
        if len(unique_docs) == max_results and total_hits > len(hits):
            document_start = offset // max_results * max_results + 1
            document_end = document_start + len(unique_docs) - 1
            formatted += f"\n\n📊 **Pagination**: Showing documents {document_start}-{document_end} (total {len(hits)} hits from {len(unique_docs)} documents)"
            formatted += f"\n💡 Use `offset={offset + max_results}` to see the next {max_results} documents"
        return formatted
except Exception as e:
return format_error(
f"Search failed: {str(e)}",
suggestions=[
"Try a simpler search term",
"Check if the service is available",
"Reduce max_results or max_pages_with_context"
]
)
@ra_mcp.tool(
name="browse_document",
description="Browse specific pages of a document by reference code"
)
async def browse_document(
reference_code: str,
pages: str,
highlight_term: Optional[str] = None,
max_pages: int = 20
) -> str:
"""
Browse specific pages of a document by reference code.
Returns:
- Full transcribed text for each requested page
- Optional keyword highlighting
- Direct links to images and ALTO XML
Examples:
- browse_document("SE/RA/420422/01", "5") - View page 5
- browse_document("SE/RA/420422/01", "1-10") - View pages 1 through 10
- browse_document("SE/RA/420422/01", "5,7,9", highlight_term="Stockholm") - View specific pages with highlighting
"""
try:
cache = get_cache()
        # First, find the PID for this reference code (checking the cache first)
        cache_params = {'reference_code': reference_code}
        pid = cache.get('structure', cache_params)
        if pid is None:
            session = HTTPClient.create_session()
            # Try the search API first
try:
params = {'reference_code': reference_code, 'only_digitised_materials': 'true', 'max': 1}
response = session.get(SEARCH_API_BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
data = response.json()
if data.get('items'):
pid = data['items'][0].get('id')
cache.set('structure', cache_params, pid)
            except Exception:
                pass
# Fall back to OAI-PMH if search failed
if not pid:
oai_client = OAIPMHClient()
try:
pid = oai_client.extract_pid(reference_code)
if pid:
cache.set('structure', cache_params, pid)
            except Exception:
                pass
if not pid:
return format_error(
f"Could not find document with reference code '{reference_code}'",
suggestions=[
"Check the reference code format",
"Try searching for keywords instead",
"The document might not be digitized"
]
)
# Get manifest information
iiif_client = IIIFClient()
collection_info = iiif_client.explore_collection(pid)
        # Default to the PID itself; use the first manifest when the collection
        # lists any (documents with several manifests are browsed via the first)
        manifest_id = pid
        if collection_info and collection_info.get('manifests'):
            manifest_id = collection_info['manifests'][0]['id']
# Parse page range
selected_pages = parse_page_range(pages)[:max_pages]
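        # parse_page_range expands inputs like "5", "1-10", or "5,7,9" into a
        # list of page numbers (matching the examples in the docstring above)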
# Load page contexts
page_service = PageContextService()
contexts = []
for page_num in selected_pages:
context = page_service.get_page_context(
manifest_id,
str(page_num),
reference_code,
highlight_term
)
if context:
contexts.append(context)
if not contexts:
return format_error(
f"Could not load pages for {reference_code}",
suggestions=[
"The pages might not have transcriptions",
"Try different page numbers",
"Check if the document is fully digitized"
]
)
return format_page_contexts(contexts, reference_code, highlight_term)
except Exception as e:
return format_error(
f"Browse failed: {str(e)}",
suggestions=[
"Check the reference code format",
"Verify page numbers are valid",
"Try with fewer pages"
]
)
@ra_mcp.tool(
name="get_document_structure",
description="Get document structure and metadata without fetching content"
)
async def get_document_structure(
reference_code: Optional[str] = None,
pid: Optional[str] = None,
include_manifest_info: bool = True
) -> str:
"""
Get the structure and metadata of a document without fetching page content.
Useful for:
- Understanding what's available in a document
- Getting the total number of pages
- Finding available manifests
- Viewing document hierarchy
Provide either reference_code or pid.
"""
try:
if not reference_code and not pid:
return format_error(
"Either reference_code or pid must be provided",
suggestions=["Provide a reference code like 'SE/RA/420422/01'", "Or provide a PID from search results"]
)
cache = get_cache()
# Get PID if only reference_code provided
        if reference_code and not pid:
            # Check the cache first
            cache_params = {'reference_code': reference_code}
            pid = cache.get('structure', cache_params)
            if pid is None:
                session = HTTPClient.create_session()
                try:
params = {'reference_code': reference_code, 'only_digitised_materials': 'true', 'max': 1}
response = session.get(SEARCH_API_BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
data = response.json()
if data.get('items'):
pid = data['items'][0].get('id')
cache.set('structure', cache_params, pid)
                except Exception:
                    pass
if not pid:
return format_error(
f"Could not find document",
suggestions=["Check the reference code", "Try searching for it first"]
)
        # Strip the 'arkis!' prefix if present (len('arkis!') == 6)
        clean_pid = pid[6:] if pid.startswith('arkis!') else pid
# Get IIIF collection info
iiif_client = IIIFClient()
# Check cache for collection info
cache_params = {'pid': clean_pid}
collection_info = cache.get('iiif', cache_params)
if collection_info is None:
collection_info = iiif_client.explore_collection(clean_pid)
if collection_info:
cache.set('iiif', cache_params, collection_info)
if not collection_info:
return format_error(
f"Could not get structure for PID {pid}",
suggestions=["The document might not have IIIF manifests", "Try browsing specific pages instead"]
)
return format_document_structure(collection_info)
except Exception as e:
return format_error(
f"Failed to get document structure: {str(e)}",
suggestions=["Check the reference code or PID", "Try searching for the document first"]
)
@ra_mcp.resource("riksarkivet://contents/table_of_contents")
def get_table_of_contents() -> str:
"""
Get the table of contents (Innehållsförteckning) for the Riksarkivet historical guide.
This resource provides an overview of all available sections and topics
covered in the historical guide to Swedish National Archives.
Returns:
- Complete table of contents in Swedish
- Links to different sections and subsections
- Hierarchical structure of the guide
"""
try:
# Get the path to the markdown file
current_dir = os.path.dirname(__file__)
markdown_path = os.path.join(current_dir, "..", "..", "markdown", "00_Innehallsforteckning.md")
# Read the file
with open(markdown_path, 'r', encoding='utf-8') as f:
content = f.read()
return content
except FileNotFoundError:
return format_error(
"Table of contents file not found",
suggestions=[
"Check if the markdown/00_Innehallsforteckning.md file exists",
"Verify the file path is correct"
]
)
except Exception as e:
return format_error(
f"Failed to load table of contents: {str(e)}",
suggestions=["Check file permissions", "Verify file encoding is UTF-8"]
)
@ra_mcp.tool(
name="get_guide_content",
description="Load specific sections from the Riksarkivet historical guide"
)
async def get_guide_content(
filename: str = Field(description="Markdown filename to load (e.g., '01_Domstolar.md', '02_Fangelse.md')")
) -> str:
"""
Load content from specific sections of the Riksarkivet historical guide.
Available files include:
- 00_Inledning.md - Introduction
- 00_Register.md - Index
- 01_Domstolar.md - Courts
- 02_Fangelse.md - Prisons
- 03_Skatt.md - Taxes
- 04_Stadens_Forvaltning.md - City administration
- 05_Lan.md - Counties
- 06_Statskyrkan.md - State church
- 07_Folkbokforing.md - Population registration
- 08_Tull.md - Customs
- 09_Lantmateri.md - Land surveying
- 10_Bergsbruk.md - Mining
- 11_Fiske.md - Fishing
- 12_Skog.md - Forestry
- 13_Sjukvard.md - Healthcare
- 14_Sjofart.md - Shipping
- 15_Vagar.md - Roads
- 16_Tillverkning.md - Manufacturing
- 17_Handel.md - Trade
- 18_Lantbruk.md - Agriculture
- 19_Skola.md - Schools
- 20_Fattig_Socialtjanst.md - Poor relief and social services
- 21_Kommun.md - Municipalities
- 22_Landsting.md - County councils
- 23_Omsorg.md - Care
- 24_Polis.md - Police
- 25_Invandring.md - Immigration
- 26_Halsa_Miljo.md - Health and environment
- 27_Djur.md - Animals
- 28_Posten.md - Postal service
- 29_Jarnvagen.md - Railways
- 30_Tele.md - Telecommunications
- 31_Byggnader.md - Buildings
- 32_Forsorjning.md - Supply
- 33_Barn_Ungdom.md - Children and youth
- 34_Modrahjalp.md - Maternal aid
- 35_Pension.md - Pensions
- 36_Rattshjalp.md - Legal aid
- 37_Overvakning.md - Surveillance
- 38_Nykterhet.md - Temperance
- 39_Arbete.md - Work
- 40_Brand_Civilforsvar.md - Fire and civil defense
- 41_Energi.md - Energy
- 42_Luftfart.md - Aviation
- 43_Kultur.md - Culture
- 44_Internadministration.md - Internal administration
- 45_Studiestod.md - Student aid
- 46-61_Forsvaret.md - Defense
- 99_Litteratur.md - Literature
- Index.md - Index
Parameters:
- filename: The markdown file to load (with .md extension)
Example:
- get_guide_content("01_Domstolar.md") - Load courts section
- get_guide_content("13_Sjukvard.md") - Load healthcare section
"""
try:
# Validate filename - ensure it ends with .md and contains no path traversal
if not filename.endswith('.md'):
return format_error(
"Invalid filename format",
suggestions=["Filename must end with .md extension"]
)
# Remove any path components for security
filename = os.path.basename(filename)
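        # e.g. os.path.basename("../../etc/secrets.md") -> "secrets.md",
        # so any directory-traversal segments are discarded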
# Get the path to the markdown file
current_dir = os.path.dirname(__file__)
markdown_path = os.path.join(current_dir, "..", "..", "markdown", filename)
# Check if file exists
if not os.path.exists(markdown_path):
return format_error(
f"Guide section '{filename}' not found",
suggestions=[
"Check the filename spelling",
"Use get_table_of_contents resource to see available sections",
"Ensure the filename includes .md extension"
]
)
# Read the file
with open(markdown_path, 'r', encoding='utf-8') as f:
content = f.read()
return content
except Exception as e:
return format_error(
f"Failed to load guide content '{filename}': {str(e)}",
suggestions=[
"Check file permissions",
"Verify file encoding is UTF-8",
"Ensure the filename is valid"
]
)
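

# Entry point sketch: FastMCP servers are usually started with run(), which
# defaults to the stdio transport (an assumption; adjust to your deployment).
if __name__ == "__main__":
    ra_mcp.run()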