
Riksarkivet MCP Server

_scrape_markdown_convert.py (11.9 kB)
import re
from bs4 import BeautifulSoup
import html2text
from pathlib import Path
import os


def fix_swedish_encoding(text):
    """Fix common Swedish character encoding issues (UTF-8 text mis-decoded as Latin-1/Windows-1252)"""
    replacements = {
        'Ã¶': 'ö',
        'Ã¤': 'ä',
        'Ã¥': 'å',
        'Ã–': 'Ö',
        'Ã„': 'Ä',
        'Ã…': 'Å',
        'Ã©': 'é',
        'Ã¨': 'è',
        'Ã¼': 'ü',
        'Ã¸': 'ø',
        'Ã˜': 'Ø',
        'â€™': "'",
        'â€“': '–',
        'â€”': '—',
        'â€œ': '"',
        'â€¦': '…',
        'â€': '"',      # kept last so the longer â€… sequences above are replaced first
        'Â\xa0': ' ',   # non-breaking space
    }
    result = text
    for old, new in replacements.items():
        result = result.replace(old, new)
    return result


def html_to_markdown(html_content, filename):
    """Convert HTML content to Markdown with proper formatting"""
    # Parse HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract meta information before those tags are stripped below
    meta_info = []
    author = soup.find('meta', {'name': 'Author'})
    if author:
        meta_info.append(f"author: {fix_swedish_encoding(author.get('content', ''))}")
    description = soup.find('meta', {'name': 'Description'})
    if description:
        meta_info.append(f"description: {fix_swedish_encoding(description.get('content', ''))}")

    # Remove script, style and metadata elements
    for element in soup(['script', 'style', 'meta', 'link']):
        element.decompose()

    # Initialize html2text converter
    h = html2text.HTML2Text()
    h.body_width = 0  # Don't wrap lines
    h.ignore_links = False
    h.ignore_images = False
    h.unicode_snob = True

    # Get the body content if it exists, otherwise use all content
    body = soup.find('body')
    if body:
        content = str(body)
    else:
        content = str(soup)

    # Convert to markdown
    markdown = h.handle(content)

    # Fix Swedish encoding issues
    markdown = fix_swedish_encoding(markdown)

    # Clean up excessive blank lines
    markdown = re.sub(r'\n{3,}', '\n\n', markdown)

    # Add metadata header
    title = soup.find('title')
    if title:
        title_text = fix_swedish_encoding(title.get_text())
    else:
        title_text = filename.replace('.htm', '').replace('_', ' ')

    # Build the final markdown with frontmatter
    frontmatter = f"""---
title: {title_text}
{chr(10).join(meta_info)}
---

"""
    return frontmatter + markdown


def convert_all_files():
    """Find and convert all HTM files in riksarkivet_data/html directory"""
    # Define paths
    html_dir = Path("riksarkivet_data/html")
    markdown_dir = Path("riksarkivet_data/markdown")

    # Create markdown directory if it doesn't exist
    markdown_dir.mkdir(parents=True, exist_ok=True)

    # Find all .htm files
    htm_files = list(html_dir.glob("*.htm"))

    if not htm_files:
        print(f"No .htm files found in {html_dir}")
        return

    print(f"Found {len(htm_files)} HTM files to convert")
    print("-" * 50)

    # Convert each file
    successful = 0
    failed = 0

    for filepath in htm_files:
        try:
            # Read the HTML file
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                html_content = f.read()

            # Convert to markdown
            filename = filepath.name
            markdown_content = html_to_markdown(html_content, filename)

            # Save as .md file in the markdown directory
            output_path = markdown_dir / filepath.with_suffix('.md').name
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            print(f"✓ Converted: {filename} -> {output_path.name}")
            successful += 1

        except Exception as e:
            print(f"✗ Failed to convert {filepath.name}: {str(e)}")
            failed += 1

    print("-" * 50)
    print(f"Conversion complete: {successful} successful, {failed} failed")
    print(f"Markdown files saved in: {markdown_dir}")


if __name__ == "__main__":
    # First check if beautifulsoup4 and html2text are installed
    try:
        import bs4
        import html2text
    except ImportError:
        print("Required libraries not installed. Please run:")
        print("pip install beautifulsoup4 html2text")
        exit(1)

    # Run the conversion
    convert_all_files()


####################################################################################################################################
####################################################################################################################################
####################################################################################################################################

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import os
import json


class RiksarkivetScraper:
    def __init__(self, base_url="https://forvaltningshistorik.riksarkivet.se/"):
        self.base_url = base_url
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.visited_urls = set()
        self.all_content = {}
        self.to_visit = set()

    def is_valid_url(self, url):
        """Check if URL belongs to the target domain"""
        parsed = urlparse(url)
        base_parsed = urlparse(self.base_url)

        # Only scrape URLs from the same domain
        if parsed.netloc and parsed.netloc != base_parsed.netloc:
            return False

        # Skip non-HTML files (images, PDFs, etc.)
        if url.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.pdf', '.doc', '.xls')):
            return False

        return True

    def extract_links(self, soup, current_url):
        """Extract all valid links from a page"""
        links = set()

        for tag in soup.find_all(['a', 'link']):
            href = tag.get('href')
            if href:
                # Skip anchors and mailto links
                if href.startswith('#') or href.startswith('mailto:'):
                    continue

                # Convert relative URLs to absolute
                absolute_url = urljoin(current_url, href)

                if self.is_valid_url(absolute_url):
                    links.add(absolute_url)

        return links

    def scrape_page(self, url):
        """Scrape a single page and extract its content and links"""
        if url in self.visited_urls:
            return None

        try:
            print(f"Scraping: {url}")
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            # Handle encoding
            if response.apparent_encoding:
                response.encoding = response.apparent_encoding
            else:
                response.encoding = 'iso-8859-1'

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract content
            title = soup.title.string if soup.title else "No title"

            # Get text content
            text_content = soup.get_text(separator='\n', strip=True)

            # Get raw HTML
            html_content = response.text

            # Store the content
            self.all_content[url] = {
                'title': title,
                'text': text_content,
                'html': html_content,
                'links_found': []
            }

            # Mark as visited
            self.visited_urls.add(url)

            # Find all links on this page
            new_links = self.extract_links(soup, url)
            self.all_content[url]['links_found'] = list(new_links)

            # Add new links to the queue
            for link in new_links:
                if link not in self.visited_urls:
                    self.to_visit.add(link)

            return new_links

        except Exception as e:
            print(f"  Error scraping {url}: {e}")
            self.visited_urls.add(url)  # Mark as visited even if failed
            return None

    def scrape_all(self, start_url=None):
        """Recursively scrape all pages starting from the index"""
        if not start_url:
            start_url = urljoin(self.base_url, "Index.htm")

        # Initialize with start URL
        self.to_visit.add(start_url)

        # Keep scraping until we've visited all discovered URLs
        while self.to_visit:
            # Get next URL to visit
            current_url = self.to_visit.pop()

            if current_url not in self.visited_urls:
                # Scrape the page (this will add new URLs to self.to_visit)
                self.scrape_page(current_url)

                # Be respectful - add delay
                time.sleep(0.5)

                # Progress update
                print(f"  Progress: Visited {len(self.visited_urls)} pages, "
                      f"{len(self.to_visit)} remaining in queue")

        print(f"\nScraping complete! Total pages scraped: {len(self.all_content)}")
        return self.all_content

    def save_to_files(self, output_dir="riksarkivet_data"):
        """Save all scraped content to files"""
        os.makedirs(output_dir, exist_ok=True)

        # Save individual HTML files
        html_dir = os.path.join(output_dir, "html")
        os.makedirs(html_dir, exist_ok=True)

        # Save individual text files
        text_dir = os.path.join(output_dir, "text")
        os.makedirs(text_dir, exist_ok=True)

        # Create index of all pages
        index_data = {}

        for url, content in self.all_content.items():
            # Create filename from URL
            parsed = urlparse(url)
            filename = parsed.path.strip('/').replace('/', '_')
            if not filename:
                filename = "index"

            # Save HTML
            html_path = os.path.join(html_dir, filename)
            with open(html_path, 'w', encoding='utf-8') as f:
                f.write(content['html'])

            # Save text
            text_path = os.path.join(text_dir, filename.replace('.htm', '.txt'))
            with open(text_path, 'w', encoding='utf-8') as f:
                f.write(f"Title: {content['title']}\n")
                f.write(f"URL: {url}\n")
                f.write(f"{'='*50}\n\n")
                f.write(content['text'])

            # Add to index
            index_data[url] = {
                'title': content['title'],
                'html_file': html_path,
                'text_file': text_path,
                'links': content['links_found']
            }

        # Save index as JSON
        with open(os.path.join(output_dir, 'index.json'), 'w', encoding='utf-8') as f:
            json.dump(index_data, f, ensure_ascii=False, indent=2)

        # Save all text in one file
        with open(os.path.join(output_dir, 'all_content.txt'), 'w', encoding='utf-8') as f:
            for url, content in self.all_content.items():
                f.write(f"\n{'='*80}\n")
                f.write(f"URL: {url}\n")
                f.write(f"Title: {content['title']}\n")
                f.write(f"{'='*80}\n")
                f.write(content['text'])
                f.write(f"\n\n")

        print(f"All content saved to {output_dir}/")
        print(f"  - HTML files: {html_dir}/")
        print(f"  - Text files: {text_dir}/")
        print(f"  - Complete index: {output_dir}/index.json")
        print(f"  - All text: {output_dir}/all_content.txt")


# Run the scraper
if __name__ == "__main__":
    scraper = RiksarkivetScraper()

    # Scrape everything
    all_content = scraper.scrape_all()

    # Save to files
    scraper.save_to_files()

    # Print summary
    print(f"\nSummary:")
    print(f"Total pages scraped: {len(all_content)}")
    print(f"Total links discovered: {sum(len(c['links_found']) for c in all_content.values())}")
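
Taken together, the two halves of the file form a scrape-then-convert pipeline: the scraper populates riksarkivet_data/html/, and the converter turns those pages into Markdown. A minimal driver sketch, assuming the file is importable as _scrape_markdown_convert (the two __main__ blocks do not run on import):

# Hypothetical driver script; the module name is assumed from the filename above.
from _scrape_markdown_convert import RiksarkivetScraper, convert_all_files

scraper = RiksarkivetScraper()
scraper.scrape_all()        # crawl forvaltningshistorik.riksarkivet.se starting at Index.htm
scraper.save_to_files()     # write riksarkivet_data/html/, text/ and index.json
convert_all_files()         # convert riksarkivet_data/html/*.htm to riksarkivet_data/markdown/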

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/AI-Riksarkivet/ra-mcp'
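
The same lookup can also be scripted; a minimal sketch using the requests library (the response body is printed as-is rather than assuming a particular schema):

import requests

# Query the Glama MCP directory for this server (same endpoint as the curl example above)
url = "https://glama.ai/api/mcp/v1/servers/AI-Riksarkivet/ra-mcp"
response = requests.get(url, timeout=10)
response.raise_for_status()
print(response.text)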

If you have feedback or need assistance with the MCP directory API, please join our Discord server.