eu-regulations

Overview Schema Related Servers Score Discussions

eu-regulations
scripts

ingest-eurlex.ts•12.1 KiB

#!/usr/bin/env npx tsx
/**
 * Ingest EU regulations from EUR-Lex.
 *
 * Usage: npx tsx scripts/ingest-eurlex.ts <celex_id> <output_file> [--browser]
 * Example: npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json
 * Example (with browser): npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json --browser
 */

import { writeFileSync } from 'fs';
import { JSDOM } from 'jsdom';
import { fetchEurLexWithBrowser } from './ingest-eurlex-browser.js';

/** One article of a regulation as extracted from the flattened page text. */
interface Article {
  number: string;
  title?: string;
  text: string;
  chapter?: string;
}

/** One numbered definition extracted from the definitions article. */
interface Definition {
  term: string;
  definition: string;
  article: string;
}

/** One numbered recital from the preamble. */
interface Recital {
  recital_number: number;
  text: string;
  related_articles?: string;
}

/** Shape of the JSON document written to the output file. */
interface RegulationData {
  id: string;
  full_name: string;
  celex_id: string;
  effective_date?: string;
  eur_lex_url: string;
  articles: Article[];
  definitions: Definition[];
  recitals?: Recital[];
}

/** Known CELEX IDs mapped to the short id / name / date stored with the output. */
const REGULATION_METADATA: Record<string, { id: string; full_name: string; effective_date?: string }> = {
  '32016R0679': { id: 'GDPR', full_name: 'General Data Protection Regulation', effective_date: '2018-05-25' },
  '32022L2555': { id: 'NIS2', full_name: 'Directive on measures for a high common level of cybersecurity across the Union', effective_date: '2024-10-17' },
  '32022R2554': { id: 'DORA', full_name: 'Digital Operational Resilience Act', effective_date: '2025-01-17' },
  '32024R1689': { id: 'AI_ACT', full_name: 'Artificial Intelligence Act', effective_date: '2024-08-01' },
  '32024R2847': { id: 'CRA', full_name: 'Cyber Resilience Act', effective_date: '2024-12-10' },
  '32019R0881': { id: 'CYBERSECURITY_ACT', full_name: 'EU Cybersecurity Act', effective_date: '2019-06-27' },
  '32024R1183': { id: 'EIDAS2', full_name: 'European Digital Identity Framework (eIDAS 2.0)', effective_date: '2024-05-20' },
  '02014R0910-20241018': { id: 'EIDAS2', full_name: 'European Digital Identity Framework (eIDAS 2.0)', effective_date: '2024-05-20' },
  // Digital Single Market regulations
  '32023R2854': { id: 'DATA_ACT', full_name: 'Data Act', effective_date: '2025-09-12' },
  '32022R2065': { id: 'DSA', full_name: 'Digital Services Act', effective_date: '2024-02-17' },
  '32022R1925': { id: 'DMA', full_name: 'Digital Markets Act', effective_date: '2023-05-02' },
  // UN Regulations (adopted by EU)
  '42021X0387': { id: 'UN_R155', full_name: 'UN Regulation No. 155 - Cyber security and cyber security management system', effective_date: '2021-01-22' },
  '42025X0005': { id: 'UN_R155', full_name: 'UN Regulation No. 155 - Cyber security and cyber security management system (Supplement 3)', effective_date: '2025-01-10' },
};

// Patterns are hoisted so they are compiled once rather than per line.
const RECITALS_START = /^(?:Having regard to|Whereas:)/i;
const RECITALS_END = /^(?:HAVE ADOPTED|Article\s+1$)/i;
/** Recital marker at the start of a line: "(1)", "(123)", ... (no end anchor: text may follow). */
const RECITAL_NUMBER = /^\((\d+)\)/;
const ARTICLE_HEADING = /^Article\s+(\d+[a-z]?)$/i;
const CHAPTER_HEADING = /^CHAPTER\s+([IVXLC]+)/i;
/** Splits an article number such as "5a" into its numeric and letter parts. */
const ARTICLE_NUMBER_PARTS = /^(\d+)([a-z]?)$/i;

/**
 * Download the English HTML rendition of a document from EUR-Lex.
 *
 * @param celexId    CELEX identifier of the document.
 * @param useBrowser When true, delegate to `fetchEurLexWithBrowser` instead of a plain `fetch`.
 * @returns The response body as text.
 * @throws Error when the direct fetch returns a non-OK HTTP status.
 */
async function fetchEurLexHtml(celexId: string, useBrowser = false): Promise<string> {
  if (useBrowser) {
    console.log('Using Puppeteer to bypass WAF...');
    return fetchEurLexWithBrowser(celexId);
  }

  // Fallback to direct fetch (will fail with WAF)
  const url = `https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:${celexId}`;
  console.log(`Fetching: ${url}`);

  const response = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (compatible; EU-Compliance-MCP/1.0; +https://github.com/Ansvar-Systems/EU_compliance_MCP)',
      'Accept': 'text/html',
    },
  });

  if (!response.ok) {
    throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
  }

  return response.text();
}

/** Flatten an HTML document to its body text, split into trimmed, non-empty lines. */
function extractTextLines(html: string): string[] {
  const doc = new JSDOM(html).window.document;
  const allText = doc.body?.textContent ?? '';
  return allText
    .split('\n')
    .map(l => l.trim())
    .filter(l => l.length > 0);
}

/** Append a pending recital to `out` if it has collected any text. */
function pushRecital(out: Recital[], pending: { number: number; lines: string[] } | null): void {
  if (pending && pending.lines.length > 0) {
    out.push({
      recital_number: pending.number,
      text: pending.lines.join('\n\n'),
    });
  }
}

/**
 * Extract the numbered recitals from the preamble.
 *
 * Collection starts at the first "Having regard to" / "Whereas:" line and stops at
 * "HAVE ADOPTED ..." or a bare "Article 1" line. Each "(N)" marker opens a new recital;
 * following lines are appended to it, joined with blank lines.
 *
 * @param html Full HTML of the EUR-Lex page.
 * @returns Recitals in document order.
 */
function parseRecitals(html: string): Recital[] {
  const recitals: Recital[] = [];
  let inRecitalsSection = false;
  let currentRecital: { number: number; lines: string[] } | null = null;

  for (const line of extractTextLines(html)) {
    // Detect start of recitals section
    if (RECITALS_START.test(line)) {
      inRecitalsSection = true;
      continue;
    }

    // Detect end of recitals. The pending recital is flushed exactly once, after the loop;
    // flushing here as well would emit the last recital twice.
    if (RECITALS_END.test(line)) {
      break;
    }

    if (!inRecitalsSection) continue;

    const recitalMatch = RECITAL_NUMBER.exec(line);
    if (recitalMatch) {
      // Save previous recital, then start a new one
      pushRecital(recitals, currentRecital);
      currentRecital = {
        number: parseInt(recitalMatch[1] ?? '', 10),
        lines: [],
      };

      // Keep any text that follows the "(N)" marker on the same line
      const textAfterNumber = line.substring(recitalMatch[0].length).trim();
      if (textAfterNumber) {
        currentRecital.lines.push(textAfterNumber);
      }
      continue;
    }

    // Continuation line of the current recital (lines before the first marker are ignored)
    if (currentRecital) {
      currentRecital.lines.push(line);
    }
  }

  // Flush the last recital (covers both the `break` above and running off the end)
  pushRecital(recitals, currentRecital);

  return recitals;
}

/** Article being accumulated while scanning lines. */
interface PendingArticle {
  number: string;
  title?: string;
  chapter?: string;
  lines: string[];
}

/** Append a pending article to `out` if it has collected any body text. */
function pushArticle(out: Article[], pending: PendingArticle | null): void {
  if (pending && pending.lines.length > 0) {
    out.push({
      number: pending.number,
      title: pending.title,
      text: pending.lines.join('\n\n'),
      chapter: pending.chapter,
    });
  }
}

/** Order article numbers numerically, then by letter suffix ("5" < "5a" < "6"). */
function compareArticleNumbers(a: string, b: string): number {
  const matchA = ARTICLE_NUMBER_PARTS.exec(a);
  const matchB = ARTICLE_NUMBER_PARTS.exec(b);
  if (!matchA || !matchB) return 0;

  const numA = parseInt(matchA[1] ?? '', 10);
  const numB = parseInt(matchB[1] ?? '', 10);
  if (numA !== numB) return numA - numB;

  // Empty suffix sorts before letters
  return (matchA[2] ?? '').toLowerCase().localeCompare((matchB[2] ?? '').toLowerCase());
}

/**
 * Extract numbered definitions from the first article whose title contains "definition".
 *
 * Handles:
 *  - complex definitions with internal periods/semicolons
 *  - 'term' or 'alternate' means... patterns (NIS2 Art 6)
 *  - 'term1', 'term2' and 'term3' mean... patterns (CRA Art 3)
 *  - 'term' of the something means... patterns (GDPR Art 4)
 *  - mean, respectively... patterns (CRA Art 3)
 *  - means: (a) ... patterns (complex definitions with sub-parts)
 */
function extractDefinitions(articles: Article[]): Definition[] {
  const definitions: Definition[] = [];

  const defsArticle = articles.find(a => a.title?.toLowerCase().includes('definition'));
  if (!defsArticle || !defsArticle.text.includes('means')) {
    return definitions;
  }

  // Normalize text: collapse whitespace and turn curly single quotes into straight ones
  const normalizedText = defsArticle.text
    .replace(/\s+/g, ' ')
    .replace(/[\u2018\u2019]/g, "'");

  // "(N) 'term' ... means <definition>", where the definition runs up to the next
  // "(N) '" entry or the end of the text.
  const defRegex = /\((\d+)\)\s*'([^']+)'(?:[^(]*?)means?[,:;]?\s+(.+?)(?=\(\d+\)\s*'|$)/g;

  for (const defMatch of normalizedText.matchAll(defRegex)) {
    const term = (defMatch[2] ?? '').trim().toLowerCase();
    const definition = (defMatch[3] ?? '').trim();

    // Only add if we got meaningful content
    if (term.length > 0 && definition.length > 10) {
      definitions.push({
        term,
        definition,
        article: defsArticle.number,
      });
    }
  }

  return definitions;
}

/**
 * Extract articles (and the definitions found in the definitions article) from the page.
 *
 * A bare "Article N" line opens an article; the first following line is taken as its title
 * when it is short (< 100 chars) and does not end with a period. "CHAPTER <roman>" lines
 * update the chapter applied to articles that start afterwards. When an article number
 * occurs more than once, the occurrence with the longest text is kept.
 *
 * @param html    Full HTML of the EUR-Lex page.
 * @param celexId CELEX identifier (currently unused; kept for interface stability).
 * @returns Articles sorted by number, plus extracted definitions.
 */
function parseArticles(html: string, celexId: string): { articles: Article[]; definitions: Definition[] } {
  const articles: Article[] = [];
  let currentChapter = '';
  let currentArticle: PendingArticle | null = null;

  for (const line of extractTextLines(html)) {
    const articleStart = ARTICLE_HEADING.exec(line);
    if (articleStart) {
      pushArticle(articles, currentArticle);
      // Record the chapter NOW: by the time this article is flushed, the next chapter
      // heading may already have been seen and would be attributed to the wrong article.
      currentArticle = {
        number: articleStart[1] ?? '',
        chapter: currentChapter || undefined,
        lines: [],
      };
      continue;
    }

    const chapterStart = CHAPTER_HEADING.exec(line);
    if (chapterStart) {
      currentChapter = chapterStart[1] ?? '';
      continue;
    }

    if (currentArticle) {
      // Check if this is a title line (short, no period at end)
      const looksLikeTitle =
        !currentArticle.title &&
        currentArticle.lines.length === 0 &&
        line.length < 100 &&
        !line.endsWith('.');

      if (looksLikeTitle) {
        currentArticle.title = line;
      } else {
        currentArticle.lines.push(line);
      }
    }
  }

  // Don't forget the last article
  pushArticle(articles, currentArticle);

  // Deduplicate articles - keep the one with the most content for each number
  const articleMap = new Map<string, Article>();
  for (const article of articles) {
    const existing = articleMap.get(article.number);
    if (!existing || article.text.length > existing.text.length) {
      articleMap.set(article.number, article);
    }
  }

  const deduplicatedArticles = Array.from(articleMap.values()).sort((a, b) =>
    compareArticleNumbers(a.number, b.number),
  );

  return { articles: deduplicatedArticles, definitions: extractDefinitions(deduplicatedArticles) };
}

/**
 * Fetch, parse and persist one regulation.
 *
 * Writes the parsed regulation as pretty-printed JSON to `outputPath`. If no articles can
 * be parsed, the raw HTML is written next to it (with an `.html` extension) for debugging
 * and no JSON is produced.
 *
 * @param celexId    CELEX identifier of the document.
 * @param outputPath Destination path of the JSON file.
 * @param useBrowser Forwarded to `fetchEurLexHtml`.
 */
async function ingestRegulation(celexId: string, outputPath: string, useBrowser = false): Promise<void> {
  const metadata = REGULATION_METADATA[celexId];
  if (!metadata) {
    console.warn(`Unknown CELEX ID: ${celexId}. Using generic metadata.`);
  }

  const html = await fetchEurLexHtml(celexId, useBrowser);
  console.log(`Fetched ${html.length} bytes`);

  // Parse recitals BEFORE articles
  const recitals = parseRecitals(html);
  console.log(`Parsed ${recitals.length} recitals`);

  const { articles, definitions } = parseArticles(html, celexId);
  console.log(`Parsed ${articles.length} articles, ${definitions.length} definitions`);

  if (articles.length === 0) {
    console.error('No articles found! The HTML structure may have changed.');
    console.log('Saving raw HTML for debugging...');
    // Swap only a trailing ".json"; otherwise append ".html" so the dump can never land on
    // outputPath itself or rewrite a ".json" that appears in a directory name.
    const debugPath = /\.json$/i.test(outputPath)
      ? outputPath.replace(/\.json$/i, '.html')
      : `${outputPath}.html`;
    writeFileSync(debugPath, html);
    return;
  }

  const regulation: RegulationData = {
    id: metadata?.id || celexId,
    full_name: metadata?.full_name || `Regulation ${celexId}`,
    celex_id: celexId,
    effective_date: metadata?.effective_date,
    eur_lex_url: `https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:${celexId}`,
    articles,
    definitions,
    recitals,
  };

  writeFileSync(outputPath, JSON.stringify(regulation, null, 2));
  console.log(`\nSaved to: ${outputPath}`);
  console.log(`Articles: ${articles.length}`);
  console.log(`Definitions: ${definitions.length}`);
  console.log(`Recitals: ${recitals.length}`);
}

// Main
const args = process.argv.slice(2);
const useBrowser = args.includes('--browser');
const [celexId, outputPath] = args.filter(arg => arg !== '--browser');

if (!celexId || !outputPath) {
  console.log('Usage: npx tsx scripts/ingest-eurlex.ts <celex_id> <output_file> [--browser]');
  console.log('Example: npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json');
  console.log('Example (with browser): npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json --browser');
  console.log('\nOptions:');
  console.log(' --browser Use Puppeteer to bypass EUR-Lex WAF challenges');
  console.log('\nKnown CELEX IDs:');
  Object.entries(REGULATION_METADATA).forEach(([id, meta]) => {
    console.log(` ${id} - ${meta.id} (${meta.full_name})`);
  });
  process.exit(1);
}

if (useBrowser) {
  console.log('Browser mode enabled - using Puppeteer to fetch content\n');
}

ingestRegulation(celexId, outputPath, useBrowser).catch((err: unknown) => {
  console.error('Error:', err);
  process.exit(1);
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Mortalus/eu-regulations'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

ingest-eurlex.ts•12.1 KiB