OpenTK Model Context Protocol Server

Overview Schema Related Servers Score Discussions

html-parser.ts•23.4 KiB

/**
 * Utility functions for parsing HTML content.
 *
 * All parsers here are regex-based and work on the raw HTML string; they do
 * not build a DOM. Each parser returns an empty result (`null` or `[]`) when
 * the input is empty or the expected markup is not found.
 */

interface DocumentDetails {
  title: string | null;
  type: string | null;
  nummer: string | null;
  datum: string | null;
  bijgewerkt: string | null;
  versie: number | null;
  directLinkPdf: string | null;
  tweedekamerLink: string | null;
  bijlageBij: {
    title: string | null;
    nummer: string | null;
    link: string;
  } | null;
}

interface Committee {
  id: string;
  name: string;
  url: string;
}

interface CommitteeDetails extends Committee {
  description?: string | null;
  members?: Array<{
    name: string;
    id: string;
    party?: string;
    role?: string;
  }>;
  recentActivities?: Array<{
    title: string;
    date: string;
    url: string;
  }>;
}

interface Activity {
  id: string;
  title: string;
  date: string;
  time?: string;
  location?: string;
  committee?: string;
  url: string;
  type?: string;
}

interface VotingResult {
  id: string;
  title: string;
  date: string;
  result: 'Aangenomen' | 'Verworpen' | 'Ingetrokken' | 'Aangehouden' | string;
  submitter?: string;
  votes?: {
    voor: string[];
    tegen: string[];
    voorAantal: number;
    tegenAantal: number;
  };
  url: string;
}

interface RecentDocument {
  id: string;
  title: string;
  type: string;
  date: string;
  updated: string;
  committee?: string;
  subject?: string;
  url: string;
}

interface BirthdayPerson {
  id: string;
  name: string;
  party?: string;
  url: string;
}

interface OverviewData {
  recentDocuments: RecentDocument[];
  birthdays: BirthdayPerson[];
  lastUpdated: string;
  pagination: {
    currentPage: number;
    hasMoreDocuments: boolean;
    totalDocumentsRetrieved: number;
  };
}

/**
 * Returns the trimmed capture group `group` of the first match of `regex`
 * in `html`, or `null` when there is no match or the group is empty.
 */
const extractValue = (html: string, regex: RegExp, group: number = 1): string | null => {
  const captured = html.match(regex)?.[group];
  return captured ? captured.trim() : null;
};

/** Removes every `<...>` tag from a fragment and trims the remaining text. */
const stripTags = (fragment: string): string => fragment.replace(/<[^>]+>/g, "").trim();

/**
 * Runs a global regex over `source` and returns capture group 1 of every
 * match, skipping matches whose group 1 is empty.
 */
function collectCaptures(source: string, pattern: RegExp): string[] {
  const captures: string[] = [];
  pattern.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = pattern.exec(source)) !== null) {
    if (match[1]) {
      captures.push(match[1]);
    }
  }
  return captures;
}

/** Returns the inner HTML of every non-empty `<tr>` in a table body. */
const extractRows = (tableBody: string): string[] =>
  collectCaptures(tableBody, /<tr[^>]*>([\s\S]*?)<\/tr>/gi);

/**
 * Returns the trimmed inner HTML of every `<td>` in a row.
 *
 * NOTE(review): a completely empty `<td></td>` is skipped rather than kept as
 * an empty string (a whitespace-only cell is kept as ""), so the indices of
 * later columns shift left when an earlier cell is empty. This mirrors the
 * long-standing behaviour callers were written against; confirm against the
 * live markup before changing it.
 */
const extractCells = (row: string): string[] =>
  collectCaptures(row, /<td[^>]*>([\s\S]*?)<\/td>/gi).map((cell) => cell.trim());

/** Parses a vote-count cell; a non-numeric cell counts as 0 instead of NaN. */
function parseCount(cell: string): number {
  const parsed = parseInt(stripTags(cell), 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}

/**
 * Finds the cell that contains `marker` and returns the "|"-separated party
 * names captured by `pattern` (capture group 1), trimmed, without empties.
 */
function parsePartyList(cells: string[], marker: string, pattern: RegExp): string[] {
  const cell = cells.find((candidate) => candidate.includes(marker)) || "";
  const listText = cell ? cell.match(pattern)?.[1] : undefined;
  if (!listText) {
    return [];
  }
  return listText
    .trim()
    .split("|")
    .map((party) => party.trim())
    .filter((party) => party);
}

/**
 * Extracts the relative "Directe link naar document" href from a document page.
 *
 * @param html The HTML content of the document page
 * @returns `'NOT_FOUND'` when the page carries the "Found nothing" marker, the
 *          relative link when it starts with `getraw/`, otherwise `null`
 */
export function extractDocumentLink(html: string): string | null {
  // The server answers with this marker text when the document does not exist
  if (html.includes('Found nothing in document.html!!')) {
    return 'NOT_FOUND';
  }

  const match = html.match(/<a href="([^"]+)"[^>]*>Directe link naar document<\/a>/i);
  const link = match?.[1];

  // Only relative links starting with 'getraw/' are accepted; the caller
  // resolves them against its base URL.
  if (link && link.startsWith('getraw/')) {
    return link;
  }
  return null;
}

/**
 * Extracts title, type, metadata and links from a document page.
 *
 * @param html The HTML content of the document page
 * @param baseUrl The base URL for resolving relative URLs
 * @returns The document details, or `null` when `html` is empty
 */
export function extractDocumentDetailsFromHtml(html: string, baseUrl: string): DocumentDetails | null {
  if (!html) {
    return null;
  }

  const details: DocumentDetails = {
    title: null,
    type: null,
    nummer: null,
    datum: null,
    bijgewerkt: null,
    versie: null,
    directLinkPdf: null,
    tweedekamerLink: null,
    bijlageBij: null,
  };

  // Basic info
  details.title = extractValue(html, /<hblock>\s*<h2>([\s\S]*?)<\/h2>/i);
  details.type = extractValue(html, /<\/hblock>[\s\S]*?<p><em>([\s\S]*?)<\/em><\/p>/i);

  // Metadata line: "Nummer: ..., datum: ..., bijgewerkt: ..., versie: N"
  const metadataMatch = html.match(
    /<p>Nummer: <b>(.*?)<\/b>, datum: <b>(.*?)<\/b>, bijgewerkt: <b>(.*?)<\/b>, versie: (\d+)/i
  );
  if (metadataMatch) {
    details.nummer = metadataMatch[1]?.trim() || null;
    details.datum = metadataMatch[2]?.trim() || null;
    details.bijgewerkt = metadataMatch[3]?.trim() || null;
    details.versie = metadataMatch[4] ? parseInt(metadataMatch[4], 10) : null;
  }

  // Direct link: make sure the resulting URL contains the /tkconv/ path
  const rawPath = html.match(/<a href="(getraw\/[^"']+)">Directe link naar document<\/a>/i)?.[1];
  if (rawPath) {
    if (baseUrl.endsWith('/tkconv')) {
      details.directLinkPdf = `${baseUrl}/${rawPath}`;
    } else if (baseUrl.includes('/tkconv/')) {
      details.directLinkPdf = new URL(rawPath, baseUrl).href;
    } else {
      details.directLinkPdf = `${baseUrl}/tkconv/${rawPath}`;
    }
  }

  details.tweedekamerLink = extractValue(
    html,
    /<a href="(https:\/\/www\.tweedekamer\.nl\/[^"']+)">link naar pagina op de Tweede Kamer site<\/a>/i
  );

  // "Bijlage bij: <a ...>title</a> (nummer)" — the parentheses are literal
  // characters in the page and therefore must be escaped in the pattern.
  const bijlageMatch = html.match(
    /<p>Bijlage bij: <a href="(document\.html\?nummer=[^"]+)">([\s\S]*?)<\/a> \(([^)]+)\)<\/p>/i
  );
  if (bijlageMatch && bijlageMatch[1]) {
    details.bijlageBij = {
      title: bijlageMatch[2]?.trim() || null,
      nummer: bijlageMatch[3]?.trim() || null,
      link: new URL(bijlageMatch[1], baseUrl).href,
    };
  }

  return details;
}

/**
 * Extracts committee information from the committees HTML page
 * @param html The HTML content of the committees page
 * @param baseUrl The base URL for resolving relative URLs
 * @returns Array of committee objects
 */
export function extractCommitteesFromHtml(html: string, baseUrl: string): Committee[] {
  if (!html) {
    return [];
  }

  // Body of the first table on the page
  const tableContent = html.match(/<table[^>]*>[\s\S]*?<tbody>([\s\S]*?)<\/tbody>/i)?.[1];
  if (!tableContent) {
    return [];
  }

  const committees: Committee[] = [];
  for (const rowContent of extractRows(tableContent)) {
    // The committee ID and name both come from the commissie.html link
    const linkMatch = rowContent.match(/<a href="(commissie\.html\?id=([^"]+))">([^<]+)<\/a>/i);
    if (linkMatch && linkMatch[1] && linkMatch[2] && linkMatch[3]) {
      committees.push({
        id: linkMatch[2],
        name: linkMatch[3].trim(),
        url: new URL(linkMatch[1], baseUrl).href,
      });
    }
  }
  return committees;
}

/**
 * Extracts detailed committee information from a committee page
 * @param html The HTML content of the committee page
 * @param baseUrl The base URL for resolving relative URLs
 * @param committeeId The committee ID, used for the `id` and `url` fields
 * @returns Committee details object, or `null` when `html` is empty or no
 *          committee name can be found in an `<h2>` or `<title>` tag
 */
export function extractCommitteeDetailsFromHtml(html: string, baseUrl: string, committeeId: string): CommitteeDetails | null {
  if (!html) {
    return null;
  }

  // Prefer the <h2> heading for the name; fall back to the <title> tag
  const name =
    html.match(/<h2>([^<]+)<\/h2>/i)?.[1]?.trim() ||
    html.match(/<title>([^<]+)<\/title>/i)?.[1]?.trim() ||
    "";
  if (!name) {
    return null;
  }

  const members: NonNullable<CommitteeDetails['members']> = [];
  const recentActivities: NonNullable<CommitteeDetails['recentActivities']> = [];

  const details: CommitteeDetails = {
    id: committeeId,
    name,
    url: `${baseUrl}/commissie.html?id=${encodeURIComponent(committeeId)}`,
    members,
    recentActivities,
  };

  // Description, if the page has one
  details.description = html.match(/<p class="description">([^<]+)<\/p>/i)?.[1]?.trim() || null;

  // The first table with a <thead> holds the members, the second the activities
  const tablePattern = /<table[^>]*>[\s\S]*?<thead>[\s\S]*?<\/thead>[\s\S]*?<tbody>([\s\S]*?)<\/tbody>/gi;
  const tableBodies: string[] = [];
  let tableMatch: RegExpExecArray | null;
  while (tableBodies.length < 2 && (tableMatch = tablePattern.exec(html)) !== null) {
    tableBodies.push(tableMatch[1] ?? "");
  }
  const [membersBody = "", activitiesBody = ""] = tableBodies;

  for (const rowContent of extractRows(membersBody)) {
    const cells = extractCells(rowContent);
    if (cells.length < 3) continue;

    const role = cells[1] ? stripTags(cells[1]) : "";

    // Name and person ID come from the persoon.html link
    const nameCell = cells[2] || "";
    const nameMatch = nameCell.match(/<a href="persoon\.html\?nummer=([^"]+)">([^<]+)<\/a>/);
    if (!nameMatch || !nameMatch[1] || !nameMatch[2]) continue;

    // The party, when present, follows the link in literal parentheses: "</a> (Party)"
    const party = nameCell.match(/>([^<]+)<\/a>\s*\(([^)]+)\)/)?.[2]?.trim() ?? "";

    members.push({
      id: nameMatch[1],
      name: nameMatch[2].trim(),
      role: role || undefined,
      party: party || undefined,
    });
  }

  for (const rowContent of extractRows(activitiesBody)) {
    const cells = extractCells(rowContent);
    if (cells.length < 2) continue;

    const date = cells[0] ? stripTags(cells[0]) : "";
    if (!date) continue;

    const titleCell = cells[1] || "";
    const titleMatch = titleCell.match(/<a href="(activiteit\.html\?nummer=([^"]+))">([^<]+)<\/a>/);
    if (!titleMatch || !titleMatch[1] || !titleMatch[3]) continue;

    recentActivities.push({
      title: titleMatch[3].trim(),
      date,
      url: new URL(titleMatch[1], baseUrl).href,
    });
  }

  return details;
}

/**
 * Extracts upcoming activities from the activities HTML page
 * @param html The HTML content of the activities page
 * @param baseUrl The base URL for resolving relative URLs
 * @returns Array of activity objects
 */
export function extractActivitiesFromHtml(html: string, baseUrl: string): Activity[] {
  if (!html) {
    return [];
  }

  // Body of the first table on the page
  const tableContent = html.match(/<table[^>]*>[\s\S]*?<tbody>([\s\S]*?)<\/tbody>/i)?.[1];
  if (!tableContent) {
    return [];
  }

  const activities: Activity[] = [];
  for (const rowContent of extractRows(tableContent)) {
    const cells = extractCells(rowContent);
    if (cells.length < 3) continue;

    // Column 0: date with an optional HH:MM time
    const dateCell = cells[0];
    if (!dateCell) continue;
    const dateMatch = dateCell.match(/(\d{4}-\d{2}-\d{2})(?:\s+(\d{2}:\d{2}))?/);
    if (!dateMatch || !dateMatch[1]) continue;
    const date = dateMatch[1];
    const time = dateMatch[2] || undefined;

    // Column 1: location (zaal)
    const location = cells[1] ? stripTags(cells[1]) || undefined : undefined;

    // Column 2: committee; the full name from an <abbr title> wins when present
    let committee: string | undefined = undefined;
    if (cells[2]) {
      committee = stripTags(cells[2]) || undefined;
      const abbrTitle = cells[2].match(/<abbr title="([^"]+)">/i)?.[1];
      if (abbrTitle) {
        committee = abbrTitle.trim();
      }
    }

    // Column 3: subject with the activiteit.html link (required)
    const titleCell = cells[3];
    if (!titleCell) continue;
    const titleMatch = titleCell.match(/<a href="(activiteit\.html\?nummer=([^"]+))">([^<]+)<\/a>/);
    if (!titleMatch || !titleMatch[1] || !titleMatch[2] || !titleMatch[3]) continue;

    // Column 4: type/description
    const type = cells[4] ? stripTags(cells[4]) || undefined : undefined;

    activities.push({
      id: titleMatch[2],
      title: titleMatch[3].trim(),
      date,
      time,
      location,
      committee,
      type,
      url: new URL(titleMatch[1], baseUrl).href,
    });
  }
  return activities;
}

/**
 * Extracts voting results from the stemmingen HTML page
 * @param html The HTML content of the stemmingen page
 * @param baseUrl The base URL for resolving relative URLs
 * @returns Array of voting result objects
 */
export function extractVotingResultsFromHtml(html: string, baseUrl: string): VotingResult[] {
  if (!html) {
    return [];
  }

  const votingResults: VotingResult[] = [];

  // Each <tbody> holds one vote: a main row followed by a parties row
  for (const tbodyContent of collectCaptures(html, /<tbody>([\s\S]*?)<\/tbody>/gi)) {
    const rows = extractRows(tbodyContent);
    const mainRow = rows[0];
    const partiesRow = rows[1];
    if (mainRow === undefined || partiesRow === undefined) continue;

    // Main row: date, title, submitter, result, votes for[, votes against]
    const cells = extractCells(mainRow);
    if (cells.length < 5) continue;

    const date = stripTags(cells[0] || "");

    const titleMatch = (cells[1] || "").match(/<a href="(zaak\.html\?nummer=([^"]+))">([^<]+)<\/a>/);
    if (!titleMatch || !titleMatch[1] || !titleMatch[2] || !titleMatch[3]) continue;

    const submitter = cells[2] ? stripTags(cells[2]) : "";
    const result = stripTags(cells[3] || "");

    // A missing or non-numeric count is reported as 0 rather than NaN
    const forVotes = cells[4] ? parseCount(cells[4]) : 0;
    const againstVotes = cells[5] ? parseCount(cells[5]) : 0;

    // Parties row: "<b>Voor</b>: A | B" and "<b>Tegen</b>: C | D"
    const partiesCells = extractCells(partiesRow);
    const forParties = parsePartyList(
      partiesCells,
      "<b>Voor</b>:",
      /<b>Voor<\/b>:\s*(.*?)(?:<\/td>|$)/i
    );
    const againstParties = parsePartyList(
      partiesCells,
      "<b>Tegen</b>:",
      /<b>Tegen<\/b>:\s*(.*?)(?:<\/td>|$)/i
    );

    votingResults.push({
      id: titleMatch[2],
      title: titleMatch[3].trim(),
      date,
      result,
      submitter: submitter || undefined,
      votes: {
        voor: forParties,
        tegen: againstParties,
        voorAantal: forVotes,
        tegenAantal: againstVotes,
      },
      url: new URL(titleMatch[1], baseUrl).href,
    });
  }

  return votingResults;
}

/**
 * Extracts overview information from the main tkconv page
 * @param html The HTML content of the main page
 * @param baseUrl The base URL for resolving relative URLs
 * @param page The page number to retrieve (default: 1); values that are not
 *             finite or are below 1 are treated as page 1, fractions are floored
 * @returns Overview data including recent documents and birthdays
 */
export function extractOverviewFromHtml(html: string, baseUrl: string, page: number = 1): OverviewData {
  if (!html) {
    return {
      recentDocuments: [],
      birthdays: [],
      lastUpdated: new Date().toISOString(),
      pagination: {
        currentPage: 1,
        hasMoreDocuments: false,
        totalDocumentsRetrieved: 0,
      },
    };
  }

  const maxDocuments = 20; // cap to avoid overwhelming the response
  const documentsPerPage = 10;

  const recentDocuments: RecentDocument[] = [];
  const birthdays: BirthdayPerson[] = [];
  const lastUpdated = new Date().toISOString();

  // Recent documents: body of the first table on the page
  const tableContent = html.match(/<table[^>]*>[\s\S]*?<tbody>([\s\S]*?)<\/tbody>/i)?.[1];
  if (tableContent) {
    for (const rowContent of extractRows(tableContent)) {
      // Columns: date, updated, committee, subject, title/type
      const cells = extractCells(rowContent);
      if (cells.length < 5) continue;

      const date = cells[0] ? stripTags(cells[0]) : "";
      if (!date) continue;

      const updated = cells[1] ? stripTags(cells[1]) : "";
      const committee = cells[2] ? stripTags(cells[2]) : undefined;
      const subject = cells[3] ? stripTags(cells[3]) : undefined;
      const titleCell = cells[4] || "";

      let id: string;
      let title: string;
      let url: string;
      const docLinkMatch = titleCell.match(/<a href="document\.html\?nummer=([^"]+)">([\s\S]*?)<\/a>/i);
      if (docLinkMatch && docLinkMatch[1] && docLinkMatch[2]) {
        id = docLinkMatch[1];
        title = docLinkMatch[2].trim();
        url = new URL(`document.html?nummer=${id}`, baseUrl).href;
      } else {
        // No link: fall back to the cell text and a generated placeholder ID
        title = stripTags(titleCell);
        id = `unknown-${Date.now()}-${Math.floor(Math.random() * 1000)}`;
        url = baseUrl;
      }

      // The document type is the text following the first <br> in the cell
      const typeText = titleCell.match(/<br\s*\/?>\s*(.*?)(?:<|$)/i)?.[1];
      const type = typeText ? typeText.trim() : "Unknown";

      recentDocuments.push({ id, title, type, date, updated, committee, subject, url });

      if (recentDocuments.length >= maxDocuments) break;
    }
  }

  // Birthdays: a run of person links following "Jarig vandaag"
  const birthdayContent = html.match(/Jarig vandaag\s*((?:<a[^>]*>[^<]*<\/a>\s*)+)/i)?.[1];
  if (birthdayContent) {
    for (const linkMatch of birthdayContent.matchAll(/<a href="persoon\.html\?nummer=([^"]+)">([\s\S]*?)<\/a>/gi)) {
      const id = linkMatch[1];
      const rawLabel = linkMatch[2];
      if (!id || !rawLabel) continue;

      // Link text is either "Name" or "Name (Party)"; the parentheses are literal
      const nameWithParty = rawLabel.trim();
      const namePartyMatch = nameWithParty.match(/(.*?)\s*\((.*?)\)\s*$/);

      let name = nameWithParty;
      let party: string | undefined = undefined;
      if (namePartyMatch && namePartyMatch[1] && namePartyMatch[2]) {
        name = namePartyMatch[1].trim();
        party = namePartyMatch[2].trim();
      }

      birthdays.push({
        id,
        name,
        party,
        url: new URL(`persoon.html?nummer=${id}`, baseUrl).href,
      });
    }
  }

  // The tkconv site has no explicit pagination, so it is simulated by slicing
  // the documents collected above. Guard the page number so that 0, negative,
  // fractional or non-finite values cannot produce a nonsensical slice.
  const currentPage = Number.isFinite(page) && page >= 1 ? Math.floor(page) : 1;
  const startIndex = (currentPage - 1) * documentsPerPage;
  const endIndex = startIndex + documentsPerPage;

  return {
    recentDocuments: recentDocuments.slice(startIndex, endIndex),
    birthdays, // Birthdays are always shown regardless of page
    lastUpdated,
    pagination: {
      currentPage,
      hasMoreDocuments: endIndex < recentDocuments.length,
      totalDocumentsRetrieved: recentDocuments.length,
    },
  };
}

Loading blob content...

Implementation Reference

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/r-huijts/opentk-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

html-parser.ts•23.4 KiB