// src/lib/communityBestMatch.ts
// Scrape SAP Community search "Best Match" results directly from the HTML page.
// No external dependencies; best-effort selectors based on current Khoros layout.
import { CONFIG } from "./config.js";
export interface BestMatchHit {
title: string;
url: string;
author?: string;
published?: string; // e.g., "2024 Dec 11 4:31 PM"
likes?: number;
snippet?: string;
tags?: string[];
postId?: string; // extracted from URL for retrieval
}
type Options = {
includeBlogs?: boolean; // default true
limit?: number; // default 20
userAgent?: string; // optional UA override
};
const BASE = "https://community.sap.com";
const buildSearchUrl = (q: string, includeBlogs = true) => {
const params = new URLSearchParams({
collapse_discussion: "true",
q,
});
if (includeBlogs) {
params.set("filter", "includeBlogs");
params.set("include_blogs", "true");
}
// "tab/message" view surfaces posts sorted by Best Match by default
return `${BASE}/t5/forums/searchpage/tab/message?${params.toString()}`;
};
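// Example (hypothetical query):
//   buildSearchUrl("cap cds") ->
//   https://community.sap.com/t5/forums/searchpage/tab/message?collapse_discussion=true&q=cap+cds&filter=includeBlogs&include_blogs=true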
const decodeEntities = (s = "") =>
  s
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&amp;/g, "&"); // decode &amp; last to avoid double-decoding
const stripTags = (html = "") =>
decodeEntities(html.replace(/<[^>]*>/g, " ").replace(/\s+/g, " ").trim());
const absolutize = (href: string) =>
href?.startsWith("http") ? href : new URL(href, BASE).href;
// Extract post ID from URL for later retrieval
const extractPostId = (url: string): string | undefined => {
// Extract from URL patterns like: /ba-p/13961398 or /td-p/13961398
const urlMatch = url.match(/\/(?:ba-p|td-p)\/(\d+)/);
if (urlMatch) {
return urlMatch[1];
}
// Fallback: extract from end of URL
const endMatch = url.match(/\/(\d+)(?:\?|$)/);
return endMatch ? endMatch[1] : undefined;
};
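// e.g. extractPostId("https://community.sap.com/t5/x/y/ba-p/13961398") === "13961398" (URL is illustrative)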
async function fetchText(url: string, userAgent?: string) {
const res = await fetch(url, {
headers: {
"User-Agent": userAgent || "sap-docs-mcp/1.0 (BestMatchScraper)",
"Accept": "text/html,application/xhtml+xml",
},
});
if (!res.ok) throw new Error(`${url} -> ${res.status} ${res.statusText}`);
return res.text();
}
function parseHitsFromHtml(html: string, limit = 20): BestMatchHit[] {
const results: BestMatchHit[] = [];
// Find all message wrapper divs with data-lia-message-uid
const wrapperRegex = /<div[^>]+data-lia-message-uid="([^"]*)"[^>]*class="[^"]*lia-message-view-wrapper[^"]*"[^>]*>([\s\S]*?)(?=<div[^>]+class="[^"]*lia-message-view-wrapper|$)/gi;
let match;
while ((match = wrapperRegex.exec(html)) !== null && results.length < limit) {
const postId = match[1];
const seg = match[2].slice(0, 60000); // safety cap
// Title + URL
const titleMatch =
seg.match(
/<h2[^>]*class="[^"]*message-subject[^"]*"[^>]*>[\s\S]*?<a[^>]*href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i
) ||
seg.match(
/<a[^>]+class="page-link[^"]*"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i
);
const url = titleMatch ? absolutize(decodeEntities(titleMatch[1])) : "";
const title = titleMatch ? stripTags(titleMatch[2]) : "";
if (!title || !url) continue;
// Author
// Look for "View Profile of ..." or the user link block
let author = "";
const authorMatch =
seg.match(/viewprofilepage\/user-id\/\d+[^>]*>([^<]+)/i) ||
seg.match(/class="[^"]*lia-user-name-link[^"]*"[^>]*>([^<]+)/i);
if (authorMatch) author = stripTags(authorMatch[1]);
// Date/time
const dateMatch = seg.match(/class="local-date"[^>]*>([^<]+)</i);
const timeMatch = seg.match(/class="local-time"[^>]*>([^<]+)</i);
const published = dateMatch
? `${stripTags(dateMatch[1])}${timeMatch ? " " + stripTags(timeMatch[1]) : ""}`
: undefined;
// Likes (Kudos)
const likesMatch = seg.match(/Kudos Count\s+(\d+)/i);
const likes = likesMatch ? Number(likesMatch[1]) : undefined;
// Snippet
const snippetMatch = seg.match(
/<div[^>]*class="[^"]*lia-truncated-body-container[^"]*"[^>]*>([\s\S]*?)<\/div>/i
);
const snippet = snippetMatch ? stripTags(snippetMatch[1]).slice(0, CONFIG.EXCERPT_LENGTH_COMMUNITY) : undefined;
// Tags
const tagSectionMatch = seg.match(
/<div[^>]*class="[^"]*TagList[^"]*"[^>]*>[\s\S]*?<\/div>/i
);
const tags: string[] = [];
if (tagSectionMatch) {
const tagLinks = tagSectionMatch[0].matchAll(
/<a[^>]*class="[^"]*lia-tag[^"]*"[^>]*>([\s\S]*?)<\/a>/gi
);
for (const m of tagLinks) {
const t = stripTags(m[1]);
if (t) tags.push(t);
}
}
results.push({ title, url, author, published, likes, snippet, tags, postId });
}
return results;
}
export async function searchCommunityBestMatch(
query: string,
opts: Options = {}
): Promise<BestMatchHit[]> {
const { includeBlogs = true, limit = 20, userAgent } = opts;
const url = buildSearchUrl(query, includeBlogs);
const html = await fetchText(url, userAgent);
return parseHitsFromHtml(html, limit);
}
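// Usage sketch (query and limit are illustrative):
//   const hits = await searchCommunityBestMatch("rap managed scenario", { limit: 5 });
//   for (const h of hits) console.log(`${h.title} -> ${h.url}`);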
// Convenience function: Search and get full content of top N posts in one call
export async function searchAndGetTopPosts(
query: string,
topN: number = 3,
opts: Options = {}
): Promise<{ search: BestMatchHit[], posts: { [id: string]: string } }> {
// First, search for posts
const searchResults = await searchCommunityBestMatch(query, { ...opts, limit: Math.max(topN, opts.limit || 20) });
// Extract post IDs from top N results
const topResults = searchResults.slice(0, topN);
const postIds = topResults
.map(result => result.postId)
.filter((id): id is string => id !== undefined);
// Batch retrieve full content
const posts = await getCommunityPostsByIds(postIds, opts.userAgent);
return {
search: topResults,
posts
};
}
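// Usage sketch (query is illustrative):
//   const { search, posts } = await searchAndGetTopPosts("btp destination service", 3);
//   // posts is keyed by postId, e.g. posts["13961398"] holds the rendered post content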
// Batch retrieve multiple posts using the LiQL API
export async function getCommunityPostsByIds(postIds: string[], userAgent?: string): Promise<{ [id: string]: string }> {
const results: { [id: string]: string } = {};
if (postIds.length === 0) {
return results;
}
try {
// Build LiQL query for batch retrieval
const idList = postIds.map(id => `'${id}'`).join(', ');
const liqlQuery = `
select body, id, subject, search_snippet, post_time, view_href
from messages
where id in (${idList})
`.replace(/\s+/g, ' ').trim();
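    // ids are expected to be numeric (extractPostId only matches \d+), so simple quoting keeps the LiQL valid.
    // Resolved query for hypothetical ids ["13961398", "13961399"]:
    //   select body, id, subject, search_snippet, post_time, view_href from messages where id in ('13961398', '13961399')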
const url = `https://community.sap.com/api/2.0/search?q=${encodeURIComponent(liqlQuery)}`;
const response = await fetch(url, {
headers: {
'Accept': 'application/json',
'User-Agent': userAgent || 'sap-docs-mcp/1.0 (BatchRetrieval)'
}
});
if (!response.ok) {
console.warn(`SAP Community API returned ${response.status}: ${response.statusText}`);
return results;
}
const data = await response.json() as any;
if (data.status !== 'success' || !data.data?.items) {
return results;
}
// Process each post
for (const post of data.data.items) {
const postDate = post.post_time ? new Date(post.post_time).toLocaleDateString() : 'Unknown';
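      // Best-effort fallback when the API omits view_href; the constructed path may not resolve for every board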
const postUrl = post.view_href || `https://community.sap.com/t5/technology-blogs-by-sap/bg-p/t/${post.id}`;
const content = `# ${post.subject}
**Source**: SAP Community Blog Post
**Published**: ${postDate}
**URL**: ${postUrl}
---
${post.body || post.search_snippet}
---
*This content is from the SAP Community and represents community knowledge and experiences.*`;
results[post.id] = content;
}
return results;
} catch (error) {
console.warn('Failed to batch retrieve community posts:', error);
return results;
}
}
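// Usage sketch (ids are illustrative):
//   const bodies = await getCommunityPostsByIds(["13961398", "13961399"]);
//   console.log(Object.keys(bodies)); // ids the API actually returned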
// Single post retrieval using LiQL API
export async function getCommunityPostById(postId: string, userAgent?: string): Promise<string | null> {
const results = await getCommunityPostsByIds([postId], userAgent);
return results[postId] || null;
}
// Fallback: get full post content by scraping the post page directly
export async function getCommunityPostByUrl(postUrl: string, userAgent?: string): Promise<string | null> {
try {
const html = await fetchText(postUrl, userAgent);
// Extract title - try multiple selectors
let title = "Untitled";
const titleSelectors = [
/<h1[^>]*class="[^"]*lia-message-subject[^"]*"[^>]*>([\s\S]*?)<\/h1>/i,
/<h2[^>]*class="[^"]*message-subject[^"]*"[^>]*>([\s\S]*?)<\/h2>/i,
/<title>([\s\S]*?)<\/title>/i
];
for (const selector of titleSelectors) {
const titleMatch = html.match(selector);
if (titleMatch) {
title = stripTags(titleMatch[1]).replace(/\s*-\s*SAP Community.*$/, '').trim();
break;
}
}
// Extract author and date - multiple patterns
let author = "Unknown";
const authorSelectors = [
/class="[^"]*lia-user-name-link[^"]*"[^>]*>([^<]+)/i,
/viewprofilepage\/user-id\/\d+[^>]*>([^<]+)/i,
/"author"[^>]*>[\s\S]*?<[^>]*>([^<]+)/i
];
for (const selector of authorSelectors) {
const authorMatch = html.match(selector);
if (authorMatch) {
author = stripTags(authorMatch[1]);
break;
}
}
// Extract date and time
const dateMatch = html.match(/class="local-date"[^>]*>([^<]+)</i);
const timeMatch = html.match(/class="local-time"[^>]*>([^<]+)</i);
const published = dateMatch
? `${stripTags(dateMatch[1])}${timeMatch ? " " + stripTags(timeMatch[1]) : ""}`
: "Unknown";
// Extract main content - try multiple content selectors
let content = "Content not available";
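    // Note: the non-greedy match below stops at the first </div>, so deeply nested bodies may be truncated (best-effort)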
const contentSelectors = [
/<div[^>]*class="[^"]*lia-message-body[^"]*"[^>]*>([\s\S]*?)<\/div>/i,
/<div[^>]*class="[^"]*lia-message-body-content[^"]*"[^>]*>([\s\S]*?)<\/div>/i,
/<div[^>]*class="[^"]*messageBody[^"]*"[^>]*>([\s\S]*?)<\/div>/i
];
for (const selector of contentSelectors) {
const contentMatch = html.match(selector);
if (contentMatch) {
// Clean up the content - remove script tags, preserve some formatting
let rawContent = contentMatch[1]
.replace(/<script[\s\S]*?<\/script>/gi, '')
.replace(/<style[\s\S]*?<\/style>/gi, '')
.replace(/<iframe[\s\S]*?<\/iframe>/gi, '[Embedded Content]');
// Convert some HTML elements to markdown-like format
rawContent = rawContent
.replace(/<h([1-6])[^>]*>([\s\S]*?)<\/h[1-6]>/gi, (_, level, text) => {
const hashes = '#'.repeat(parseInt(level, 10) + 1);
return `\n${hashes} ${stripTags(text)}\n`;
})
.replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, '\n$1\n')
.replace(/<br\s*\/?>/gi, '\n')
.replace(/<strong[^>]*>([\s\S]*?)<\/strong>/gi, '**$1**')
.replace(/<em[^>]*>([\s\S]*?)<\/em>/gi, '*$1*')
.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, '`$1`')
.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, '\n```\n$1\n```\n')
.replace(/<ul[^>]*>([\s\S]*?)<\/ul>/gi, '$1')
.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n');
// stripTags would collapse the newlines added above, so strip remaining tags inline and keep line breaks
content = decodeEntities(rawContent.replace(/<[^>]*>/g, " "))
  .replace(/[ \t]+/g, " ")
  .replace(/\n\s*\n\s*\n/g, "\n\n")
  .trim();
break;
}
}
// Extract tags
const tagSectionMatch = html.match(
/<div[^>]*class="[^"]*TagList[^"]*"[^>]*>[\s\S]*?<\/div>/i
);
const tags: string[] = [];
if (tagSectionMatch) {
const tagLinks = tagSectionMatch[0].matchAll(
/<a[^>]*class="[^"]*lia-tag[^"]*"[^>]*>([\s\S]*?)<\/a>/gi
);
for (const m of tagLinks) {
const t = stripTags(m[1]);
if (t) tags.push(t);
}
}
// Extract kudos count
let kudos = 0;
const kudosMatch = html.match(/(\d+)\s+Kudos?/i);
if (kudosMatch) {
kudos = parseInt(kudosMatch[1], 10);
}
const tagsText = tags.length > 0 ? `\n**Tags:** ${tags.join(", ")}` : "";
const kudosText = kudos > 0 ? `\n**Kudos:** ${kudos}` : "";
return `# ${title}
**Source**: SAP Community Blog Post
**Author**: ${author}
**Published**: ${published}${kudosText}${tagsText}
**URL**: ${postUrl}
---
${content}
---
*This content is from the SAP Community and represents community knowledge and experiences.*`;
} catch (error) {
console.warn('Failed to get community post:', error);
return null;
}
}
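// Usage sketch (URL is illustrative):
//   const md = await getCommunityPostByUrl("https://community.sap.com/t5/technology-blogs-by-sap/some-post/ba-p/13961398");
//   if (md) console.log(md.slice(0, 200));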