Karakeep MCP server

Overview Schema Related Servers Score Discussions

karakeep
apps
workers
metascraper-plugins

metascraper-reddit.ts•11.4 KiB

import type { CheerioAPI } from "cheerio";
import type { Rules, RulesOptions } from "metascraper";
import { decode as decodeHtmlEntities } from "html-entities";
import { fetchWithProxy } from "network";
import { z } from "zod";

import logger from "@karakeep/shared/logger";

/**
 * This is a metascraper plugin to select a better
 * 'image' attribute for Reddit links, specifically
 * those sharing images. It will also extract the
 * Post Title for a Reddit post instead of using the
 * default.
 *
 * As of writing this, Reddit posts do not define
 * an open-graph image (og:image) attribute, so
 * metascraper resorts to looking for images in
 * the HTML DOM, and selects the first one.
 *
 * In Reddit posts, the first image is typically
 * the profile picture of the OP, which Karakeep
 * is using for the thumbnail.
 *
 * This metascraper plugin instead looks for images
 * with the domain i.redd.it, on which Reddit hosts
 * their preview images for posts. If this plugin
 * finds an i.redd.it image, it provides that for
 * the image metadata.
 *
 * If there is not a matching image, this plugin
 * will return 'undefined' and the next plugin
 * should continue to attempt to extract images.
 *
 * We also attempt to fetch the Reddit JSON response
 * (by appending '.json' to the URL) to grab the
 * title and preview images directly from the API.
 **/

/** One entry of `preview.images` in the Reddit JSON payload. */
const redditPreviewImageSchema = z.object({
  source: z.object({ url: z.string().optional() }).optional(),
  resolutions: z.array(z.object({ url: z.string().optional() })).optional(),
});

/** One value of the `media_metadata` record in the Reddit JSON payload. */
const redditMediaMetadataItemSchema = z.object({
  s: z.object({ u: z.string().optional() }).optional(),
  p: z.array(z.object({ u: z.string().optional() })).optional(),
});

/** The subset of a Reddit post's fields that this plugin reads. */
const redditPostSchema = z.object({
  title: z.string().optional(),
  preview: z
    .object({ images: z.array(redditPreviewImageSchema).optional() })
    .optional(),
  url_overridden_by_dest: z.string().optional(),
  url: z.string().optional(),
  thumbnail: z.string().optional(),
  media_metadata: z.record(redditMediaMetadataItemSchema).optional(),
  author: z.string().optional(),
  created_utc: z.number().optional(),
  selftext: z.string().nullish(),
  selftext_html: z.string().nullish(),
  subreddit_name_prefixed: z.string().optional(),
});

type RedditPostData = z.infer<typeof redditPostSchema>;

/** Top-level shape of the `.json` response: an array of listings. */
const redditResponseSchema = z.array(
  z.object({
    data: z.object({
      children: z.array(z.object({ data: redditPostSchema })).optional(),
    }),
  }),
);

/**
 * Outcome of a JSON lookup.
 *
 * `fetched` is true only when the JSON was retrieved and passed schema
 * validation; `post` may still be undefined in that case if no listing
 * contained any children.
 */
interface RedditFetchResult {
  fetched: boolean;
  post?: RedditPostData;
}

const REDDIT_CACHE_TTL_MS = 60 * 1000; // 1 minute TTL to avoid stale data

interface RedditCacheEntry {
  expiresAt: number;
  promise: Promise<RedditFetchResult>;
}

/** Cache of lookups keyed by the original (non-.json) post URL. */
const redditJsonCache = new Map<string, RedditCacheEntry>();

/** Drops every cache entry whose expiry is at or before `now` (epoch ms). */
const purgeExpiredCacheEntries = (now: number) => {
  for (const [key, entry] of redditJsonCache.entries()) {
    if (entry.expiresAt <= now) {
      redditJsonCache.delete(key);
    }
  }
};

/**
 * Decodes HTML entities in a URL taken from the JSON payload.
 *
 * @returns the decoded URL, or undefined when the input is missing or
 *   decodes to an empty string.
 */
const decodeRedditUrl = (url?: string): string | undefined => {
  if (!url) {
    return undefined;
  }
  const decoded = decodeHtmlEntities(url);
  return decoded || undefined;
};

/**
 * Builds the `.json` variant of a post URL by replacing an optional
 * trailing slash of the pathname with `.json`. A pathname that already
 * ends in `.json` is left untouched.
 *
 * @throws TypeError if `url` cannot be parsed by the URL constructor.
 */
const buildJsonUrl = (url: string): string => {
  const urlObj = new URL(url);
  if (!urlObj.pathname.endsWith(".json")) {
    urlObj.pathname = urlObj.pathname.replace(/\/?$/, ".json");
  }
  return urlObj.toString();
};

/**
 * Picks an image URL out of `media_metadata`.
 *
 * Entries are scanned in `Object.values` order and the first one that
 * yields a URL wins, preferring the `s.u` field over the first `p[].u`
 * field. Scanning all entries (rather than only the first) means one
 * entry without URLs does not hide usable images in later entries.
 */
const extractImageFromMediaMetadata = (
  media_metadata?: RedditPostData["media_metadata"],
): string | undefined => {
  if (!media_metadata) {
    return undefined;
  }

  for (const item of Object.values(media_metadata)) {
    const candidate =
      decodeRedditUrl(item.s?.u) ?? decodeRedditUrl(item.p?.[0]?.u);
    if (candidate) {
      return candidate;
    }
  }

  return undefined;
};

/**
 * Returns true when `urlCandidate` parses as a URL whose host is
 * `redd.it` or one of its subdomains (e.g. `i.redd.it`).
 *
 * The comparison is an exact/suffix match on the hostname: a plain
 * substring test would also accept unrelated hosts such as `notredd.it`
 * or `i.redd.it.example.com`.
 */
const isRedditImageHost = (urlCandidate: string): boolean => {
  try {
    const hostname = new URL(urlCandidate).hostname.toLowerCase();
    return hostname === "redd.it" || hostname.endsWith(".redd.it");
  } catch {
    return false;
  }
};

/**
 * Chooses the best image for a post, in priority order:
 *   1. the first preview image (source, then its first resolution),
 *   2. an image from `media_metadata`,
 *   3. the post's direct URL / thumbnail, but only if hosted on redd.it.
 */
const extractImageFromPost = (post: RedditPostData): string | undefined => {
  const previewImage = post.preview?.images?.[0];
  const previewUrl =
    decodeRedditUrl(previewImage?.source?.url) ??
    decodeRedditUrl(previewImage?.resolutions?.[0]?.url);
  if (previewUrl) {
    return previewUrl;
  }

  const mediaUrl = extractImageFromMediaMetadata(post.media_metadata);
  if (mediaUrl) {
    return mediaUrl;
  }

  const directUrl =
    decodeRedditUrl(post.url_overridden_by_dest) ??
    decodeRedditUrl(post.url) ??
    decodeRedditUrl(post.thumbnail);

  // Link posts may point anywhere; only trust Reddit-hosted URLs as images.
  if (directUrl && isRedditImageHost(directUrl)) {
    return directUrl;
  }

  return undefined;
};

/** Trimmed post title, or undefined when missing/blank. */
const extractTitleFromPost = (post: RedditPostData): string | undefined =>
  post.title?.trim() || undefined;

/** Trimmed author name, or undefined when missing/blank. */
const extractAuthorFromPost = (post: RedditPostData): string | undefined =>
  post.author?.trim() || undefined;

/**
 * Converts `created_utc` (multiplied by 1000 to get epoch milliseconds)
 * into an ISO-8601 string. Returns undefined when the field is missing,
 * zero, or does not produce a valid Date.
 */
const extractDateFromPost = (post: RedditPostData): string | undefined => {
  if (!post.created_utc) {
    return undefined;
  }
  const date = new Date(post.created_utc * 1000);
  return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
};

/** Trimmed `subreddit_name_prefixed`, falling back to "Reddit". */
const extractPublisherFromPost = (post: RedditPostData): string | undefined =>
  post.subreddit_name_prefixed?.trim() || "Reddit";

const REDDIT_LOGO_URL =
  "https://www.redditstatic.com/desktop2x/img/favicon/android-icon-192x192.png";

/**
 * DOM fallback for the image, used when the JSON lookup did not succeed.
 */
const fallbackDomImage = ({ htmlDom }: { htmlDom: CheerioAPI }) => {
  // 'preview' subdomain images are more likely to be what we're after
  // but it could be in the 'i' subdomain.
  // returns undefined if neither exists
  const previewImages = htmlDom('img[src*="preview.redd.it"]')
    .map((_, el) => htmlDom(el).attr("src"))
    .get();
  const iImages = htmlDom('img[src*="i.redd.it"]')
    .map((_, el) => htmlDom(el).attr("src"))
    .get();
  return previewImages[0] || iImages[0];
};

/**
 * DOM fallback for the title: the `title` attribute of the first
 * `shreddit-title` element, else the `post-title` attribute of the first
 * `shreddit-post` element. The result is trimmed; undefined if neither
 * yields a non-empty value.
 */
const fallbackDomTitle = ({ htmlDom }: { htmlDom: CheerioAPI }) => {
  const title: string | undefined = htmlDom("shreddit-title[title]")
    .first()
    .attr("title");
  const postTitle: string | undefined =
    title ?? htmlDom("shreddit-post[post-title]").first().attr("post-title");
  return postTitle ? postTitle.trim() : undefined;
};

/**
 * Fetches and validates the `.json` representation of a Reddit post.
 *
 * The returned promise never rejects: every failure path (bad URL,
 * network error, non-OK status, invalid JSON, schema mismatch) is logged
 * (except HTTP 403, which is expected and silent) and resolves to
 * `{ fetched: false }`.
 *
 * The promise itself is cached per URL for REDDIT_CACHE_TTL_MS, so the
 * several rules that call this for the same URL share one request, and
 * failed lookups are not retried until the entry expires.
 */
const fetchRedditPostData = async (url: string): Promise<RedditFetchResult> => {
  const cached = redditJsonCache.get(url);
  const now = Date.now();

  purgeExpiredCacheEntries(now);

  if (cached && cached.expiresAt > now) {
    return cached.promise;
  }

  const promise = (async () => {
    let jsonUrl: string;
    try {
      jsonUrl = buildJsonUrl(url);
    } catch (error) {
      logger.warn(
        "[MetascraperReddit] Failed to construct Reddit JSON URL",
        error,
      );
      return { fetched: false };
    }

    let response;
    try {
      response = await fetchWithProxy(jsonUrl, {
        headers: { accept: "application/json" },
      });
    } catch (error) {
      logger.warn(
        `[MetascraperReddit] Failed to fetch Reddit JSON for ${jsonUrl}`,
        error,
      );
      return { fetched: false };
    }

    if (response.status === 403) {
      // API forbidden; fall back to DOM scraping.
      return { fetched: false };
    }

    if (!response.ok) {
      logger.warn(
        `[MetascraperReddit] Reddit JSON request failed for ${jsonUrl} with status ${response.status}`,
      );
      return { fetched: false };
    }

    let payload: unknown;
    try {
      payload = await response.json();
    } catch (error) {
      logger.warn(
        `[MetascraperReddit] Failed to parse Reddit JSON for ${jsonUrl}`,
        error,
      );
      return { fetched: false };
    }

    const parsed = redditResponseSchema.safeParse(payload);
    if (!parsed.success) {
      logger.warn(
        "[MetascraperReddit] Reddit JSON schema validation failed",
        parsed.error,
      );
      return { fetched: false };
    }

    // The post is the first child of the first listing that has children.
    const firstListingWithChildren = parsed.data.find(
      (listing) => (listing.data.children?.length ?? 0) > 0,
    );

    return {
      fetched: true,
      post: firstListingWithChildren?.data.children?.[0]?.data,
    };
  })();

  // Stored before the request settles so concurrent callers reuse it.
  redditJsonCache.set(url, {
    promise,
    expiresAt: now + REDDIT_CACHE_TTL_MS,
  });

  return promise;
};

const domainFromUrl = (url: string): string => {
  /**
   * First-party metascraper plugins import metascraper-helpers,
   * which exposes a parseUrl function from the tldts package.
   * This function does similar to the 'domainWithoutSuffix'
   * field from the tldts package, without requiring any
   * additional packages.
   *
   * Note: it simply takes the second-to-last hostname label, so
   * multi-label suffixes (e.g. "example.co.uk") yield the suffix
   * label ("co") rather than the registrable name.
   **/
  try {
    // Create a URL instance to parse the hostname
    const hostname = new URL(url).hostname;
    const parts = hostname.split(".");
    // Return the part before the TLD (assuming at least two segments)
    // For example, "www.example.com" -> ["www", "example", "com"]
    if (parts.length >= 2) {
      return parts[parts.length - 2];
    }
    return hostname;
  } catch (error) {
    logger.error(
      "[MetascraperReddit] Test>domainFromUrl received an invalid URL:",
      error,
    );
    return "";
  }
};

/** Rule-set guard: true only for URLs whose domain label is "reddit". */
const test = ({ url }: { url: string }): boolean =>
  domainFromUrl(url).toLowerCase() === "reddit";

/**
 * Builds the metascraper rule set for Reddit links.
 *
 * Every rule goes through fetchRedditPostData, which caches per URL, so
 * the rules share a single JSON request. The `as unknown as RulesOptions`
 * casts are kept from the original implementation.
 * NOTE(review): presumably they exist because metascraper's declared
 * rule type does not accept these async functions directly; confirm
 * before removing.
 */
const metascraperReddit = () => {
  const rules: Rules = {
    pkgName: "metascraper-reddit",
    test,
    image: (async ({ url, htmlDom }: { url: string; htmlDom: CheerioAPI }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        const redditImage = extractImageFromPost(result.post);
        if (redditImage) {
          return redditImage;
        }
      }

      // If we successfully fetched JSON but found no Reddit image,
      // avoid falling back to random DOM images.
      if (result.fetched) {
        return undefined;
      }

      return fallbackDomImage({ htmlDom });
    }) as unknown as RulesOptions,
    title: (async ({ url, htmlDom }: { url: string; htmlDom: CheerioAPI }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        const redditTitle = extractTitleFromPost(result.post);
        if (redditTitle) {
          return redditTitle;
        }
      }

      return fallbackDomTitle({ htmlDom });
    }) as unknown as RulesOptions,
    author: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return extractAuthorFromPost(result.post);
      }
      return undefined;
    }) as unknown as RulesOptions,
    datePublished: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return extractDateFromPost(result.post);
      }
      return undefined;
    }) as unknown as RulesOptions,
    publisher: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return extractPublisherFromPost(result.post);
      }
      return undefined;
    }) as unknown as RulesOptions,
    logo: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        return REDDIT_LOGO_URL;
      }
      return undefined;
    }) as unknown as RulesOptions,
    readableContentHtml: (async ({ url }: { url: string }) => {
      const result = await fetchRedditPostData(url);
      if (result.post) {
        const decoded = decodeHtmlEntities(result.post.selftext_html ?? "");
        // The post has no content, return the title
        return (decoded || result.post.title) ?? null;
      }
      return undefined;
    }) as unknown as RulesOptions,
  };

  return rules;
};

export default metascraperReddit;

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/karakeep-app/karakeep'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

metascraper-reddit.ts•11.4 KiB