MCP Podcast Scraper

index.ts•25.3 kB

#!/usr/bin/env node

import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
  CallToolRequestSchema,
  ListToolsRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import { readFileSync, existsSync } from "fs";
import { join } from "path";
import { config, getProjectRoot } from "./config.js";
import {
  isYouTubeUrl,
  downloadAudio,
  getVideoInfo,
  searchYouTube,
} from "./services/sources/youtube.js";
import { parseFeed, downloadAudioFromUrl } from "./services/sources/rss.js";
import { transcribeAudio } from "./services/transcription.js";
import {
  saveTranscript,
  saveSummary,
  cleanupTempFile,
  cleanupTempDirectory,
  hasTranscript,
  hasSummary,
  readTranscript,
  findIncompleteEpisodes,
  getEpisodeOutputDir,
} from "./services/file-manager.js";
import {
  addTrackedPodcast,
  getTrackedPodcasts,
  removeTrackedPodcast,
  loadTrackingData,
} from "./services/scheduler.js";

// ---------------------------------------------------------------------------
// Shared types and small helpers
// ---------------------------------------------------------------------------

/** Untyped argument bag as it arrives on a tool-call request. */
type ToolArgs = Record<string, unknown>;

/** Values accepted by the `source` argument of `search_podcast`. */
type SearchSource = "youtube" | "rss" | "all";

/** Where the audio for a resolved episode has to be fetched from. */
type AudioSource =
  | { kind: "youtube"; url: string }
  | { kind: "direct"; url: string };

/** Episode metadata plus the audio location, resolved before any download. */
type ResolvedEpisode = {
  podcastName: string;
  episodeTitle: string;
  episodeDate: string;
  audio: AudioSource;
};

/**
 * Build a single-text-item tool result.
 *
 * @param text - The text to return to the client.
 * @param isError - When true, the result carries `isError: true`.
 */
function textResult(text: string, isError = false) {
  const content = [{ type: "text" as const, text }];
  return isError ? { content, isError: true } : { content };
}

/** Turn any thrown value into a printable message. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Read a required string argument.
 *
 * @throws Error when the argument is absent, not a string, or blank.
 */
function requireString(args: ToolArgs, key: string): string {
  const value = args[key];
  if (typeof value !== "string" || value.trim() === "") {
    throw new Error(`Missing or invalid required argument: ${key}`);
  }
  return value;
}

/**
 * Read an optional string argument.
 *
 * @returns The string, or `undefined` when the argument is absent or null.
 * @throws Error when the argument is present but not a string.
 */
function optionalString(args: ToolArgs, key: string): string | undefined {
  const value = args[key];
  if (value === undefined || value === null) {
    return undefined;
  }
  if (typeof value !== "string") {
    throw new Error(`Argument "${key}" must be a string`);
  }
  return value;
}

/**
 * Read the optional `source` argument of `search_podcast`.
 *
 * @throws Error when the value is present but not one of the allowed sources.
 */
function optionalSearchSource(args: ToolArgs): SearchSource | undefined {
  const value = args["source"];
  if (value === undefined || value === null) {
    return undefined;
  }
  if (value === "youtube" || value === "rss" || value === "all") {
    return value;
  }
  throw new Error(
    `Invalid source: ${JSON.stringify(value)}. Expected "youtube", "rss", or "all".`
  );
}

/** Format a date as the part of its ISO-8601 string before the `T`. */
function formatIsoDate(date: Date): string {
  const iso = date.toISOString();
  return iso.slice(0, iso.indexOf("T"));
}

/** Today's date, formatted like every other episode date in this file. */
function todayIsoDate(): string {
  return formatIsoDate(new Date());
}

/**
 * Convert a feed-supplied publication date to a date string.
 *
 * Falls back to today's date when the value is missing or cannot be parsed;
 * `Date.prototype.toISOString` throws on an invalid date, so that case must be
 * checked before formatting.
 */
function toIsoDate(value: string | number | Date | null | undefined): string {
  if (value) {
    const parsed = new Date(value);
    if (!Number.isNaN(parsed.getTime())) {
      return formatIsoDate(parsed);
    }
  }
  return todayIsoDate();
}

/** Count whitespace-separated words; blank text counts as zero. */
function countWords(text: string): number {
  const trimmed = text.trim();
  return trimmed === "" ? 0 : trimmed.split(/\s+/).length;
}

/** Render a duration in seconds as `Xm Ys`, or `Unknown` when it is not a finite number. */
function formatDuration(seconds: number | null | undefined): string {
  if (typeof seconds !== "number" || !Number.isFinite(seconds)) {
    return "Unknown";
  }
  return `${Math.floor(seconds / 60)}m ${Math.floor(seconds % 60)}s`;
}

/** Heuristic: does this URL look like an RSS feed? (Substring match on known markers.) */
function looksLikeFeedUrl(query: string): boolean {
  if (!query.startsWith("http")) {
    return false;
  }
  const markers = ["rss", "feed", ".xml", "megaphone", "libsyn", "anchor"];
  return markers.some((marker) => query.includes(marker));
}

/** Heuristic: does this URL look like a direct audio file? (Substring match on known markers.) */
function looksLikeAudioUrl(query: string): boolean {
  if (!query.startsWith("http")) {
    return false;
  }
  const markers = [".mp3", "traffic.", "audio"];
  return markers.some((marker) => query.includes(marker));
}

/**
 * Read the summary prompt from prompts/summary-prompt.md.
 *
 * @returns The file's contents, or a built-in fallback message when the file
 *   does not exist.
 */
function getSummaryPrompt(): string {
  const promptPath = join(getProjectRoot(), "prompts", "summary-prompt.md");

  if (!existsSync(promptPath)) {
    return [
      "No custom prompt found. Create prompts/summary-prompt.md to customize how summaries are generated.",
      "",
      "Default instructions:",
      "Summarize the podcast transcript with key insights, actionable takeaways, and notable quotes.",
    ].join("\n");
  }

  return readFileSync(promptPath, "utf-8");
}

// ---------------------------------------------------------------------------
// Server and tool catalogue
// ---------------------------------------------------------------------------

const server = new Server(
  {
    name: "mcp-podcast-scraper",
    version: "1.0.0",
  },
  {
    capabilities: {
      tools: {},
    },
  }
);

/** Static catalogue returned by the list-tools handler. */
const TOOL_DEFINITIONS = [
  {
    name: "scrape_podcast",
    description:
      "Scrape a podcast episode and transcribe it. Returns transcript file path. Use get_transcript to read it, then save_summary after summarizing.",
    inputSchema: {
      type: "object",
      properties: {
        query: {
          type: "string",
          description: "YouTube URL, RSS feed URL, or search query for the podcast episode",
        },
        podcastName: {
          type: "string",
          description: "Name of the podcast (for organization)",
        },
        episodeTitle: {
          type: "string",
          description: "Title of the episode (optional, will be auto-detected)",
        },
        force: {
          type: "boolean",
          description: "Force re-scraping even if episode was already scraped (default: false)",
        },
      },
      required: ["query"],
    },
  },
  {
    name: "get_transcript",
    description:
      "Read the transcript of a previously scraped episode. After reading, use get_summary_prompt for summarization instructions.",
    inputSchema: {
      type: "object",
      properties: {
        podcastName: {
          type: "string",
          description: "Name of the podcast",
        },
        episodeTitle: {
          type: "string",
          description: "Title of the episode",
        },
        episodeDate: {
          type: "string",
          description: "Date of the episode (YYYY-MM-DD format)",
        },
      },
      required: ["podcastName", "episodeTitle", "episodeDate"],
    },
  },
  {
    name: "save_summary",
    description: "Save your generated summary to a markdown file.",
    inputSchema: {
      type: "object",
      properties: {
        podcastName: {
          type: "string",
          description: "Name of the podcast",
        },
        episodeTitle: {
          type: "string",
          description: "Title of the episode",
        },
        episodeDate: {
          type: "string",
          description: "Date of the episode (YYYY-MM-DD format)",
        },
        summaryText: {
          type: "string",
          description: "The summary content in markdown format",
        },
      },
      required: ["podcastName", "episodeTitle", "episodeDate", "summaryText"],
    },
  },
  {
    name: "get_summary_prompt",
    description:
      "Get the custom prompt/instructions for how to summarize podcasts. Read this before summarizing to follow the user's preferences.",
    inputSchema: {
      type: "object",
      properties: {},
    },
  },
  {
    name: "check_new_episodes",
    description: "Check all tracked podcasts for new episodes that haven't been scraped yet",
    inputSchema: {
      type: "object",
      properties: {},
    },
  },
  {
    name: "list_incomplete",
    description:
      "List all episodes that have transcripts but are missing summaries. Use this to find episodes that need summarization.",
    inputSchema: {
      type: "object",
      properties: {},
    },
  },
  {
    name: "search_podcast",
    description:
      "Search for podcasts or episodes on YouTube, or parse an RSS feed URL to see available episodes",
    inputSchema: {
      type: "object",
      properties: {
        query: {
          type: "string",
          description: "Search query for YouTube, or RSS feed URL to parse",
        },
        source: {
          type: "string",
          enum: ["youtube", "rss", "all"],
          description: "Source to search (default: all)",
        },
      },
      required: ["query"],
    },
  },
  {
    name: "add_tracking",
    description:
      "Add a podcast RSS feed to the tracking list. Use check_new_episodes to find new episodes.",
    inputSchema: {
      type: "object",
      properties: {
        podcastName: {
          type: "string",
          description: "Name of the podcast",
        },
        feedUrl: {
          type: "string",
          description: "RSS feed URL of the podcast",
        },
      },
      required: ["podcastName", "feedUrl"],
    },
  },
  {
    name: "list_tracking",
    description: "List all podcasts currently being tracked",
    inputSchema: {
      type: "object",
      properties: {},
    },
  },
  {
    name: "remove_tracking",
    description: "Remove a podcast from the tracking list",
    inputSchema: {
      type: "object",
      properties: {
        podcastName: {
          type: "string",
          description: "Name of the podcast to remove",
        },
      },
      required: ["podcastName"],
    },
  },
];

// List available tools
server.setRequestHandler(ListToolsRequestSchema, async () => {
  return { tools: TOOL_DEFINITIONS };
});

// Handle tool calls
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: rawArgs } = request.params;
  const args: ToolArgs = rawArgs ?? {};

  try {
    // Arguments are validated here so a malformed call produces a readable
    // message instead of a TypeError from deep inside a handler.
    switch (name) {
      case "scrape_podcast":
        return await handleScrapePodcast({
          query: requireString(args, "query"),
          podcastName: optionalString(args, "podcastName"),
          episodeTitle: optionalString(args, "episodeTitle"),
          force: args["force"] === true,
        });

      case "get_transcript":
        return await handleGetTranscript({
          podcastName: requireString(args, "podcastName"),
          episodeTitle: requireString(args, "episodeTitle"),
          episodeDate: requireString(args, "episodeDate"),
        });

      case "save_summary":
        return await handleSaveSummary({
          podcastName: requireString(args, "podcastName"),
          episodeTitle: requireString(args, "episodeTitle"),
          episodeDate: requireString(args, "episodeDate"),
          summaryText: requireString(args, "summaryText"),
        });

      case "get_summary_prompt":
        return handleGetSummaryPrompt();

      case "check_new_episodes":
        return await handleCheckNewEpisodes();

      case "list_incomplete":
        return await handleListIncomplete();

      case "search_podcast":
        return await handleSearchPodcast({
          query: requireString(args, "query"),
          source: optionalSearchSource(args),
        });

      case "add_tracking":
        return await handleAddTracking({
          podcastName: requireString(args, "podcastName"),
          feedUrl: requireString(args, "feedUrl"),
        });

      case "list_tracking":
        return await handleListTracking();

      case "remove_tracking":
        return await handleRemoveTracking({
          podcastName: requireString(args, "podcastName"),
        });

      default:
        throw new Error(`Unknown tool: ${name}`);
    }
  } catch (error) {
    return textResult(`Error: ${describeError(error)}`, true);
  }
});

// ---------------------------------------------------------------------------
// scrape_podcast
// ---------------------------------------------------------------------------

/** Look up a YouTube video's metadata and describe it as a resolved episode. */
async function describeYouTubeVideo(
  url: string,
  podcastName?: string,
  episodeTitle?: string
): Promise<ResolvedEpisode> {
  const info = await getVideoInfo(url);
  return {
    podcastName: podcastName || info.uploader,
    episodeTitle: episodeTitle || info.title,
    episodeDate: info.uploadDate || todayIsoDate(),
    audio: { kind: "youtube", url },
  };
}

/**
 * Work out which episode a query refers to, without downloading any audio.
 *
 * The query is tried, in order, as a YouTube URL, an RSS feed URL (first
 * episode in the feed), a direct audio URL, and finally a YouTube search
 * (first hit). Caller-supplied names win over detected ones.
 *
 * @throws Error when a feed has no first episode with audio, when a direct
 *   audio URL is given without both names, or when a search finds nothing.
 */
async function resolveEpisode(
  query: string,
  podcastName?: string,
  episodeTitle?: string
): Promise<ResolvedEpisode> {
  if (isYouTubeUrl(query)) {
    return describeYouTubeVideo(query, podcastName, episodeTitle);
  }

  if (looksLikeFeedUrl(query)) {
    const feed = await parseFeed(query);
    const latestEpisode = feed.episodes[0];
    if (!latestEpisode || !latestEpisode.audioUrl) {
      throw new Error("No episodes with audio found in the RSS feed");
    }
    return {
      podcastName: podcastName || feed.title,
      episodeTitle: episodeTitle || latestEpisode.title,
      episodeDate: toIsoDate(latestEpisode.pubDate),
      audio: { kind: "direct", url: latestEpisode.audioUrl },
    };
  }

  if (looksLikeAudioUrl(query)) {
    // A bare audio URL carries no metadata, so the caller has to name it.
    if (!podcastName || !episodeTitle) {
      throw new Error(
        "When using a direct audio URL, you must provide podcastName and episodeTitle"
      );
    }
    return {
      podcastName,
      episodeTitle,
      episodeDate: todayIsoDate(),
      audio: { kind: "direct", url: query },
    };
  }

  // Anything else is treated as a YouTube search query.
  const results = await searchYouTube(query, 1);
  const topResult = results[0];
  if (!topResult) {
    throw new Error(`No results found for: ${query}`);
  }
  // The URL found here is kept for the download step so the search is not
  // repeated (a second search could return a different or empty result).
  return describeYouTubeVideo(topResult.url, podcastName, episodeTitle);
}

/** Download an episode's audio to a temp file and return that file's path. */
async function downloadEpisodeAudio(audio: AudioSource) {
  if (audio.kind === "youtube") {
    const result = await downloadAudio(audio.url);
    return result.audioPath;
  }
  const filename = `${Date.now()}-podcast.mp3`;
  return downloadAudioFromUrl(audio.url, filename);
}

/**
 * Handle scrape_podcast tool.
 *
 * Resolves the episode, skips work that already exists on disk unless `force`
 * is set, then downloads, transcribes and saves the transcript. Returns the
 * transcript file path and a short preview (not the full content).
 *
 * @throws Error when the episode cannot be resolved or when download,
 *   transcription or saving fails.
 */
async function handleScrapePodcast(args: {
  query: string;
  podcastName?: string;
  episodeTitle?: string;
  force?: boolean;
}) {
  const { query, force = false } = args;

  // Resolve metadata first so duplicates are detected before downloading.
  const episode = await resolveEpisode(query, args.podcastName, args.episodeTitle);
  const { podcastName, episodeTitle, episodeDate } = episode;

  if (!force && hasTranscript(podcastName, episodeTitle, episodeDate)) {
    const outputDir = getEpisodeOutputDir(podcastName, episodeTitle, episodeDate);
    const details = [
      `**Podcast:** ${podcastName}`,
      `**Episode:** ${episodeTitle}`,
      `**Date:** ${episodeDate}`,
      `**Location:** ${outputDir}`,
    ];

    if (hasSummary(podcastName, episodeTitle, episodeDate)) {
      return textResult(
        [
          "⏭️ Episode already fully processed! Skipping...",
          "",
          ...details,
          "",
          "Both transcript and summary exist. To re-scrape, set `force: true`",
        ].join("\n")
      );
    }

    return textResult(
      [
        "📝 Transcript already exists (no summary yet)",
        "",
        ...details,
        "",
        "Use `get_transcript` to read it, then `save_summary` after summarizing.",
        "To re-scrape, set `force: true`",
      ].join("\n")
    );
  }

  const audioPath = await downloadEpisodeAudio(episode.audio);

  let transcriptText: string;
  let transcriptPath;
  try {
    const transcription = await transcribeAudio(audioPath);
    transcriptText = transcription.text;
    transcriptPath = saveTranscript(podcastName, episodeTitle, episodeDate, transcriptText);
  } finally {
    // Remove the temp audio even when transcription or saving failed.
    // Cleanup is best-effort: a failure here must not mask the real error
    // or discard a transcript that was already saved.
    try {
      cleanupTempFile(audioPath);
    } catch (cleanupError) {
      console.error("Failed to clean up temp audio file:", describeError(cleanupError));
    }
  }

  const wordCount = countWords(transcriptText);
  const preview =
    transcriptText.substring(0, 500) + (transcriptText.length > 500 ? "..." : "");

  // JSON.stringify quotes and escapes the names so the example call stays
  // well-formed when a title itself contains quotes.
  return textResult(
    [
      "✅ Successfully transcribed!",
      "",
      `**Podcast:** ${podcastName}`,
      `**Episode:** ${episodeTitle}`,
      `**Date:** ${episodeDate}`,
      `**Words:** ~${wordCount.toLocaleString()}`,
      `**Transcript:** ${transcriptPath}`,
      "",
      "---",
      "",
      "**Preview:**",
      preview,
      "",
      "---",
      "",
      "**Next steps:**",
      "1. Use `get_summary_prompt` to get summarization instructions",
      "2. Use `get_transcript` to read the full transcript",
      "3. Generate a summary following the prompt instructions",
      "4. Use `save_summary` to save your summary",
      "",
      "```",
      "get_transcript({",
      `  podcastName: ${JSON.stringify(podcastName)},`,
      `  episodeTitle: ${JSON.stringify(episodeTitle)},`,
      `  episodeDate: ${JSON.stringify(episodeDate)}`,
      "})",
      "```",
    ].join("\n")
  );
}

// ---------------------------------------------------------------------------
// get_transcript / save_summary / get_summary_prompt
// ---------------------------------------------------------------------------

/**
 * Handle get_transcript tool.
 *
 * Returns the full transcript followed by next-step instructions, or an error
 * result when `readTranscript` yields nothing for the given episode.
 */
async function handleGetTranscript(args: {
  podcastName: string;
  episodeTitle: string;
  episodeDate: string;
}) {
  const { podcastName, episodeTitle, episodeDate } = args;

  const transcript = readTranscript(podcastName, episodeTitle, episodeDate);

  if (!transcript) {
    return textResult(
      [
        "❌ Transcript not found for:",
        `- Podcast: ${podcastName}`,
        `- Episode: ${episodeTitle}`,
        `- Date: ${episodeDate}`,
        "",
        "Use `scrape_podcast` to transcribe this episode first.",
      ].join("\n"),
      true
    );
  }

  return textResult(
    [
      `${transcript}`,
      "",
      "---",
      "",
      "**Next steps:**",
      "1. Use `get_summary_prompt` to see summarization instructions (if not already done)",
      "2. Generate a summary following the prompt",
      "3. Save with `save_summary`:",
      "",
      "```",
      "save_summary({",
      `  podcastName: ${JSON.stringify(podcastName)},`,
      `  episodeTitle: ${JSON.stringify(episodeTitle)},`,
      `  episodeDate: ${JSON.stringify(episodeDate)},`,
      '  summaryText: "YOUR_SUMMARY_HERE"',
      "})",
      "```",
    ].join("\n")
  );
}

/**
 * Handle save_summary tool.
 *
 * Passes the summary to `saveSummary` and reports the path it returns.
 */
async function handleSaveSummary(args: {
  podcastName: string;
  episodeTitle: string;
  episodeDate: string;
  summaryText: string;
}) {
  const { podcastName, episodeTitle, episodeDate, summaryText } = args;

  const summaryPath = saveSummary(podcastName, episodeTitle, episodeDate, summaryText);

  return textResult(
    [
      "✅ Summary saved!",
      "",
      `**Podcast:** ${podcastName}`,
      `**Episode:** ${episodeTitle}`,
      `**Date:** ${episodeDate}`,
      `**File:** ${summaryPath}`,
    ].join("\n")
  );
}

/**
 * Handle get_summary_prompt tool.
 *
 * Wraps the prompt text from `getSummaryPrompt` in a short instruction frame.
 */
function handleGetSummaryPrompt() {
  const prompt = getSummaryPrompt();

  return textResult(
    [
      "## Summary Instructions",
      "",
      prompt,
      "",
      "---",
      "",
      "**After reading the transcript, generate a summary following these instructions, then use `save_summary` to save it.**",
    ].join("\n")
  );
}

// ---------------------------------------------------------------------------
// check_new_episodes / list_incomplete
// ---------------------------------------------------------------------------

/**
 * Handle check_new_episodes tool.
 *
 * For every enabled tracked podcast, parses its feed and reports those of the
 * first five episodes that have audio but no transcript yet. A failure on one
 * feed is collected and reported; it does not stop the remaining feeds.
 */
async function handleCheckNewEpisodes() {
  const trackingData = loadTrackingData();
  const podcasts = trackingData.podcasts;

  if (podcasts.length === 0) {
    return textResult(
      "No podcasts are being tracked. Use `add_tracking` to add podcasts first."
    );
  }

  const newEpisodes: Array<{
    podcastName: string;
    episodeTitle: string;
    episodeDate: string;
    audioUrl: string;
  }> = [];
  const errors: string[] = [];

  for (const podcast of podcasts) {
    if (!podcast.enabled) continue;

    try {
      if (isYouTubeUrl(podcast.feedUrl)) {
        errors.push(
          `${podcast.name}: YouTube channel tracking not supported. Use RSS feeds instead.`
        );
        continue;
      }

      const feed = await parseFeed(podcast.feedUrl);

      // Check recent episodes (first 5 entries of the feed)
      for (const episode of feed.episodes.slice(0, 5)) {
        if (!episode.audioUrl) continue;

        // toIsoDate tolerates unparseable dates, so one bad entry cannot
        // abort the rest of this feed.
        const episodeDate = toIsoDate(episode.pubDate);

        if (!hasTranscript(podcast.name, episode.title, episodeDate)) {
          newEpisodes.push({
            podcastName: podcast.name,
            episodeTitle: episode.title,
            episodeDate,
            audioUrl: episode.audioUrl,
          });
        }
      }
    } catch (error) {
      errors.push(`${podcast.name}: ${describeError(error)}`);
    }
  }

  if (newEpisodes.length === 0 && errors.length === 0) {
    return textResult(
      `✅ All caught up! No new episodes found across ${podcasts.length} tracked podcast(s).`
    );
  }

  const chunks: string[] = [`## New Episodes Found: ${newEpisodes.length}\n\n`];

  if (newEpisodes.length > 0) {
    newEpisodes.forEach((ep, index) => {
      chunks.push(
        `### ${index + 1}. ${ep.podcastName}\n`,
        `- **Episode:** ${ep.episodeTitle}\n`,
        `- **Date:** ${ep.episodeDate}\n`,
        `- **Audio URL:** ${ep.audioUrl}\n\n`
      );
    });
    chunks.push(
      `---\n\n`,
      "To scrape all these episodes, call `scrape_podcast` for each audio URL.\n"
    );
  }

  if (errors.length > 0) {
    chunks.push(`\n## Errors\n`, ...errors.map((err) => `- ${err}\n`));
  }

  return textResult(chunks.join(""));
}

/**
 * Handle list_incomplete tool.
 *
 * Lists the episodes returned by `findIncompleteEpisodes` (transcript present,
 * summary missing) together with instructions for summarizing them.
 */
async function handleListIncomplete() {
  const incomplete = findIncompleteEpisodes();

  if (incomplete.length === 0) {
    return textResult("✅ All episodes are complete! No missing summaries found.");
  }

  const chunks: string[] = [`## Episodes Missing Summaries: ${incomplete.length}\n\n`];

  incomplete.forEach((ep, index) => {
    chunks.push(
      `### ${index + 1}. ${ep.podcastName}\n`,
      `- **Episode:** ${ep.episodeTitle}\n`,
      `- **Date:** ${ep.episodeDate}\n`,
      `- **Transcript:** ${ep.transcriptPath}\n\n`
    );
  });

  chunks.push(
    `---\n\n`,
    `**To summarize:**\n`,
    "1. Use `get_summary_prompt` to see summarization instructions\n",
    "2. Use `get_transcript` to read each transcript\n",
    "3. Generate summary and use `save_summary` to save\n"
  );

  return textResult(chunks.join(""));
}

// ---------------------------------------------------------------------------
// search_podcast
// ---------------------------------------------------------------------------

/**
 * Handle search_podcast tool.
 *
 * Searches YouTube (up to five hits) and/or parses the query as an RSS feed
 * URL (first five episodes), depending on `source`. Errors from either source
 * are reported inline so the other source's results are still returned.
 */
async function handleSearchPodcast(args: {
  query: string;
  source?: "youtube" | "rss" | "all";
}) {
  const { query, source = "all" } = args;
  const results: string[] = [];

  if (source === "youtube" || source === "all") {
    try {
      const ytResults = await searchYouTube(query, 5);

      if (ytResults.length > 0) {
        results.push("## YouTube Results\n");
        ytResults.forEach((video, index) => {
          results.push(`${index + 1}. **${video.title}**`);
          results.push(` - Channel: ${video.uploader}`);
          results.push(` - Date: ${video.uploadDate || "Unknown"}`);
          results.push(` - Duration: ${formatDuration(video.duration)}`);
          results.push(` - URL: ${video.url}`);
          results.push("");
        });
      }
    } catch (error) {
      results.push(`YouTube search error: ${describeError(error)}\n`);
    }
  }

  if (source === "rss" || source === "all") {
    if (query.startsWith("http")) {
      try {
        const feed = await parseFeed(query);

        results.push("## RSS Feed Results\n");
        results.push(`**Podcast:** ${feed.title}\n`);
        results.push("**Recent Episodes:**\n");

        feed.episodes.slice(0, 5).forEach((episode, index) => {
          results.push(`${index + 1}. **${episode.title}**`);
          results.push(` - Date: ${episode.pubDate}`);
          if (episode.audioUrl) {
            results.push(` - Audio URL: ${episode.audioUrl}`);
          } else {
            results.push(` - Audio URL: Not available`);
          }
          results.push("");
        });
      } catch (error) {
        results.push(`RSS parse error: ${describeError(error)}\n`);
      }
    } else if (source === "rss") {
      // Only shown when RSS was asked for explicitly; with "all" a plain
      // search query is expected and needs no hint.
      results.push("💡 **Tip:** Provide a direct RSS feed URL to parse it.");
      results.push("Find podcast RSS feeds at: https://getrssfeed.com/\n");
    }
  }

  if (results.length === 0) {
    return textResult(`No results found for: "${query}"`);
  }

  return textResult(results.join("\n"));
}

// ---------------------------------------------------------------------------
// add_tracking / list_tracking / remove_tracking
// ---------------------------------------------------------------------------

/**
 * Handle add_tracking tool.
 *
 * Rejects YouTube URLs, checks that the feed parses, then adds the podcast to
 * the tracking list.
 */
async function handleAddTracking(args: {
  podcastName: string;
  feedUrl: string;
}) {
  const { podcastName, feedUrl } = args;

  // Validate it's an RSS feed, not YouTube
  if (isYouTubeUrl(feedUrl)) {
    return textResult(
      [
        "❌ YouTube URLs are not supported for tracking. Please provide an RSS feed URL.",
        "",
        "To find a podcast's RSS feed:",
        "- Check the podcast's website",
        "- Use https://getrssfeed.com/",
        '- Look for "RSS" link on Apple Podcasts',
      ].join("\n"),
      true
    );
  }

  // Validate the feed works. Only the parse is guarded here so that a failure
  // while storing the podcast is not misreported as a feed-validation failure.
  let feed;
  try {
    feed = await parseFeed(feedUrl);
  } catch (error) {
    return textResult(
      [
        `❌ Failed to validate RSS feed: ${describeError(error)}`,
        "",
        "Make sure the URL is a valid RSS feed.",
      ].join("\n"),
      true
    );
  }

  const podcast = addTrackedPodcast({
    name: podcastName,
    feedUrl,
  });

  return textResult(
    [
      "✅ Added podcast to tracking list!",
      "",
      `**Podcast:** ${podcast.name}`,
      `**Feed URL:** ${podcast.feedUrl}`,
      `**Episodes in feed:** ${feed.episodes.length}`,
      "",
      "Use `check_new_episodes` to find new episodes to scrape.",
    ].join("\n")
  );
}

/**
 * Handle list_tracking tool.
 *
 * Lists every podcast returned by `getTrackedPodcasts` with its feed URL and
 * last-checked value.
 */
async function handleListTracking() {
  const podcasts = getTrackedPodcasts();

  if (podcasts.length === 0) {
    return textResult(
      "No podcasts are currently being tracked.\n\nUse `add_tracking` to add a podcast."
    );
  }

  const list = podcasts
    .map((p, index) =>
      [
        `${index + 1}. **${p.name}**`,
        ` - Feed: ${p.feedUrl}`,
        ` - Last Checked: ${p.lastChecked || "Never"}`,
      ].join("\n")
    )
    .join("\n\n");

  return textResult(`## Tracked Podcasts (${podcasts.length})\n\n${list}`);
}

/**
 * Handle remove_tracking tool.
 *
 * Reports success when `removeTrackedPodcast` returns a truthy value, and an
 * error result otherwise (matching the file's other not-found responses).
 */
async function handleRemoveTracking(args: { podcastName: string }) {
  const { podcastName } = args;

  const removed = removeTrackedPodcast(podcastName);

  if (!removed) {
    return textResult(`❌ Podcast "${podcastName}" not found in tracking list.`, true);
  }

  return textResult(`✅ Removed "${podcastName}" from tracking list.`);
}

// ---------------------------------------------------------------------------
// Entry point
// ---------------------------------------------------------------------------

/** Clean the temp directory, then serve over a stdio transport. */
async function main(): Promise<void> {
  // Clean up temp directory on startup
  cleanupTempDirectory();

  const transport = new StdioServerTransport();
  await server.connect(transport);

  console.error("MCP Podcast Scraper server running on stdio");
}

main().catch((error) => {
  console.error("Fatal error:", error);
  process.exit(1);
});

Loading blob content...

Latest Blog Posts

Don't Use Large Strings as Cache Keys
By punkpeye on January 11, 2026.
markdown
node-js
cache
What are Claude Skills?
By punkpeye on January 10, 2026.
mcp
skills
How to Test MCP Streamable HTTP Endpoints Using cURL
By punkpeye on January 2, 2026.
tutorial
bash

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/wkoleilat-happytitan/mcp-podcast-scraper'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

index.ts•25.3 kB