
MCP-RSS-Crawler

by mshk
index.ts (17.8 kB)
import dbManager from '../db-manager';
import { parseRssFeed, type ParsedFeed } from './feed-parser';
import { convertDbItemsToFeedItems, formatFeedResponse } from './formatter';
import { defaultRssFeeds } from '../../../../config';
import type { RssFeed, FeedResponse, FeedItem } from './types';
import FirecrawlApp from '@mendable/firecrawl-js';
import crypto from 'crypto';

/**
 * RSS Manager class to handle RSS feed operations
 */
export class RssManager {
  private cachedFeedItems: Map<string, FeedItem[]> = new Map();
  private lastFetchTime: number = 0;
  private initialized: boolean = false;
  // Typed as `any` because response shapes vary across Firecrawl SDK versions
  private firecrawl: any;

  constructor() {
    // Initialize firecrawl with API key from environment variables
    const apiKey = process.env.FIRECRAWL_API_KEY;
    if (!apiKey) {
      console.warn('FIRECRAWL_API_KEY not found in environment variables. Article fetching may not work correctly.');
      // Create a dummy FirecrawlApp that will throw errors when used
      this.firecrawl = new FirecrawlApp({ apiKey: 'dummy-key' });
    } else {
      this.firecrawl = new FirecrawlApp({ apiKey });
    }

    // Initialize with default feeds if needed - do this asynchronously
    setTimeout(() => {
      this.initializeFeeds().catch(err => {
        console.error('Error initializing feeds:', err);
      });
    }, 1000);
  }

  /**
   * Initialize feeds in the database with default feeds if needed
   */
  private async initializeFeeds(): Promise<void> {
    if (this.initialized) return;

    try {
      // Get all feeds from the database
      const feeds = await dbManager.getAllFeeds();

      // If no feeds exist, add the default feeds
      if (feeds.length === 0) {
        for (const feed of defaultRssFeeds) {
          await dbManager.addFeed(feed.url, feed.name, feed.category);
        }
        // Informational logs go to stderr so stdout stays free for the MCP stdio transport
        console.error('Initialized database with default RSS feeds');
      }

      this.initialized = true;
    } catch (error) {
      console.error('Error initializing feeds:', error);
    }
  }

  /**
   * Add a new RSS feed to the database
   */
  public async addFeed(feed: RssFeed): Promise<boolean> {
    return await dbManager.addFeed(feed.url, feed.name, feed.category);
  }

  /**
   * Remove an RSS feed from the database
   */
  public async removeFeed(url: string): Promise<boolean> {
    return await dbManager.removeFeed(url);
  }

  /**
   * Get the list of RSS feeds from the database
   */
  public async getFeeds(): Promise<RssFeed[]> {
    const feeds = await dbManager.getAllFeeds();
    return feeds.map((feed: any) => ({
      url: feed.url,
      name: feed.name,
      category: feed.category || undefined
    }));
  }

  /**
   * Add a new interest keyword to the database
   */
  public async addKeyword(keyword: string): Promise<boolean> {
    return await dbManager.addKeyword(keyword);
  }

  /**
   * Remove an interest keyword from the database
   */
  public async removeKeyword(keyword: string): Promise<boolean> {
    return await dbManager.removeKeyword(keyword);
  }

  /**
   * Get all interest keywords from the database
   */
  public async getKeywords(): Promise<string[]> {
    const keywords = await dbManager.getAllKeywords();
    return keywords.map((k: any) => k.keyword);
  }

  /**
   * Get articles matching the user's interest keywords
   */
  public async getArticlesByKeywords(limit: number = 10): Promise<FeedResponse> {
    try {
      // Get items matching keywords from the database
      const items = await dbManager.getItemsByKeywords(limit);

      // Convert to feed items
      const feedItems = convertDbItemsToFeedItems(items);

      // Return formatted response
      return formatFeedResponse(
        feedItems,
        "Articles Matching Your Interests",
        "feed/interests",
        "Articles matching your interest keywords"
      );
    } catch (error) {
      console.error('Error getting articles by keywords:', error);
      return formatFeedResponse([], "Error", "error", "Error getting articles by keywords");
    }
  }

  /**
   * Fetch an article from a URL using firecrawl and save it to the database
   * @param url The URL to fetch the article from
   * @returns The fetched article or null if there was an error
   */
  public async fetchArticleFromUrl(url: string): Promise<any> {
    try {
      console.error(`Fetching article from URL: ${url}`);

      // Check if the article already exists in the database
      const existingArticle = await dbManager.getArticleByUrl(url);
      if (existingArticle) {
        console.error(`Article already exists in database: ${url}`);
        return existingArticle;
      }

      // Use firecrawl to fetch the article
      const scrapeResponse = await this.firecrawl.scrapeUrl(url, {
        formats: ['markdown', 'html']
      });

      if (!scrapeResponse.success) {
        console.error(`Failed to scrape URL: ${url} - ${scrapeResponse.error}`);
        return null;
      }

      // Generate a unique ID for the article
      const id = `article/${crypto.createHash('md5').update(url).digest('hex')}`;

      // Save the article to the database.
      // Recent Firecrawl SDKs nest title/description under `metadata`,
      // so those are read as fallbacks behind the original top-level fields.
      const article = {
        id,
        url,
        title: scrapeResponse.title || scrapeResponse.metadata?.title || '',
        content: scrapeResponse.markdown || '',
        html: scrapeResponse.html || '',
        author: '', // Firecrawl may not provide author directly
        published_date: '', // Firecrawl may not provide published date directly
        image_url: '', // Firecrawl may not provide image URL directly
        summary: scrapeResponse.description || scrapeResponse.metadata?.description || '',
        fetched_at: Date.now()
      };

      const saved = await dbManager.saveArticle(article);
      if (!saved) {
        console.error(`Failed to save article to database: ${url}`);
        return null;
      }

      console.error(`Successfully fetched and saved article: ${url}`);
      return article;
    } catch (error) {
      console.error(`Error fetching article from URL: ${url}`, error);
      return null;
    }
  }

  /**
   * Crawl a website using Firecrawl and save all pages to the database
   * @param url The URL to crawl
   * @param limit Maximum number of pages to crawl
   * @returns Object with success status and result information
   */
  public async crawlWebsite(url: string, limit: number = 100): Promise<any> {
    try {
      console.error(`Crawling website: ${url} with limit: ${limit}`);

      const crawlResponse = await this.firecrawl.crawlUrl(url, {
        limit: limit,
        scrapeOptions: {
          formats: ['markdown', 'html'],
        }
      });

      if (!crawlResponse.success) {
        console.error(`Failed to crawl website: ${url} - ${crawlResponse.error}`);
        return { success: false, error: crawlResponse.error };
      }

      console.error(`Successfully crawled website: ${url}`);

      // Process and save each page
      let savedCount = 0;

      // Use type assertion to access response data.
      // Recent Firecrawl SDKs return crawled documents under `data`;
      // `pages` is kept as a fallback for other response shapes.
      const responseData = crawlResponse as any;
      const pages = responseData.data || responseData.pages || [];

      if (Array.isArray(pages)) {
        for (const page of pages) {
          // The source URL may live under `metadata.sourceURL` in recent SDKs
          const pageUrl = page.url || page.metadata?.sourceURL;
          if (!pageUrl) continue;

          // Generate a unique ID for the article
          const id = `article/${crypto.createHash('md5').update(pageUrl).digest('hex')}`;

          // Save the article to the database
          const article = {
            id,
            url: pageUrl,
            title: page.title || page.metadata?.title || '',
            content: page.markdown || '',
            html: page.html || '',
            author: page.author || '',
            published_date: page.publishedDate || '',
            image_url: page.imageUrl || '',
            summary: page.description || page.metadata?.description || '',
            fetched_at: Date.now()
          };

          const saved = await dbManager.saveArticle(article);
          if (saved) {
            savedCount++;
          }
        }
      }

      return {
        success: true,
        totalPages: pages.length,
        savedPages: savedCount
      };
    } catch (error) {
      console.error(`Error crawling website: ${url}`, error);
      return { success: false, error: error instanceof Error ? error.message : String(error) };
    }
  }

  /**
   * Asynchronously crawl a website using Firecrawl
   * @param url The URL to crawl
   * @param limit Maximum number of pages to crawl
   * @returns Object with crawl ID and status
   */
  public async asyncCrawlWebsite(url: string, limit: number = 100): Promise<any> {
    try {
      console.error(`Starting async crawl of website: ${url} with limit: ${limit}`);

      const crawlResponse = await this.firecrawl.asyncCrawlUrl(url, {
        limit: limit,
        scrapeOptions: {
          formats: ['markdown', 'html'],
        }
      });

      if (!crawlResponse.success) {
        console.error(`Failed to start async crawl: ${url} - ${crawlResponse.error}`);
        return { success: false, error: crawlResponse.error };
      }

      console.error(`Successfully started async crawl of website: ${url}, crawl ID: ${crawlResponse.id}`);

      // Use type assertion to access response data
      const responseData = crawlResponse as any;

      return {
        success: true,
        id: crawlResponse.id,
        status: responseData.status || 'pending'
      };
    } catch (error) {
      console.error(`Error starting async crawl of website: ${url}`, error);
      return { success: false, error: error instanceof Error ? error.message : String(error) };
    }
  }

  /**
   * Check the status of an asynchronous crawl
   * @param crawlId The ID of the crawl job
   * @returns The current status of the crawl job
   */
  public async checkCrawlStatus(crawlId: string): Promise<any> {
    try {
      console.error(`Checking status of crawl: ${crawlId}`);

      const statusResponse = await this.firecrawl.checkCrawlStatus(crawlId);

      if (!statusResponse.success) {
        console.error(`Failed to check crawl status: ${crawlId} - ${statusResponse.error}`);
        return { success: false, error: statusResponse.error };
      }

      // Use type assertion to access response data.
      // Recent SDKs report progress as `completed`/`total`; the original
      // field names are kept as fallbacks.
      const responseData = statusResponse as any;

      return {
        success: true,
        status: responseData.status || 'unknown',
        pagesProcessed: responseData.completed ?? responseData.pagesProcessed ?? 0,
        totalPages: responseData.total ?? responseData.totalPages ?? 0
      };
    } catch (error) {
      console.error(`Error checking crawl status: ${crawlId}`, error);
      return { success: false, error: error instanceof Error ? error.message : String(error) };
    }
  }

  /**
   * Cancel an asynchronous crawl job
   * @param crawlId The ID of the crawl job to cancel
   * @returns The cancellation status
   */
  public async cancelCrawl(crawlId: string): Promise<any> {
    try {
      console.error(`Cancelling crawl: ${crawlId}`);

      const cancelResponse = await this.firecrawl.cancelCrawl(crawlId);

      if (!cancelResponse.success) {
        console.error(`Failed to cancel crawl: ${crawlId} - ${cancelResponse.error}`);
        return { success: false, error: cancelResponse.error };
      }

      return {
        success: true,
        message: `Crawl ${crawlId} cancelled successfully`
      };
    } catch (error) {
      console.error(`Error cancelling crawl: ${crawlId}`, error);
      return { success: false, error: error instanceof Error ? error.message : String(error) };
    }
  }

  /**
   * Get articles from the database
   * @param limit Maximum number of articles to return
   * @returns Array of articles
   */
  public async getArticles(limit: number = 10): Promise<any[]> {
    try {
      console.error(`Getting articles with limit: ${limit}`);
      return await dbManager.getAllArticles(limit);
    } catch (error) {
      console.error(`Error getting articles: ${error}`);
      return [];
    }
  }

  /**
   * Search articles in the database
   * @param query Search query
   * @param limit Maximum number of articles to return
   * @returns Array of matching articles
   */
  public async searchArticles(query: string, limit: number = 10): Promise<any[]> {
    try {
      console.error(`Searching articles with query: ${query}, limit: ${limit}`);
      return await dbManager.searchArticles(query, limit);
    } catch (error) {
      console.error(`Error searching articles: ${error}`);
      return [];
    }
  }

  /**
   * Crawl articles from all RSS feeds and store in SQLite
   */
  public async crawlFeed(limit: number = 10): Promise<FeedResponse> {
    try {
      // Collect items from all feeds
      const allItems: FeedItem[] = [];

      // Get all feeds from the database
      const feeds = await this.getFeeds();

      // Process feeds in parallel
      const feedPromises = feeds.map(feed =>
        this.fetchFeed(feed.url, feed.name, feed.category)
          .then(items => {
            // Add items to the collection
            if (items && items.length > 0) {
              allItems.push(...items);
              // Cache the items for this feed
              this.cachedFeedItems.set(feed.url, items);
            }
          })
          .catch(error => {
            console.error(`Error fetching feed ${feed.url}:`, error);
          })
      );

      // Wait for all feeds to be processed
      await Promise.all(feedPromises);

      // Sort all items by published date (newest first)
      allItems.sort((a, b) => (b.published || 0) - (a.published || 0));

      // Update last fetch time
      this.lastFetchTime = Date.now();

      // Return formatted response with limited items
      return formatFeedResponse(
        allItems.slice(0, limit),
        "RSS Manager Feeds",
        "feed/all",
        "Aggregated feeds from RSS Manager"
      );
    } catch (error) {
      console.error('Error fetching feeds:', error);
      return formatFeedResponse([], "Error", "error", "Error fetching feeds");
    }
  }

  /**
   * Fetch a single RSS feed and store in SQLite
   * @param itemsPerFeed Number of items to fetch per feed (default: 20)
   */
  private async fetchFeed(feedUrl: string, feedName: string, category?: string, itemsPerFeed: number = 20): Promise<FeedItem[]> {
    try {
      // Parse the RSS feed
      const parsedFeed: ParsedFeed = await parseRssFeed(feedUrl);

      if (!parsedFeed || !parsedFeed.items || parsedFeed.items.length === 0) {
        console.warn(`No items found in feed: ${feedUrl}`);
        return [];
      }

      // Prefer the provided feed name; fall back to the parsed feed title
      const title = feedName || parsedFeed.title || 'Unknown Feed';

      // Limit items to the specified number
      const limitedItems = parsedFeed.items.slice(0, itemsPerFeed);
      console.error(`Fetched ${limitedItems.length} items from feed: ${feedUrl}`);

      // Save the feed and its items to the database
      await dbManager.saveFeed(feedUrl, title, category, limitedItems);

      return limitedItems;
    } catch (error) {
      console.error(`Error fetching feed ${feedUrl}:`, error);
      return [];
    }
  }

  /**
   * Get the latest feeds from the database
   */
  public async getLatestArticles(limit: number = 10): Promise<FeedResponse> {
    try {
      // Get items from the database
      const items = await dbManager.getItems(limit);

      // Convert to feed items
      const feedItems = convertDbItemsToFeedItems(items);

      // Return formatted response
      return formatFeedResponse(
        feedItems,
        "Latest RSS Feeds",
        "feed/latest",
        "Latest articles from RSS feeds"
      );
    } catch (error) {
      console.error('Error getting latest feeds:', error);
      return formatFeedResponse([], "Error", "error", "Error getting latest feeds");
    }
  }

  /**
   * Get feeds by category from the database
   */
  public async getFeedsByCategory(category: string, limit: number = 10): Promise<FeedResponse> {
    try {
      // Get items by category from the database
      const items = await dbManager.getItemsByCategory(category, limit);

      // Convert to feed items
      const feedItems = convertDbItemsToFeedItems(items);

      // Return formatted response
      return formatFeedResponse(
        feedItems,
        `${category} Feeds`,
        `category/${category}`,
        `Feeds from the ${category} category`
      );
    } catch (error) {
      console.error(`Error getting feeds by category ${category}:`, error);
      return formatFeedResponse([], "Error", "error", `Error getting feeds for category: ${category}`);
    }
  }

  /**
   * Search feeds in the database
   */
  public async searchFeeds(query: string, limit: number = 10): Promise<FeedResponse> {
    try {
      // Search items in the database
      const items = await dbManager.searchItems(query, limit);

      // Convert to feed items
      const feedItems = convertDbItemsToFeedItems(items);

      // Return formatted response
      return formatFeedResponse(
        feedItems,
        `Search Results for "${query}"`,
        `search/${query}`,
        `Search results for "${query}"`
      );
    } catch (error) {
      console.error(`Error searching feeds for ${query}:`, error);
      return formatFeedResponse([], "Error", "error", `Error searching feeds for: ${query}`);
    }
  }
}

// Create and export an instance of the RSS manager
const rssManager = new RssManager();
export default rssManager;
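For orientation, here is a minimal consumption sketch (not part of index.ts): the method names and signatures come from the class above, while the import path, feed URL, and keyword are hypothetical.

// usage-sketch.ts — hypothetical consumer of the exported RssManager singleton
import rssManager from './rss-manager'; // hypothetical path to the module above

async function demo(): Promise<void> {
  // Register a feed, then fetch and persist its latest items
  await rssManager.addFeed({
    url: 'https://example.com/rss.xml', // hypothetical feed URL
    name: 'Example Feed',
    category: 'news'
  });
  const aggregated = await rssManager.crawlFeed(10);
  console.log(JSON.stringify(aggregated, null, 2)); // FeedResponse built by formatFeedResponse

  // Keyword-based filtering over stored items
  await rssManager.addKeyword('typescript'); // hypothetical keyword
  const byInterest = await rssManager.getArticlesByKeywords(5);
  console.log(JSON.stringify(byInterest, null, 2));

  // Full-text search over stored feed items
  const results = await rssManager.searchFeeds('mcp', 5);
  console.log(JSON.stringify(results, null, 2));
}

demo().catch(console.error);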
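The async crawl methods pair naturally with a polling loop. A sketch under the same assumptions; note that 'completed' as the terminal status is a guess, since the SDK's actual status strings are not shown above.

// crawl-polling-sketch.ts — hypothetical polling loop over the async crawl API
import rssManager from './rss-manager'; // hypothetical path

async function crawlWithPolling(url: string): Promise<void> {
  const started = await rssManager.asyncCrawlWebsite(url, 50);
  if (!started.success) throw new Error(String(started.error));

  for (;;) {
    const status = await rssManager.checkCrawlStatus(started.id);
    if (!status.success) throw new Error(String(status.error));
    console.error(`crawl ${started.id}: ${status.status} (${status.pagesProcessed}/${status.totalPages})`);
    if (status.status === 'completed') break; // assumed terminal status
    await new Promise(resolve => setTimeout(resolve, 5_000)); // poll every 5s
  }
}

crawlWithPolling('https://example.com').catch(console.error);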

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/mshk/mcp-rss-crawler'
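
The equivalent request from TypeScript, as a minimal sketch (assumes a fetch-capable runtime such as Node 18+ in an ES module; the response shape is logged as-is rather than assumed):

// Fetch this server's entry from the Glama MCP directory API
const res = await fetch('https://glama.ai/api/mcp/v1/servers/mshk/mcp-rss-crawler');
if (!res.ok) throw new Error(`HTTP ${res.status}`);
console.log(await res.json());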

If you have feedback or need assistance with the MCP directory API, please join our Discord server.