import dbManager from '../db-manager';
import { parseRssFeed, type ParsedFeed } from './feed-parser';
import { convertDbItemsToFeedItems, formatFeedResponse } from './formatter';
import { defaultRssFeeds } from '../../../../config';
import type { RssFeed, FeedResponse, FeedItem } from './types';
import FirecrawlApp from '@mendable/firecrawl-js';
import crypto from 'crypto';
/**
 * RSS Manager: coordinates feed persistence (dbManager), RSS parsing
 * (feed-parser) and article scraping/crawling (the Firecrawl SDK).
 *
 * NOTE(review): informational messages are deliberately logged via
 * console.error — presumably stdout is reserved for protocol traffic
 * (e.g. an MCP server). Confirm before redirecting any of this logging.
 */
export class RssManager {
  // Most recently fetched items per feed, keyed by feed URL.
  private readonly cachedFeedItems: Map<string, FeedItem[]> = new Map();
  // Epoch-ms timestamp of the last successful crawlFeed() run.
  private lastFetchTime: number = 0;
  // True once the database has been seeded with (or found to contain) feeds.
  private initialized: boolean = false;
  // Firecrawl client used for article scraping and site crawling.
  private firecrawl: FirecrawlApp;

  constructor() {
    // Initialize Firecrawl with the API key from the environment. A missing
    // key gets a placeholder client so construction never throws; requests
    // made through it will fail at call time instead.
    const apiKey = process.env.FIRECRAWL_API_KEY;
    if (!apiKey) {
      console.warn('FIRECRAWL_API_KEY not found in environment variables. Article fetching may not work correctly.');
      // Create a dummy FirecrawlApp that will throw errors when used
      this.firecrawl = new FirecrawlApp({ apiKey: 'dummy-key' });
    } else {
      this.firecrawl = new FirecrawlApp({ apiKey });
    }
    // Seed default feeds asynchronously. The fixed 1s delay presumably gives
    // the database layer time to come up — TODO confirm; a ready signal from
    // dbManager would be more robust than a timeout.
    setTimeout(() => {
      this.initializeFeeds().catch(err => {
        console.error('Error initializing feeds:', err);
      });
    }, 1000);
  }

  /**
   * Initialize feeds in the database with the configured default feeds,
   * but only when the feeds table is currently empty. Idempotent: the
   * `initialized` flag short-circuits repeat calls.
   */
  private async initializeFeeds(): Promise<void> {
    if (this.initialized) return;
    try {
      const feeds = await dbManager.getAllFeeds();
      // Only seed when the database has no feeds at all, so user-managed
      // feed lists are never overwritten.
      if (feeds.length === 0) {
        for (const feed of defaultRssFeeds) {
          await dbManager.addFeed(feed.url, feed.name, feed.category);
        }
        console.error('Initialized database with default RSS feeds');
      }
      this.initialized = true;
    } catch (error) {
      // Swallow after logging: initialization is best-effort and will be
      // retried implicitly on the next call since `initialized` stays false.
      console.error('Error initializing feeds:', error);
    }
  }

  /**
   * Add a new RSS feed to the database.
   * @returns true when the feed was stored successfully.
   */
  public async addFeed(feed: RssFeed): Promise<boolean> {
    return await dbManager.addFeed(feed.url, feed.name, feed.category);
  }

  /**
   * Remove an RSS feed from the database by its URL.
   * @returns true when a feed was removed.
   */
  public async removeFeed(url: string): Promise<boolean> {
    return await dbManager.removeFeed(url);
  }

  /**
   * Get the list of RSS feeds from the database, mapped to the public
   * RssFeed shape (absent categories become `undefined`).
   */
  public async getFeeds(): Promise<RssFeed[]> {
    const feeds = await dbManager.getAllFeeds();
    return feeds.map((feed: any) => ({
      url: feed.url,
      name: feed.name,
      category: feed.category || undefined
    }));
  }

  /**
   * Add a new interest keyword to the database.
   * @returns true when the keyword was stored successfully.
   */
  public async addKeyword(keyword: string): Promise<boolean> {
    return await dbManager.addKeyword(keyword);
  }

  /**
   * Remove an interest keyword from the database.
   * @returns true when a keyword was removed.
   */
  public async removeKeyword(keyword: string): Promise<boolean> {
    return await dbManager.removeKeyword(keyword);
  }

  /**
   * Get all interest keywords from the database as plain strings.
   */
  public async getKeywords(): Promise<string[]> {
    const keywords = await dbManager.getAllKeywords();
    return keywords.map((k: any) => k.keyword);
  }

  /**
   * Get articles matching the user's interest keywords.
   * @param limit Maximum number of articles to return (default 10).
   * @returns A formatted feed response; an "Error" response on failure.
   */
  public async getArticlesByKeywords(limit: number = 10): Promise<FeedResponse> {
    try {
      const items = await dbManager.getItemsByKeywords(limit);
      const feedItems = convertDbItemsToFeedItems(items);
      return formatFeedResponse(
        feedItems,
        "Articles Matching Your Interests",
        "feed/interests",
        "Articles matching your interest keywords"
      );
    } catch (error) {
      console.error('Error getting articles by keywords:', error);
      return formatFeedResponse([], "Error", "error", "Error getting articles by keywords");
    }
  }

  /**
   * Fetch an article from a URL using Firecrawl and save it to the database.
   * Previously fetched articles are served from the database without a
   * network call.
   * @param url The URL to fetch the article from
   * @returns The fetched article or null if there was an error
   */
  public async fetchArticleFromUrl(url: string): Promise<any> {
    try {
      console.error(`Fetching article from URL: ${url}`);
      // Serve from cache: skip scraping when the article already exists.
      const existingArticle = await dbManager.getArticleByUrl(url);
      if (existingArticle) {
        console.error(`Article already exists in database: ${url}`);
        return existingArticle;
      }
      // The SDK's response shape differs between versions, so the `any` is
      // scoped to this response and fields are probed defensively below.
      const scrapeResponse: any = await this.firecrawl.scrapeUrl(url, {
        formats: ['markdown', 'html']
      });
      if (!scrapeResponse.success) {
        console.error(`Failed to scrape URL: ${url} - ${scrapeResponse.error}`);
        return null;
      }
      // Firecrawl v1 nests page details (title, description, ...) under a
      // `metadata` object; fall back to top-level fields for older shapes.
      const meta = scrapeResponse.metadata ?? {};
      // Deterministic ID derived from the URL so refetches map to the same
      // row instead of creating duplicates.
      const id = `article/${crypto.createHash('md5').update(url).digest('hex')}`;
      const article = {
        id,
        url,
        title: meta.title || scrapeResponse.title || '',
        content: scrapeResponse.markdown || '',
        html: scrapeResponse.html || '',
        author: '', // Firecrawl may not provide author directly
        published_date: '', // Firecrawl may not provide published date directly
        image_url: '', // Firecrawl may not provide image URL directly
        summary: meta.description || scrapeResponse.description || '',
        fetched_at: Date.now()
      };
      const saved = await dbManager.saveArticle(article);
      if (!saved) {
        console.error(`Failed to save article to database: ${url}`);
        return null;
      }
      console.error(`Successfully fetched and saved article: ${url}`);
      return article;
    } catch (error) {
      console.error(`Error fetching article from URL: ${url}`, error);
      return null;
    }
  }

  /**
   * Crawl a website using Firecrawl and save all pages to the database.
   * @param url The URL to crawl
   * @param limit Maximum number of pages to crawl
   * @returns Object with success status and result information
   */
  public async crawlWebsite(url: string, limit: number = 100): Promise<any> {
    try {
      console.error(`Crawling website: ${url} with limit: ${limit}`);
      const crawlResponse = await this.firecrawl.crawlUrl(url, {
        limit: limit,
        scrapeOptions: {
          formats: ['markdown', 'html'],
        }
      });
      if (!crawlResponse.success) {
        console.error(`Failed to crawl website: ${url} - ${crawlResponse.error}`);
        return { success: false, error: crawlResponse.error };
      }
      console.error(`Successfully crawled website: ${url}`);
      let savedCount = 0;
      // Use type assertion to access response data: the SDK's typed response
      // does not expose the document list uniformly across versions.
      const responseData = crawlResponse as any;
      // Firecrawl v1 returns crawled documents in `data`; older releases
      // used `pages`. Accept either so no documents are silently dropped.
      const pages = responseData.data || responseData.pages || [];
      if (Array.isArray(pages)) {
        for (const page of pages) {
          // v1 documents carry their URL in metadata.sourceURL.
          const pageUrl = page.url || page.metadata?.sourceURL;
          if (!pageUrl) continue;
          const meta = page.metadata ?? {};
          // Deterministic per-URL ID, same scheme as fetchArticleFromUrl.
          const id = `article/${crypto.createHash('md5').update(pageUrl).digest('hex')}`;
          const article = {
            id,
            url: pageUrl,
            title: meta.title || page.title || '',
            content: page.markdown || '',
            html: page.html || '',
            author: meta.author || page.author || '',
            published_date: meta.publishedTime || page.publishedDate || '',
            image_url: meta.ogImage || page.imageUrl || '',
            summary: meta.description || page.description || '',
            fetched_at: Date.now()
          };
          const saved = await dbManager.saveArticle(article);
          if (saved) {
            savedCount++;
          }
        }
      }
      return {
        success: true,
        totalPages: pages.length,
        savedPages: savedCount
      };
    } catch (error) {
      console.error(`Error crawling website: ${url}`, error);
      return {
        success: false,
        error: error instanceof Error ? error.message : String(error)
      };
    }
  }

  /**
   * Asynchronously crawl a website using Firecrawl. Unlike crawlWebsite,
   * this only starts the job; poll checkCrawlStatus with the returned ID.
   * @param url The URL to crawl
   * @param limit Maximum number of pages to crawl
   * @returns Object with crawl ID and status
   */
  public async asyncCrawlWebsite(url: string, limit: number = 100): Promise<any> {
    try {
      console.error(`Starting async crawl of website: ${url} with limit: ${limit}`);
      const crawlResponse = await this.firecrawl.asyncCrawlUrl(url, {
        limit: limit,
        scrapeOptions: {
          formats: ['markdown', 'html'],
        }
      });
      if (!crawlResponse.success) {
        console.error(`Failed to start async crawl: ${url} - ${crawlResponse.error}`);
        return { success: false, error: crawlResponse.error };
      }
      console.error(`Successfully started async crawl of website: ${url}, crawl ID: ${crawlResponse.id}`);
      // Use type assertion to access response data not in the typed surface.
      const responseData = crawlResponse as any;
      return {
        success: true,
        id: crawlResponse.id,
        status: responseData.status || 'pending'
      };
    } catch (error) {
      console.error(`Error starting async crawl of website: ${url}`, error);
      return {
        success: false,
        error: error instanceof Error ? error.message : String(error)
      };
    }
  }

  /**
   * Check the status of an asynchronous crawl started via asyncCrawlWebsite.
   * @param crawlId The ID of the crawl job
   * @returns The current status of the crawl job
   */
  public async checkCrawlStatus(crawlId: string): Promise<any> {
    try {
      console.error(`Checking status of crawl: ${crawlId}`);
      const statusResponse = await this.firecrawl.checkCrawlStatus(crawlId);
      if (!statusResponse.success) {
        console.error(`Failed to check crawl status: ${crawlId} - ${statusResponse.error}`);
        return { success: false, error: statusResponse.error };
      }
      // Use type assertion to access response data not in the typed surface.
      const responseData = statusResponse as any;
      return {
        success: true,
        status: responseData.status || 'unknown',
        pagesProcessed: responseData.pagesProcessed || 0,
        totalPages: responseData.totalPages || 0
      };
    } catch (error) {
      console.error(`Error checking crawl status: ${crawlId}`, error);
      return {
        success: false,
        error: error instanceof Error ? error.message : String(error)
      };
    }
  }

  /**
   * Cancel an asynchronous crawl job.
   * @param crawlId The ID of the crawl job to cancel
   * @returns The cancellation status
   */
  public async cancelCrawl(crawlId: string): Promise<any> {
    try {
      console.error(`Cancelling crawl: ${crawlId}`);
      const cancelResponse = await this.firecrawl.cancelCrawl(crawlId);
      if (!cancelResponse.success) {
        console.error(`Failed to cancel crawl: ${crawlId} - ${cancelResponse.error}`);
        return { success: false, error: cancelResponse.error };
      }
      return {
        success: true,
        message: `Crawl ${crawlId} cancelled successfully`
      };
    } catch (error) {
      console.error(`Error cancelling crawl: ${crawlId}`, error);
      return {
        success: false,
        error: error instanceof Error ? error.message : String(error)
      };
    }
  }

  /**
   * Get articles from the database.
   * @param limit Maximum number of articles to return
   * @returns Array of articles (empty array on error)
   */
  public async getArticles(limit: number = 10): Promise<any[]> {
    try {
      console.error(`Getting articles with limit: ${limit}`);
      return await dbManager.getAllArticles(limit);
    } catch (error) {
      console.error(`Error getting articles: ${error}`);
      return [];
    }
  }

  /**
   * Search articles in the database.
   * @param query Search query
   * @param limit Maximum number of articles to return
   * @returns Array of matching articles (empty array on error)
   */
  public async searchArticles(query: string, limit: number = 10): Promise<any[]> {
    try {
      console.error(`Searching articles with query: ${query}, limit: ${limit}`);
      return await dbManager.searchArticles(query, limit);
    } catch (error) {
      console.error(`Error searching articles: ${error}`);
      return [];
    }
  }

  /**
   * Crawl articles from all RSS feeds and store them in SQLite.
   * Feeds are fetched in parallel; a failing feed is logged and skipped
   * without aborting the others.
   * @param limit Maximum number of items in the aggregated response.
   */
  public async crawlFeed(limit: number = 10): Promise<FeedResponse> {
    try {
      const allItems: FeedItem[] = [];
      const feeds = await this.getFeeds();
      // Fetch all feeds concurrently; each promise handles its own failure
      // so Promise.all never rejects.
      const feedPromises = feeds.map(feed =>
        this.fetchFeed(feed.url, feed.name, feed.category)
          .then(items => {
            if (items && items.length > 0) {
              allItems.push(...items);
              // Cache the items for this feed for later reuse.
              this.cachedFeedItems.set(feed.url, items);
            }
          })
          .catch(error => {
            console.error(`Error fetching feed ${feed.url}:`, error);
          })
      );
      await Promise.all(feedPromises);
      // Sort all items by published date (newest first); missing dates sort last.
      allItems.sort((a, b) => (b.published || 0) - (a.published || 0));
      this.lastFetchTime = Date.now();
      return formatFeedResponse(
        allItems.slice(0, limit),
        "RSS Manager Feeds",
        "feed/all",
        "Aggregated feeds from RSS Manager"
      );
    } catch (error) {
      console.error('Error fetching feeds:', error);
      return formatFeedResponse([], "Error", "error", "Error fetching feeds");
    }
  }

  /**
   * Fetch a single RSS feed and store it in SQLite.
   * @param itemsPerFeed Number of items to fetch per feed (default: 20)
   * @returns The stored items, or an empty array when the feed is empty
   *          or fails to parse.
   */
  private async fetchFeed(feedUrl: string, feedName: string, category?: string, itemsPerFeed: number = 20): Promise<FeedItem[]> {
    try {
      const parsedFeed: ParsedFeed = await parseRssFeed(feedUrl);
      if (!parsedFeed || !parsedFeed.items || parsedFeed.items.length === 0) {
        console.warn(`No items found in feed: ${feedUrl}`);
        return [];
      }
      // Prefer the caller-provided name; fall back to the parsed feed title.
      const title = feedName || parsedFeed.title || 'Unknown Feed';
      // Cap the number of items persisted per feed.
      const limitedItems = parsedFeed.items.slice(0, itemsPerFeed);
      console.error(`Fetched ${limitedItems.length} items from feed: ${feedUrl}`);
      await dbManager.saveFeed(
        feedUrl,
        title,
        category,
        limitedItems
      );
      return limitedItems;
    } catch (error) {
      console.error(`Error fetching feed ${feedUrl}:`, error);
      return [];
    }
  }

  /**
   * Get the latest articles from the database as a formatted feed response.
   * @param limit Maximum number of items to return.
   */
  public async getLatestArticles(limit: number = 10): Promise<FeedResponse> {
    try {
      const items = await dbManager.getItems(limit);
      const feedItems = convertDbItemsToFeedItems(items);
      return formatFeedResponse(
        feedItems,
        "Latest RSS Feeds",
        "feed/latest",
        "Latest articles from RSS feeds"
      );
    } catch (error) {
      console.error('Error getting latest feeds:', error);
      return formatFeedResponse([], "Error", "error", "Error getting latest feeds");
    }
  }

  /**
   * Get feed items belonging to a category from the database.
   * @param category Category name as stored with the feed.
   * @param limit Maximum number of items to return.
   */
  public async getFeedsByCategory(category: string, limit: number = 10): Promise<FeedResponse> {
    try {
      const items = await dbManager.getItemsByCategory(category, limit);
      const feedItems = convertDbItemsToFeedItems(items);
      return formatFeedResponse(
        feedItems,
        `${category} Feeds`,
        `category/${category}`,
        `Feeds from the ${category} category`
      );
    } catch (error) {
      console.error(`Error getting feeds by category ${category}:`, error);
      return formatFeedResponse([], "Error", "error", `Error getting feeds for category: ${category}`);
    }
  }

  /**
   * Search feed items in the database.
   * @param query Search query forwarded to dbManager.searchItems.
   * @param limit Maximum number of items to return.
   */
  public async searchFeeds(query: string, limit: number = 10): Promise<FeedResponse> {
    try {
      const items = await dbManager.searchItems(query, limit);
      const feedItems = convertDbItemsToFeedItems(items);
      return formatFeedResponse(
        feedItems,
        `Search Results for "${query}"`,
        `search/${query}`,
        `Search results for "${query}"`
      );
    } catch (error) {
      console.error(`Error searching feeds for ${query}:`, error);
      return formatFeedResponse([], "Error", "error", `Error searching feeds for: ${query}`);
    }
  }
}
// Create and export an instance of the RSS manager
const rssManager = new RssManager();
export default rssManager;