// rss-reader.ts (6 kB)
import { parseFeed } from '@rowanmanning/feed-parser';
import { Feed } from '@rowanmanning/feed-parser/lib/feed/base.js';
import { FeedItem as RSSFeedItem } from '@rowanmanning/feed-parser/lib/feed/item/base.js';
import { FeedResult, FeedInfo, FeedItem, Enclosure } from '../types.js';
import { httpClient } from '../utils/http.js';
import { toEpochMs } from '../utils/date.js';
import { extractCleanContent, sanitizeString } from '../utils/content.js';
import { logger } from '../logger.js';
import { config } from '../config.js';
import { nanoid } from 'nanoid';
/**
 * Fetches, parses and normalises RSS/Atom feeds into the project's
 * `FeedResult` structure.
 */
export class RSSReader {
  /**
   * Fetches raw RSS feed data from a URL, optionally as a conditional request.
   *
   * @param url - Feed URL to request.
   * @param etag - Previously stored `ETag`; sent as `If-None-Match` when set.
   * @param lastModified - Previously stored `Last-Modified`; sent as
   *   `If-Modified-Since` when set.
   * @returns The response body plus cache validators. On HTTP 304 `data` is
   *   empty, `notModified` is true and the validators fall back to the ones
   *   that were sent when the response does not repeat them.
   * @throws Error prefixed with `Failed to fetch RSS feed:` for any other
   *   failure; the original error is attached as `cause`.
   */
  async fetchRawFeed(url: string, etag?: string, lastModified?: string): Promise<{
    data: string;
    etag?: string;
    lastModified?: string;
    notModified: boolean;
  }> {
    logger.debug(`Fetching RSS feed from: ${url}`);

    const headers: Record<string, string> = {
      'User-Agent': config.rssUserAgent,
      'Accept': 'application/rss+xml, application/atom+xml, application/xml, text/xml, */*',
    };

    if (etag) {
      headers['If-None-Match'] = etag;
    }
    if (lastModified) {
      headers['If-Modified-Since'] = lastModified;
    }

    try {
      const response = await httpClient.get(url, {
        headers,
        timeout: config.rssRequestTimeout,
        maxRedirects: config.rssFollowRedirects ? 5 : 0,
        responseType: 'text',
        maxContentLength: config.rssMaxResponseSize,
      });

      if (response.status === 304) {
        logger.debug(`Feed not modified: ${url}`);
        return {
          data: '',
          notModified: true,
          // A 304 confirms the validators we sent are still current.
          etag: response.headers.etag ?? etag,
          lastModified: response.headers['last-modified'] ?? lastModified,
        };
      }

      return {
        data: response.data,
        etag: response.headers.etag,
        lastModified: response.headers['last-modified'],
        notModified: false,
      };
    } catch (error: unknown) {
      // NOTE(review): axios-style clients reject on non-2xx statuses unless
      // `validateStatus` is overridden, in which case a 304 lands here rather
      // than in the branch above — confirm how `httpClient` is configured.
      const rejected = RSSReader.attachedResponse(error);
      if (rejected !== null && rejected.status === 304) {
        logger.debug(`Feed not modified: ${url}`);
        return {
          data: '',
          notModified: true,
          etag: RSSReader.headerValue(rejected.headers, 'etag') ?? etag,
          lastModified: RSSReader.headerValue(rejected.headers, 'last-modified') ?? lastModified,
        };
      }

      const message = RSSReader.errorMessage(error);
      logger.error(`Error fetching RSS feed from ${url}: ${message}`);
      // Keep the original error reachable (status code, stack) for callers.
      throw Object.assign(new Error(`Failed to fetch RSS feed: ${message}`), { cause: error });
    }
  }

  /**
   * Parses RSS feed XML into structured data.
   *
   * @param xml - Raw feed document.
   * @returns The parsed feed, or `null` (after logging) when parsing throws.
   */
  async parseFeed(xml: string): Promise<Feed | null> {
    try {
      return parseFeed(xml);
    } catch (error: unknown) {
      logger.error(`Error parsing RSS feed: ${RSSReader.errorMessage(error)}`);
      return null;
    }
  }

  /**
   * Formats parsed feed into our internal structure.
   *
   * @param feed - Parsed feed.
   * @param feedUrl - URL the feed was fetched from; copied into `info.feedUrl`.
   * @param useDescriptionAsContent - When true and an item has a description,
   *   the cleaned description replaces the item's content.
   * @returns Feed info, at most `config.rssMaxItemsPerFeed` items, and the
   *   current time as `fetchedAt`.
   */
  formatFeed(feed: Feed, feedUrl: string, useDescriptionAsContent?: boolean): FeedResult {
    const feedJson = feed.toJSON();
    const { items: _, ...feedMeta } = feedJson;

    // Extract feed info
    const info: FeedInfo = {
      title: feedMeta.title || null,
      description: feedMeta.description || null,
      url: feedMeta.url || null,
      feedUrl,
      language: feedMeta.language || null,
      copyright: feedMeta.copyright || null,
      published: toEpochMs(feedMeta.published),
      updated: toEpochMs(feedMeta.updated),
      categories: RSSReader.categoryNames(feedMeta.categories),
      author: feedMeta.authors?.[0]?.name || null,
      image: feedMeta.image ? {
        url: feedMeta.image.url || null,
        title: feedMeta.image.title || null,
      } : null,
    };

    // Format items
    const items: FeedItem[] = feed.items
      .slice(0, config.rssMaxItemsPerFeed)
      .map((item: RSSFeedItem) => {
        const itemJson = item.toJSON();

        // Clean content and description
        let content = itemJson.content || null;
        let description = itemJson.description || null;

        if (content) {
          content = extractCleanContent(content).text;
        }
        if (description) {
          description = extractCleanContent(description).text;
        }

        // Handle useDescriptionAsContent option
        if (useDescriptionAsContent && description) {
          content = description;
        }

        // Identifier the feed itself declared, if any. `guid` may exist on
        // parser subclasses; `id` is read from the JSON form as a fallback.
        // NOTE(review): feed-parser appears to expose <guid>/<id> as `id` — confirm.
        const declaredGuid =
          RSSReader.readIdentifier(item, 'guid') ?? RSSReader.readIdentifier(itemJson, 'id');
        // A random guid is only a last resort; it changes on every fetch.
        const guid = declaredGuid ?? nanoid();

        // NOTE(review): feed-parser appears to expose attachments as `media`
        // rather than `enclosures`; if so this list is always empty — confirm.
        const enclosures: Enclosure[] = ('enclosures' in item && Array.isArray(item.enclosures))
          ? item.enclosures.map((enc: { url: string; type?: string; length?: string }) => ({
              url: enc.url,
              type: enc.type || null,
              length: RSSReader.parseByteLength(enc.length),
            }))
          : [];

        return {
          // Prefer stable values (declared guid, then URL, then title) so the
          // same item keeps the same id across fetches.
          id: sanitizeString(declaredGuid || itemJson.url || itemJson.title || guid),
          title: itemJson.title || null,
          url: itemJson.url || null,
          content,
          description,
          published: toEpochMs(itemJson.published),
          updated: toEpochMs(itemJson.updated),
          author: itemJson.authors?.[0]?.name || null,
          categories: RSSReader.categoryNames(itemJson.categories),
          enclosures,
          guid,
        };
      });

    return {
      info,
      items,
      fetchedAt: Date.now(),
    };
  }

  /**
   * Complete feed fetching and parsing pipeline.
   *
   * @param url - Feed URL.
   * @param options - Optional conditional-request validators and the
   *   `useDescriptionAsContent` formatting switch.
   * @returns The formatted feed, with `etag` / `lastModified` set when the
   *   response supplied them.
   * @throws Error with message `NOT_MODIFIED` when the server answers 304.
   * @throws Error with message `Failed to parse feed XML` when parsing fails.
   * @throws Whatever `fetchRawFeed` throws on transport errors.
   */
  async fetchFeed(
    url: string,
    options?: {
      useDescriptionAsContent?: boolean;
      etag?: string;
      lastModified?: string;
    }
  ): Promise<FeedResult> {
    // Fetch raw feed
    const { data, etag, lastModified, notModified } = await this.fetchRawFeed(
      url,
      options?.etag,
      options?.lastModified
    );

    if (notModified) {
      throw new Error('NOT_MODIFIED');
    }

    // Parse feed
    const parsed = await this.parseFeed(data);
    if (!parsed) {
      throw new Error('Failed to parse feed XML');
    }

    // Format feed
    const result = this.formatFeed(parsed, url, options?.useDescriptionAsContent);

    // Add cache headers if available
    if (etag) result.etag = etag;
    if (lastModified) result.lastModified = lastModified;

    return result;
  }

  /** Extracts a printable message from any thrown value. */
  private static errorMessage(error: unknown): string {
    if (error instanceof Error) {
      return error.message;
    }
    if (typeof error === 'object' && error !== null && 'message' in error) {
      return String((error as { message: unknown }).message);
    }
    return String(error);
  }

  /** Returns the status and headers of a response attached to a thrown error, if any. */
  private static attachedResponse(
    error: unknown
  ): { status: unknown; headers: Record<string, unknown> } | null {
    if (typeof error !== 'object' || error === null) {
      return null;
    }
    const response = (error as { response?: unknown }).response;
    if (typeof response !== 'object' || response === null) {
      return null;
    }
    const { status, headers } = response as { status?: unknown; headers?: unknown };
    return {
      status,
      headers: typeof headers === 'object' && headers !== null
        ? (headers as Record<string, unknown>)
        : {},
    };
  }

  /** Reads one header as a string; anything else counts as absent. */
  private static headerValue(headers: Record<string, unknown>, name: string): string | undefined {
    const value = headers[name];
    return typeof value === 'string' ? value : undefined;
  }

  /** Reads `source[key]` as a non-empty identifier string, or `null` when missing or unusable. */
  private static readIdentifier(source: unknown, key: string): string | null {
    if (typeof source !== 'object' || source === null || !(key in source)) {
      return null;
    }
    const value = (source as Record<string, unknown>)[key];
    if (typeof value === 'number') {
      return String(value);
    }
    return typeof value === 'string' && value !== '' ? value : null;
  }

  /** Flattens categories to strings: plain strings as-is, objects via `label` then `term`; empties are dropped. */
  private static categoryNames(categories: unknown): string[] {
    if (!Array.isArray(categories)) {
      return [];
    }
    const names: string[] = [];
    for (const category of categories as unknown[]) {
      if (typeof category === 'string') {
        if (category !== '') names.push(category);
        continue;
      }
      if (typeof category !== 'object' || category === null) {
        continue;
      }
      const { label, term } = category as { label?: unknown; term?: unknown };
      if (typeof label === 'string' && label !== '') {
        names.push(label);
      } else if (typeof term === 'string' && term !== '') {
        names.push(term);
      }
    }
    return names;
  }

  /** Parses an enclosure length to an integer; missing or non-numeric input gives `null`, never `NaN`. */
  private static parseByteLength(raw: unknown): number | null {
    if (raw === undefined || raw === null || raw === '') {
      return null;
    }
    const parsed = parseInt(String(raw), 10);
    return Number.isNaN(parsed) ? null : parsed;
  }
}
// Shared module-level RSSReader instance. The class is exported as well, so
// callers may also construct their own.
export const rssReader = new RSSReader();