// index.ts
import { http_json } from '../../../common/http.js';
import {
	ErrorType,
	ProcessingProvider,
	ProcessingResult,
	ProviderError,
} from '../../../common/types.js';
import {
	is_valid_url,
	retry_with_backoff,
	validate_api_key,
} from '../../../common/utils.js';
import { config } from '../../../config/env.js';
/**
 * Shape of the JSON body this provider expects back from the Firecrawl
 * endpoint it POSTs to.
 *
 * Only `success`, `error`, `data.markdown`, `data.html`, `data.rawHtml` and
 * `data.screenshot` are read in this file; the remaining fields are declared
 * for completeness and are presumably mirrored from Firecrawl's response
 * schema (verify against the Firecrawl API reference).
 */
interface FirecrawlActionsResponse {
	/** The provider treats a falsy value as a failed request. */
	success: boolean;
	/** Extraction payload; the provider raises an error when it is absent. */
	data?: {
		/** Preferred content representation. */
		markdown?: string;
		/** Used as content when `markdown` is empty or missing. */
		html?: string;
		/** Used as content when both `markdown` and `html` are empty or missing. */
		rawHtml?: string;
		/** Passed through unchanged into the result metadata. */
		screenshot?: string;
		actions?: {
			screenshots?: string[];
		};
		metadata?: {
			title?: string;
			description?: string;
			language?: string;
			sourceURL?: string;
			statusCode?: number;
			error?: string;
			// `unknown` rather than `any`: extra keys must be narrowed before
			// use instead of silently switching off type checking.
			[key: string]: unknown;
		};
	};
	/** Error text; a truthy value makes the provider fail the request. */
	error?: string;
}
/** The kinds of page interaction an {@link Action} can describe. */
type ActionType =
	| 'click'
	| 'type'
	| 'scroll'
	| 'wait'
	| 'select';
/**
 * Internal description of a single page interaction. Every field other than
 * `type` is optional; which ones are read depends on the action kind.
 */
interface Action {
	/** Discriminates how the remaining fields are interpreted. */
	type: ActionType;

	/** Target element; forwarded for `wait`, `click`, `type` and `select`. */
	selector?: string;

	/** Pointer coordinates forwarded with `click` actions. */
	x?: number;
	y?: number;

	/** Text to enter for `type` actions. */
	text?: string;

	/** Option value for `select` actions. */
	value?: string;

	/** Milliseconds; sent as the wait length for `wait` actions. */
	duration?: number;
}
/**
 * Processing provider that asks Firecrawl to run a fixed script of page
 * interactions (wait / scroll / click) before extracting the page content.
 *
 * The interaction script is chosen by `extract_depth`; callers cannot supply
 * their own actions through this interface.
 */
export class FirecrawlActionsProvider implements ProcessingProvider {
	name = 'firecrawl_actions';
	description =
		'Support for page interactions (clicking, scrolling, etc.) before extraction for dynamic content using Firecrawl. Enables extraction from JavaScript-heavy sites, single-page applications, and content behind user interactions. Best for accessing content that requires navigation, form filling, or other interactions.';

	/**
	 * Runs the interaction script against one page and returns its content,
	 * prefixed with a human-readable list of the actions that were requested.
	 *
	 * @param url Page to process. When an array is given only its first
	 *   entry is used; any further entries are ignored.
	 * @param extract_depth `'basic'` waits and scrolls once; `'advanced'`
	 *   scrolls twice and additionally attempts to click "Read more" /
	 *   "Show more" controls.
	 * @returns The extracted content (markdown preferred, then html, then
	 *   rawHtml) with the action summary prepended, plus metadata including
	 *   the screenshot returned by the API.
	 * @throws ProviderError `INVALID_INPUT` for a missing or invalid URL;
	 *   `PROVIDER_ERROR` when the API reports failure or returns no content;
	 *   `API_ERROR` wrapping any other failure of the request. Errors thrown
	 *   by `validate_api_key` propagate unchanged.
	 */
	async process_content(
		url: string | string[],
		extract_depth: 'basic' | 'advanced' = 'basic',
	): Promise<ProcessingResult> {
		// Actions works with a single URL
		const actions_url = Array.isArray(url) ? url[0] : url;

		// The falsy check covers an empty array (url[0] is undefined) so the
		// validator is never handed a non-string value.
		if (!actions_url || !is_valid_url(actions_url)) {
			throw new ProviderError(
				ErrorType.INVALID_INPUT,
				`Invalid URL provided: ${actions_url}`,
				this.name,
			);
		}

		// Validate the key once, before the retry wrapper: the check reads
		// only static configuration, so repeating it on every attempt cannot
		// change its outcome and would only delay the error.
		const api_key = validate_api_key(
			config.processing.firecrawl_actions.api_key,
			this.name,
		);
		const actions = this.build_actions(extract_depth);

		const actions_request = async () => {
			try {
				const actions_data =
					await http_json<FirecrawlActionsResponse>(
						this.name,
						config.processing.firecrawl_actions.base_url,
						{
							method: 'POST',
							headers: {
								Authorization: `Bearer ${api_key}`,
								'Content-Type': 'application/json',
							},
							body: JSON.stringify({
								url: actions_url,
								formats: ['markdown', 'screenshot'],
								actions: actions.map((action) =>
									this.to_firecrawl_action(action),
								),
							}),
							// A fresh signal per attempt, so each retry gets
							// the full configured timeout.
							signal: AbortSignal.timeout(
								config.processing.firecrawl_actions.timeout,
							),
						},
					);

				// Check if there was an error in the response
				if (!actions_data.success || actions_data.error) {
					throw new ProviderError(
						ErrorType.PROVIDER_ERROR,
						`Error performing actions: ${actions_data.error || 'Unknown error'}`,
						this.name,
					);
				}

				const data = actions_data.data;
				if (!data) {
					throw new ProviderError(
						ErrorType.PROVIDER_ERROR,
						'No data returned from API',
						this.name,
					);
				}

				// Prefer markdown, fallback to HTML, then rawHtml
				const content = data.markdown || data.html || data.rawHtml;
				if (!content) {
					throw new ProviderError(
						ErrorType.PROVIDER_ERROR,
						'No content extracted after performing actions',
						this.name,
					);
				}

				// Prepend a summary of the requested actions to the content
				const actions_description =
					`# Content from ${actions_url} after interactions\n\n` +
					`The following actions were performed before extraction:\n\n` +
					actions
						.map((action, index) =>
							this.describe_action(action, index),
						)
						.join('\n') +
					'\n\n---\n\n' +
					content;

				// The word count covers the summary header as well as the
				// extracted content.
				const word_count = actions_description
					.split(/\s+/)
					.filter(Boolean).length;

				return {
					content: actions_description,
					raw_contents: [
						{
							url: actions_url,
							content: actions_description,
						},
					],
					metadata: {
						title: `Content from ${actions_url} after interactions`,
						word_count,
						urls_processed: 1,
						successful_extractions: 1,
						extract_depth,
						screenshot: data.screenshot,
					},
					source_provider: this.name,
				};
			} catch (error) {
				// Provider errors already carry a type; wrap everything else.
				if (error instanceof ProviderError) {
					throw error;
				}
				throw new ProviderError(
					ErrorType.API_ERROR,
					`Failed to perform actions: ${
						error instanceof Error ? error.message : 'Unknown error'
					}`,
					this.name,
				);
			}
		};

		return retry_with_backoff(actions_request);
	}

	/** Returns the fixed interaction script for the requested depth. */
	private build_actions(extract_depth: 'basic' | 'advanced'): Action[] {
		if (extract_depth === 'advanced') {
			return [
				{ type: 'wait', duration: 2000 }, // Wait for initial page load
				{ type: 'scroll', duration: 1000 }, // Scroll down
				{ type: 'wait', duration: 1000 }, // Wait for content to load
				{ type: 'scroll', duration: 1000 }, // Scroll down more
				{ type: 'wait', duration: 1000 }, // Wait for content to load
				// Click on "Read more" or "Show more" buttons if they exist.
				// NOTE(review): `:contains()` is jQuery syntax, not standard
				// CSS; confirm Firecrawl's selector engine accepts it.
				{
					type: 'click',
					selector:
						'button:contains("Read more"), button:contains("Show more"), a:contains("Read more"), a:contains("Show more")',
				},
				{ type: 'wait', duration: 2000 }, // Wait for content to expand
			];
		}
		return [
			{ type: 'wait', duration: 2000 }, // Wait for initial page load
			{ type: 'scroll', duration: 1000 }, // Scroll down once
			{ type: 'wait', duration: 1000 }, // Wait for content to load
		];
	}

	/**
	 * Converts an internal {@link Action} into the object placed in the
	 * request's `actions` array. Fields left `undefined` are dropped by
	 * `JSON.stringify`.
	 */
	private to_firecrawl_action(action: Action): object {
		switch (action.type) {
			case 'wait':
				return {
					type: 'wait',
					// `||` (not `??`) so a zero duration also falls back to
					// the 1000 ms default.
					milliseconds: action.duration || 1000,
					selector: action.selector,
				};
			case 'scroll':
				// NOTE(review): `duration` is not forwarded for scrolls;
				// only the action type is sent. Adjust if Firecrawl's scroll
				// action accepts further parameters.
				return {
					type: 'scroll',
				};
			case 'click':
				return {
					type: 'click',
					selector: action.selector,
					x: action.x,
					y: action.y,
				};
			case 'type':
				return {
					type: 'type',
					selector: action.selector,
					text: action.text || '',
				};
			case 'select':
				return {
					type: 'select',
					selector: action.selector,
					value: action.value || '',
				};
			default:
				// Any other action kind is passed through untouched.
				return action;
		}
	}

	/**
	 * Renders one numbered, human-readable line describing an action for the
	 * summary header.
	 *
	 * @param index Zero-based position; the rendered number is `index + 1`.
	 */
	private describe_action(action: Action, index: number): string {
		switch (action.type) {
			case 'click':
				return `${index + 1}. Click on ${action.selector || `coordinates (${action.x}, ${action.y})`}`;
			case 'type':
				return `${index + 1}. Type "${action.text}" ${action.selector ? `into ${action.selector}` : ''}`;
			case 'scroll':
				return `${index + 1}. Scroll ${action.duration ? `for ${action.duration}ms` : ''}`;
			case 'wait':
				return `${index + 1}. Wait ${action.duration ? `for ${action.duration}ms` : ''}`;
			case 'select':
				return `${index + 1}. Select "${action.value}" from ${action.selector}`;
			default:
				return `${index + 1}. Perform ${action.type} action`;
		}
	}
}