// index.ts (5 kB)
import {
	ErrorType,
	ProcessingProvider,
	ProcessingResult,
	ProviderError,
} from '../../../common/types.js';
import {
	retry_with_backoff,
	validate_api_key,
} from '../../../common/utils.js';
import { http_json } from '../../../common/http.js';
import { config } from '../../../config/env.js';
/**
 * JSON body this module POSTs to the Exa `/findSimilar` endpoint.
 *
 * Only `url`, `numResults` and `contents` are populated by the provider in
 * this file; the remaining fields are declared but never set here.
 */
interface ExaSimilarRequest {
	/** The page for which similar pages are requested. */
	url: string;
	/** How many results to ask for (the provider sends 10 or 15). */
	numResults?: number;
	/** Per-result content options forwarded to Exa. */
	contents?: {
		/** Text retrieval options; `maxCharacters` is sent as 1500 or 3000. */
		text?: {
			maxCharacters?: number;
		};
		/** Sent as `true` only for the `advanced` extract depth. */
		highlights?: boolean;
		/** Sent as `true` only for the `advanced` extract depth. */
		summary?: boolean;
		/**
		 * Sent as `'preferred'` (advanced) or `'fallback'` (basic).
		 * NOTE(review): exact crawl semantics are defined by Exa — confirm
		 * against the API reference.
		 */
		livecrawl?: 'always' | 'fallback' | 'preferred';
	};
	/** Not set in this file; presumably a domain allow-list — TODO confirm. */
	includeDomains?: Array<string>;
	/** Not set in this file; presumably a domain deny-list — TODO confirm. */
	excludeDomains?: Array<string>;
}
/**
 * One entry of the `results` array returned by `/findSimilar`, as consumed
 * by the provider in this file.
 */
interface ExaSimilarResult {
	/** Result identifier; not read anywhere in this file. */
	id: string;
	/** Rendered as the markdown section heading for the result. */
	title: string;
	/** Address of the similar page; echoed in the report and in `raw_contents`. */
	url: string;
	/** Page text; preferred over `summary` as the result's content. */
	text?: string;
	/** Rendered as a bullet list under "Key Highlights" when non-empty. */
	highlights?: Array<string>;
	/** Used as content when `text` is absent; shown alongside `text` when both exist. */
	summary?: string;
	/** Printed verbatim after "Published:"; format is whatever Exa returns. */
	publishedDate?: string;
	/** Printed verbatim after "Author:". */
	author?: string;
	/** Printed with three decimals after "Similarity Score:". */
	score?: number;
}
/**
 * Shape the provider expects back from the `/findSimilar` call.
 */
interface ExaSimilarResponse {
	/** Identifier of the request; copied into the result metadata. */
	requestId: string;
	/** The similar pages, in the order returned by the API. */
	results: Array<ExaSimilarResult>;
}
/** Maximum number of characters of page content shown per result in the report. */
const PREVIEW_MAX_CHARS = 500;

/** Text used as a result's content when it carries neither `text` nor `summary`. */
const NO_CONTENT_PLACEHOLDER = 'No content available';

/**
 * Counts whitespace-separated words in `text`.
 *
 * The text is trimmed first so that leading/trailing whitespace does not
 * produce empty tokens that would inflate the count; blank input yields 0.
 */
function count_words(text: string): number {
	const trimmed = text.trim();
	return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
}

/** Cuts `text` to `PREVIEW_MAX_CHARS` characters, appending `...` when shortened. */
function truncate_preview(text: string): string {
	return text.length > PREVIEW_MAX_CHARS
		? `${text.substring(0, PREVIEW_MAX_CHARS)}...`
		: text;
}

/**
 * Renders a single similar-page result as one markdown section.
 *
 * @param result - The result as returned by the API.
 * @param content - The content chosen for the result (text, summary, or the
 * placeholder); used as the body when `text` and `summary` are not both present.
 * @returns The markdown for the section, terminated by a `---` rule.
 */
function format_similar_result(
	result: ExaSimilarResult,
	content: string,
): string {
	const parts: string[] = [`## ${result.title}\n\n`];

	if (result.author) {
		parts.push(`**Author:** ${result.author}\n`);
	}
	if (result.publishedDate) {
		parts.push(`**Published:** ${result.publishedDate}\n`);
	}
	// Type check instead of truthiness: a score of exactly 0 is still a score.
	if (typeof result.score === 'number') {
		parts.push(`**Similarity Score:** ${result.score.toFixed(3)}\n`);
	}
	parts.push(`**URL:** ${result.url}\n\n`);

	if (result.highlights && result.highlights.length > 0) {
		parts.push('**Key Highlights:**\n');
		for (const highlight of result.highlights) {
			parts.push(`- ${highlight}\n`);
		}
		parts.push('\n');
	}

	if (result.summary && result.text) {
		// Both available: show the summary in full and only a preview of the text.
		parts.push(`**Summary:** ${result.summary}\n\n`);
		parts.push(
			`**Content Preview:**\n${truncate_preview(result.text)}\n\n`,
		);
	} else {
		parts.push(`${truncate_preview(content)}\n\n`);
	}

	parts.push('---\n\n');
	return parts.join('');
}

/**
 * Processing provider that asks Exa's `/findSimilar` endpoint for pages
 * similar to a given URL and renders them as a single markdown report.
 */
export class ExaSimilarProvider implements ProcessingProvider {
	name = 'exa_similar';
	description =
		'Find web pages semantically similar to a given URL using Exa';

	/**
	 * Finds pages similar to `url` and returns them as a combined report plus
	 * the per-page content.
	 *
	 * @param url - The page to find similar pages for. When an array is given
	 * only its first element is used; the rest are ignored.
	 * @param extract_depth - `'advanced'` requests 15 results with up to 3000
	 * characters of text, highlights, summaries and `livecrawl: 'preferred'`;
	 * `'basic'` requests 10 results with up to 1500 characters and
	 * `livecrawl: 'fallback'`.
	 * @returns The markdown report in `content`, one `raw_contents` entry per
	 * result, and metadata. `word_count` and `successful_extractions` cover
	 * only results that actually carried text or a summary.
	 * @throws ProviderError with `INVALID_INPUT` when no URL is supplied or it
	 * cannot be parsed by `new URL`, and with `API_ERROR` when the request
	 * fails or the response has no `results` array. Errors from
	 * `validate_api_key` propagate unchanged.
	 */
	async process_content(
		url: string | string[],
		extract_depth: 'basic' | 'advanced' = 'basic',
	): Promise<ProcessingResult> {
		const api_key = validate_api_key(
			config.processing.exa_similar.api_key,
			this.name,
		);

		// This provider only accepts a single URL
		const target_url = Array.isArray(url) ? url[0] : url;
		if (!target_url) {
			throw new ProviderError(
				ErrorType.INVALID_INPUT,
				'A URL must be provided',
				this.name,
			);
		}

		// Validate URL format
		try {
			new URL(target_url);
		} catch {
			throw new ProviderError(
				ErrorType.INVALID_INPUT,
				'Invalid URL format',
				this.name,
			);
		}

		const is_advanced = extract_depth === 'advanced';

		const process_request = async () => {
			try {
				const request_body: ExaSimilarRequest = {
					url: target_url,
					numResults: is_advanced ? 15 : 10,
					contents: {
						text: { maxCharacters: is_advanced ? 3000 : 1500 },
						highlights: is_advanced,
						summary: is_advanced,
						livecrawl: is_advanced ? 'preferred' : 'fallback',
					},
				};

				const data = await http_json<ExaSimilarResponse>(
					this.name,
					`${config.processing.exa_similar.base_url}/findSimilar`,
					{
						method: 'POST',
						// The key is sent in both header forms.
						// NOTE(review): presumably for compatibility — confirm
						// which one the API actually requires.
						headers: {
							'x-api-key': api_key,
							Authorization: `Bearer ${api_key}`,
							'Content-Type': 'application/json',
						},
						body: JSON.stringify(request_body),
					},
				);

				// Fail with a clear message instead of a TypeError further down.
				if (!data || !Array.isArray(data.results)) {
					throw new ProviderError(
						ErrorType.API_ERROR,
						'Unexpected response from Exa: missing results array',
						this.name,
					);
				}

				const sections: string[] = [
					`# Similar Pages to ${target_url}\n\n`,
					`Found ${data.results.length} similar pages:\n\n`,
				];
				const raw_contents: Array<{ url: string; content: string }> =
					[];
				let total_word_count = 0;
				let extracted_count = 0;

				for (const result of data.results) {
					// `||` on purpose: an empty `text` should fall through to `summary`.
					const extracted = result.text || result.summary || '';
					const content = extracted || NO_CONTENT_PLACEHOLDER;

					// Only real content counts; the placeholder is not page text.
					if (extracted) {
						extracted_count += 1;
						total_word_count += count_words(extracted);
					}

					sections.push(format_similar_result(result, content));
					raw_contents.push({ url: result.url, content });
				}

				return {
					content: sections.join(''),
					raw_contents,
					metadata: {
						title: `Similar pages to ${target_url}`,
						word_count: total_word_count,
						urls_processed: data.results.length,
						successful_extractions: extracted_count,
						extract_depth,
						original_url: target_url,
						requestId: data.requestId,
					},
					source_provider: this.name,
				};
			} catch (error) {
				// Provider errors already carry a type and message; pass them through.
				if (error instanceof ProviderError) {
					throw error;
				}
				throw new ProviderError(
					ErrorType.API_ERROR,
					`Failed to find similar pages: ${
						error instanceof Error ? error.message : 'Unknown error'
					}`,
					this.name,
				);
			}
		};

		return retry_with_backoff(process_request);
	}
}