/**
* Firecrawl Service
*
 * Service for crawling Hedera documentation using the Firecrawl API.
 * Fetching, retries, and rate limiting are delegated to Firecrawl itself;
 * this service handles progress tracking and error collection.
*/
import FirecrawlApp from '@mendable/firecrawl-js';
import { Document, DocumentContentType } from '../types/rag.js';
import { FIRECRAWL_CONFIG } from '../config/rag.js';
import { logger } from '../utils/logger.js';
/**
* Crawl options
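 *
 * @example
 * // Illustrative options object; the values below are placeholders, not defaults:
 * const options: CrawlOptions = {
 *   maxPages: 50,
 *   excludePatterns: ['/blog/*'],
 *   maxDepth: 3,
 *   onProgress: (current, total, url) => console.log(`${current}/${total} ${url}`),
 * };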
*/
export interface CrawlOptions {
/** Maximum pages to crawl */
maxPages?: number;
/** URL patterns to exclude */
excludePatterns?: string[];
/** URL patterns to include (for selective crawling) */
includePatterns?: string[];
/** Progress callback */
onProgress?: (current: number, total: number, url: string) => void;
  /** Include subdomains (currently not forwarded to the crawler) */
includeSubdomains?: boolean;
/** Maximum depth */
maxDepth?: number;
}
/**
* Crawl result
*/
export interface CrawlResult {
/** Crawled documents */
documents: Document[];
/** Total pages crawled */
totalPages: number;
/** Failed URLs */
failedUrls: string[];
/** Errors encountered */
errors: string[];
}
/**
* Firecrawl Service
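 *
 * @example
 * // Illustrative setup; the env var name and self-hosted port are assumptions:
 * const cloud = new FirecrawlService(process.env.FIRECRAWL_API_KEY ?? '');
 * const local = new FirecrawlService('http://localhost:3002');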
*/
export class FirecrawlService {
private firecrawl: FirecrawlApp;
constructor(apiKeyOrUrl: string) {
    // A URL means a self-hosted Firecrawl instance; anything else is treated as a cloud API key
    const isSelfHosted = apiKeyOrUrl.startsWith('http://') || apiKeyOrUrl.startsWith('https://');
    if (isSelfHosted) {
      // Self-hosted Firecrawl instance
      this.firecrawl = new FirecrawlApp({
        apiUrl: apiKeyOrUrl
      });
      logger.info('FirecrawlService initialized (self-hosted)', { url: apiKeyOrUrl });
} else {
// Cloud Firecrawl with API key
this.firecrawl = new FirecrawlApp({
apiKey: apiKeyOrUrl
});
logger.info('FirecrawlService initialized (cloud)');
}
}
/**
* Crawl Hedera documentation
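   *
   * @example
   * // Illustrative call (assumes `service` is a constructed FirecrawlService;
   * // option values are placeholders):
   * const result = await service.crawlHederaDocs({
   *   maxPages: 100,
   *   onProgress: (current, total, url) =>
   *     logger.info('Crawl progress', { current, total, url }),
   * });
   * logger.info('Crawl finished', {
   *   pages: result.totalPages,
   *   failed: result.failedUrls.length,
   * });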
*/
async crawlHederaDocs(options: CrawlOptions = {}): Promise<CrawlResult> {
const maxPages = options.maxPages || FIRECRAWL_CONFIG.maxPages;
const excludePatterns = options.excludePatterns || [...FIRECRAWL_CONFIG.excludePatterns];
    // The cast is needed because includePatterns is optional in the config type
    const includePatterns = options.includePatterns || (FIRECRAWL_CONFIG as any).includePatterns || [];
logger.info('Starting Hedera documentation crawl', {
maxPages,
excludePatterns,
includePatterns,
baseUrls: FIRECRAWL_CONFIG.baseUrls,
});
const documents: Document[] = [];
const failedUrls: string[] = [];
const errors: string[] = [];
    let pagesCrawled = 0;
try {
// Crawl each base URL
for (const baseUrl of FIRECRAWL_CONFIG.baseUrls) {
try {
const crawlResult = await this.crawlWebsite(baseUrl, {
...options,
            maxPages: maxPages - pagesCrawled,
excludePatterns,
includePatterns,
});
documents.push(...crawlResult.documents);
failedUrls.push(...crawlResult.failedUrls);
errors.push(...crawlResult.errors);
          pagesCrawled += crawlResult.documents.length;
logger.info('Crawl completed for base URL', {
baseUrl,
documentsFound: crawlResult.documents.length,
totalDocuments: documents.length,
});
// Stop if we reached max pages
          if (pagesCrawled >= maxPages) {
            logger.info('Maximum pages reached', { maxPages, pagesCrawled });
break;
}
} catch (error: any) {
const errorMsg = `Failed to crawl ${baseUrl}: ${error.message}`;
logger.error(errorMsg, { error: error.message });
errors.push(errorMsg);
failedUrls.push(baseUrl);
}
}
logger.info('Hedera documentation crawl completed', {
totalDocuments: documents.length,
failedUrls: failedUrls.length,
errors: errors.length,
});
return {
documents,
totalPages: documents.length,
failedUrls,
errors,
};
} catch (error: any) {
logger.error('Crawl failed', { error: error.message });
throw new Error(`Hedera docs crawl failed: ${error.message}`);
}
}
/**
* Crawl a single website
*/
private async crawlWebsite(url: string, options: CrawlOptions): Promise<CrawlResult> {
const documents: Document[] = [];
const failedUrls: string[] = [];
const errors: string[] = [];
try {
logger.info('Crawling website', { url, options });
// Check if this is a GitHub repository URL
const isGitHub = url.includes('github.com');
// Prepare crawl parameters
const crawlParams: any = {
limit: options.maxPages || 100,
scrapeOptions: {
formats: ['markdown', 'html'],
onlyMainContent: true,
waitFor: 2000, // Wait for JavaScript to render
},
};
      // Add exclude patterns
      if (options.excludePatterns && options.excludePatterns.length > 0) {
        crawlParams.excludePaths = options.excludePatterns;
      }
      // Honor a caller-supplied maximum crawl depth
      if (options.maxDepth !== undefined) {
        crawlParams.maxDepth = options.maxDepth;
      }
// For GitHub repos, add include patterns to focus on docs/examples
if (isGitHub && options.includePatterns && options.includePatterns.length > 0) {
// GitHub-specific: focus on documentation paths
crawlParams.includePaths = options.includePatterns;
        // Limit depth to avoid crawling too deep into code, unless the caller overrides it
        crawlParams.maxDepth = options.maxDepth ?? 4;
logger.info('GitHub repo detected, applying include patterns', {
url,
includePatterns: options.includePatterns,
});
}
// Start crawl
const crawlResponse = await this.firecrawl.crawl(url, crawlParams) as any;
      // Treat the crawl as failed only if it neither completed nor returned any data
if (crawlResponse.status !== 'completed' && !crawlResponse.data) {
throw new Error(`Crawl failed: ${crawlResponse.error || 'Status: ' + crawlResponse.status}`);
}
// Process crawled pages
const pages = crawlResponse.data || [];
for (let i = 0; i < pages.length; i++) {
const page = pages[i];
try {
// Report progress
if (options.onProgress) {
options.onProgress(i + 1, pages.length, page.url || url);
}
// Extract document from page
const document = this.extractDocument(page, i);
documents.push(document);
logger.debug('Page processed', {
url: document.url,
title: document.title,
contentLength: document.content.length,
});
} catch (error: any) {
const errorMsg = `Failed to process page ${page.url}: ${error.message}`;
logger.warn(errorMsg);
errors.push(errorMsg);
failedUrls.push(page.url || `${url}#${i}`);
}
}
logger.info('Website crawl completed', {
url,
documentsFound: documents.length,
failed: failedUrls.length,
});
return {
documents,
totalPages: documents.length,
failedUrls,
errors,
};
} catch (error: any) {
logger.error('Website crawl failed', { url, error: error.message });
throw error;
}
}
/**
* Extract document from crawled page
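   *
   * @example
   * // Sketch of a typical Firecrawl page object (the exact shape is an
   * // assumption; only markdown/html and metadata.title/description are used):
   * // {
   * //   url: 'https://docs.hedera.com/tutorials/create-account',
   * //   markdown: '# Create an Account\n...',
   * //   metadata: { title: 'Create an Account', description: '...' }
   * // }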
*/
private extractDocument(page: any, index: number): Document {
const url = page.url || page.metadata?.url || `unknown-${index}`;
const title = page.metadata?.title || this.extractTitleFromUrl(url);
const content = page.markdown || page.html || '';
// Classify content type from URL
const contentType = this.classifyContentType(url);
// Extract metadata
const metadata = {
url,
title,
description: page.metadata?.description || '',
contentType,
tags: this.extractTags(url, content),
language: this.detectLanguage(content, url),
crawledAt: new Date().toISOString(),
updatedAt: page.metadata?.modifiedTime || page.metadata?.publishedTime,
};
const document: Document = {
id: this.generateDocumentId(url),
url,
title,
content,
metadata,
};
return document;
}
/**
* Generate unique document ID from URL
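   *
   * @example
   * // generateDocumentId('https://docs.hedera.com/sdks/tokens/')
   * // => 'doc-docs-hedera-com-sdks-tokens'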
*/
private generateDocumentId(url: string): string {
// Use URL as base, normalize and hash
const normalized = url
.replace(/^https?:\/\//, '')
.replace(/\/$/, '')
.replace(/[^a-zA-Z0-9-_]/g, '-');
return `doc-${normalized}`;
}
/**
* Extract title from URL if not available
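   *
   * @example
   * // extractTitleFromUrl('https://docs.hedera.com/getting-started/create-account')
   * // => 'Create Account'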
*/
private extractTitleFromUrl(url: string): string {
try {
const pathname = new URL(url).pathname;
const parts = pathname.split('/').filter(Boolean);
const lastPart = parts[parts.length - 1] || 'index';
// Convert kebab-case to Title Case
return lastPart
.replace(/-/g, ' ')
.replace(/\b\w/g, char => char.toUpperCase());
} catch {
return 'Untitled Document';
}
}
/**
* Classify content type from URL
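   *
   * @example
   * // classifyContentType('https://docs.hedera.com/tutorials/token-service') => 'tutorial'
   * // classifyContentType('https://github.com/hashgraph/hedera-sdk-js/tree/main/examples/') => 'example'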
*/
private classifyContentType(url: string): DocumentContentType {
const urlLower = url.toLowerCase();
// GitHub repository patterns
if (urlLower.includes('github.com')) {
if (urlLower.includes('/examples/') || urlLower.includes('/example-')) {
return 'example';
}
if (urlLower.includes('/docs/') || urlLower.includes('/manual/') || urlLower.includes('readme')) {
return 'guide';
}
if (urlLower.includes('contributing') || urlLower.includes('migration') || urlLower.includes('changelog')) {
return 'guide';
}
}
if (urlLower.includes('/tutorial') || urlLower.includes('/getting-started')) {
return 'tutorial';
}
if (urlLower.includes('/api') || urlLower.includes('/reference')) {
return 'api';
}
if (urlLower.includes('/example') || urlLower.includes('/sample')) {
return 'example';
}
if (urlLower.includes('/guide')) {
return 'guide';
}
if (urlLower.includes('/concept') || urlLower.includes('/learn')) {
return 'concept';
}
// Default to concept
return 'concept';
}
/**
* Extract tags from URL and content
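   *
   * @example
   * // extractTags('https://docs.hedera.com/sdks/tokens', 'Create a token on testnet')
   * // => ['sdks', 'tokens', 'token', 'testnet']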
*/
private extractTags(url: string, content: string): string[] {
const tags: Set<string> = new Set();
    // Extract from URL path, stripping the protocol so 'https:' is never tagged
    const urlParts = url.replace(/^https?:\/\//, '').split('/').filter(Boolean);
urlParts.forEach(part => {
if (part.length > 3 && !part.includes('.')) {
tags.add(part.toLowerCase());
}
});
// Extract common Hedera terms from content
const hederaTerms = [
'account',
'token',
'smart contract',
'consensus',
'topic',
'transaction',
'hbar',
'nft',
'sdk',
'hashgraph',
'hedera',
'testnet',
'mainnet',
];
const contentLower = content.toLowerCase();
hederaTerms.forEach(term => {
if (contentLower.includes(term)) {
tags.add(term.replace(/\s+/g, '-'));
}
});
return Array.from(tags).slice(0, 10); // Limit to 10 tags
}
/**
* Detect programming language from content and URL
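   *
   * @example
   * // Content-based detection via fenced code blocks:
   * // detectLanguage('```java\nClient client = Client.forTestnet();\n```') => 'java'
   * // URL-based detection via SDK repo name:
   * // detectLanguage('', 'https://github.com/hashgraph/hedera-sdk-go') => 'go'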
*/
private detectLanguage(content: string, url?: string): 'javascript' | 'typescript' | 'java' | 'python' | 'go' | 'solidity' | undefined {
// First check URL for SDK repository patterns
if (url) {
const urlLower = url.toLowerCase();
      // Check which SDK repository this is from, or match a file extension at
      // the end of the path (anchored so '.js' cannot match '.json' and '.go'
      // cannot match a domain like '.google.com')
      if (urlLower.includes('hedera-sdk-js') || /\.[jt]sx?($|[?#])/.test(urlLower)) {
        return /\.tsx?($|[?#])/.test(urlLower) ? 'typescript' : 'javascript';
      }
      if (urlLower.includes('hedera-sdk-java') || /\.java($|[?#])/.test(urlLower)) {
        return 'java';
      }
      if (urlLower.includes('hedera-sdk-go') || /\.go($|[?#])/.test(urlLower)) {
        return 'go';
      }
      if (urlLower.includes('hiero-sdk-python') || urlLower.includes('hedera-sdk-python') || /\.py($|[?#])/.test(urlLower)) {
        return 'python';
      }
      if (urlLower.includes('hedera-sdk-rust') || /\.rs($|[?#])/.test(urlLower)) {
        // Rust is not in the language union; it is surfaced through tags instead
        return undefined;
      }
      if (/\.sol($|[?#])/.test(urlLower)) {
        return 'solidity';
      }
}
// Then check content for code block markers
const languagePatterns: Array<[RegExp, 'javascript' | 'typescript' | 'java' | 'python' | 'go' | 'solidity']> = [
[/```typescript|```ts\b/i, 'typescript'],
[/```javascript|```js\b/i, 'javascript'],
[/```java\b/i, 'java'],
[/```python|```py\b/i, 'python'],
[/```go\b/i, 'go'],
[/```solidity|```sol\b/i, 'solidity'],
];
for (const [pattern, language] of languagePatterns) {
if (pattern.test(content)) {
return language;
}
}
return undefined;
}
/**
* Scrape a single page (not a full crawl)
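   *
   * @example
   * // Illustrative single-page scrape (assumes `service` is a constructed
   * // FirecrawlService; the URL is a placeholder):
   * const doc = await service.scrapePage('https://docs.hedera.com/core-concepts');
   * logger.info('Scraped', { title: doc.title, length: doc.content.length });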
*/
async scrapePage(url: string): Promise<Document> {
try {
logger.info('Scraping single page', { url });
const scrapeResult = await this.firecrawl.scrape(url, {
formats: ['markdown', 'html'],
onlyMainContent: true,
waitFor: 2000,
}) as any;
// Scrape returns { markdown, metadata } directly
if (!scrapeResult.markdown && !scrapeResult.html) {
throw new Error(`Scrape failed: No content returned`);
}
const document = this.extractDocument(scrapeResult, 0);
logger.info('Page scraped successfully', {
url: document.url,
title: document.title,
contentLength: document.content.length,
});
return document;
} catch (error: any) {
logger.error('Page scrape failed', { url, error: error.message });
throw new Error(`Failed to scrape ${url}: ${error.message}`);
}
}
/**
* Check API credits (if available)
*/
async checkCredits(): Promise<{ remaining?: number; total?: number } | null> {
try {
// Firecrawl API doesn't expose credits endpoint in free tier
// This is a placeholder for future implementation
logger.info('Credit check not available in free tier');
return null;
} catch (error: any) {
logger.warn('Failed to check credits', { error: error.message });
return null;
}
}
}