import { createHash } from 'crypto';
import { promises as fs } from 'fs';
import { join, dirname } from 'path';
import { logger } from '@/utils/logger.js';
import { extractDomain } from '@/utils/text.js';
import type { CrawlResult } from './types.js';
/**
 * One cached crawl result together with the bookkeeping needed to expire it.
 * This is the shape FileCache keeps in memory and serializes to JSON on disk.
 */
export interface CacheEntry {
/** URL the result was stored under (the `url` argument of `FileCache.set`). */
url: string;
/** The cached crawl result, returned as-is on a cache hit. */
result: CrawlResult;
/** Creation time as produced by `Date.now()` (milliseconds since the Unix epoch). */
timestamp: number;
/** Lifetime in milliseconds; the entry is expired once `Date.now() - timestamp > ttl`. */
ttl: number;
}
/** Summary figures reported by `FileCache.getStats()`. */
export interface CacheStats {
/** The larger of the in-memory entry count and the number of cache files on disk. */
totalEntries: number;
/** Sum of the sizes of all cache files on disk, in bytes. */
totalSize: number;
/** Hits as a percentage (0-100) of all `get` calls so far; 0 when there were none. */
hitRate: number;
/** Smallest `timestamp` among readable cache files; undefined when there are none. */
oldestEntry?: number;
/** Largest `timestamp` among readable cache files; undefined when there are none. */
newestEntry?: number;
}
/**
 * Two-tier cache for crawl results: a bounded in-memory LRU map in front of
 * JSON files on disk (one file per URL, grouped into one directory per domain).
 *
 * Every entry carries its own TTL and is treated as a miss once
 * `Date.now() - timestamp > ttl`. File-system failures are logged or ignored
 * rather than thrown, so the cache degrades to "always miss" instead of
 * breaking the caller.
 */
export class FileCache {
  /**
   * Longest key that is still used verbatim as a file name. Common file
   * systems cap a name at 255 bytes and the `.json` suffix takes 5 of them.
   */
  private static readonly MAX_KEY_LENGTH = 250;

  private memoryCache = new Map<string, CacheEntry>();
  private accessCount = 0;
  private hitCount = 0;
  private maxMemoryEntries: number;

  /**
   * @param cacheDir Root directory for the on-disk cache; created if missing.
   * @param defaultTTL Lifetime in milliseconds for entries stored without an explicit TTL.
   * @param maxMemoryEntries Upper bound on in-memory entries; 0 or less disables the memory tier.
   */
  constructor(
    private cacheDir: string,
    private defaultTTL: number = 86400000, // 24 hours
    maxMemoryEntries: number = 1000
  ) {
    this.maxMemoryEntries = maxMemoryEntries;
    // Deliberately not awaited: ensureCacheDir() logs and swallows its own
    // errors, and set() creates the directories it needs before writing.
    void this.ensureCacheDir();
  }

  /**
   * Looks up a URL, trying memory first and then disk.
   *
   * A hit refreshes the entry's LRU position. Expired entries are removed
   * from whichever tier they were found in. A stored entry whose `url` does
   * not equal the requested one (possible because keys are lossy) is ignored.
   *
   * @returns The cached result, or `null` on a miss.
   */
  async get(url: string): Promise<CrawlResult | null> {
    this.accessCount++;
    const key = this.getKey(url);

    const memoryEntry = this.memoryCache.get(key);
    if (memoryEntry && memoryEntry.url === url) {
      if (!this.isExpired(memoryEntry)) {
        this.hitCount++;
        // Re-insert so the entry becomes the most recently used one.
        this.setMemoryCache(key, memoryEntry);
        logger.debug(`Cache hit (memory): ${url}`);
        return memoryEntry.result;
      }
      this.memoryCache.delete(key);
    }

    const filePath = this.getFilePath(url, key);
    const fileEntry = await this.readEntryFile(filePath);
    if (fileEntry && fileEntry.url === url) {
      if (!this.isExpired(fileEntry)) {
        this.hitCount++;
        logger.debug(`Cache hit (file): ${url}`);
        this.setMemoryCache(key, fileEntry);
        return fileEntry.result;
      }
      await this.removeFile(filePath);
    }

    logger.debug(`Cache miss: ${url}`);
    return null;
  }

  /**
   * Stores a result in memory and on disk.
   *
   * @param ttl Lifetime in milliseconds. Only `undefined` falls back to the
   *   default, so an explicit `0` yields an entry that expires almost
   *   immediately instead of silently getting the default lifetime.
   */
  async set(url: string, result: CrawlResult, ttl?: number): Promise<void> {
    const key = this.getKey(url);
    const entry: CacheEntry = {
      url,
      result,
      timestamp: Date.now(),
      ttl: ttl ?? this.defaultTTL,
    };

    this.setMemoryCache(key, entry);

    try {
      const filePath = this.getFilePath(url, key);
      await fs.mkdir(dirname(filePath), { recursive: true });
      await fs.writeFile(filePath, JSON.stringify(entry, null, 2));
      logger.debug(`Cached to file: ${url}`);
    } catch (error) {
      logger.error(`Failed to cache to file ${url}:`, error);
    }
  }

  /**
   * Reports whether a non-expired entry exists for the URL. Delegates to
   * `get`, so the call is counted in the hit-rate statistics.
   */
  async has(url: string): Promise<boolean> {
    const result = await this.get(url);
    return result !== null;
  }

  /**
   * Removes the entry for a URL from both tiers.
   *
   * @returns `true` when a file was deleted or a memory entry existed.
   */
  async delete(url: string): Promise<boolean> {
    const key = this.getKey(url);
    const hadMemory = this.memoryCache.delete(key);
    try {
      await fs.unlink(this.getFilePath(url, key));
      logger.debug(`Deleted from cache: ${url}`);
      return true;
    } catch {
      // No file to delete (or it could not be deleted); report the memory tier.
      return hadMemory;
    }
  }

  /**
   * Clears the whole cache, or only the entries whose URL matches a pattern.
   *
   * @param urlPattern Optional glob-like pattern in which `*` matches any run
   *   of characters and everything else is literal. The match is unanchored,
   *   so the pattern may match any part of the URL. When omitted or empty the
   *   entire cache is cleared.
   * @returns The number of distinct entries removed.
   */
  async clear(urlPattern?: string): Promise<number> {
    if (!urlPattern) {
      return this.clearAll();
    }
    return this.clearMatching(urlPattern);
  }

  /**
   * Collects size, hit-rate and age information. Unreadable or malformed
   * files still count towards `totalEntries` and `totalSize` but not towards
   * the oldest/newest timestamps.
   */
  async getStats(): Promise<CacheStats> {
    let totalSize = 0;
    let oldestEntry: number | undefined;
    let newestEntry: number | undefined;

    const files = await this.getAllCacheFiles();
    for (const file of files) {
      try {
        totalSize += (await fs.stat(file)).size;
      } catch {
        // File vanished or is inaccessible; nothing more to learn from it.
        continue;
      }
      const entry = await this.readEntryFile(file);
      if (!entry) {
        continue;
      }
      // Compare against undefined explicitly so a timestamp of 0 is kept.
      if (oldestEntry === undefined || entry.timestamp < oldestEntry) {
        oldestEntry = entry.timestamp;
      }
      if (newestEntry === undefined || entry.timestamp > newestEntry) {
        newestEntry = entry.timestamp;
      }
    }

    const hitRate = this.accessCount > 0 ? (this.hitCount / this.accessCount) * 100 : 0;
    return {
      totalEntries: Math.max(this.memoryCache.size, files.length),
      totalSize,
      hitRate,
      oldestEntry,
      newestEntry,
    };
  }

  /**
   * Returns every non-expired result, memory entries first, then entries
   * that exist only on disk. Duplicates are detected by the cache entry's
   * own `url`.
   */
  async getAllEntries(): Promise<CrawlResult[]> {
    const results: CrawlResult[] = [];
    const seenUrls = new Set<string>();

    for (const entry of this.memoryCache.values()) {
      if (!this.isExpired(entry)) {
        results.push(entry.result);
        seenUrls.add(entry.url);
      }
    }

    for (const file of await this.getAllCacheFiles()) {
      const entry = await this.readEntryFile(file);
      if (!entry) {
        logger.debug(`Skipping corrupted cache file: ${file}`);
        continue;
      }
      if (!this.isExpired(entry) && !seenUrls.has(entry.url)) {
        results.push(entry.result);
        seenUrls.add(entry.url);
      }
    }

    return results;
  }

  /**
   * Removes expired entries from both tiers and deletes files that cannot be
   * read as a cache entry.
   *
   * @returns Distinct expired entries removed plus unreadable files deleted.
   *   An entry present in memory and on disk is counted once.
   */
  async cleanup(): Promise<number> {
    const expiredUrls = new Set<string>();
    let unreadableRemoved = 0;

    for (const [key, entry] of this.memoryCache.entries()) {
      if (this.isExpired(entry)) {
        this.memoryCache.delete(key);
        expiredUrls.add(entry.url);
      }
    }

    for (const file of await this.getAllCacheFiles()) {
      const entry = await this.readEntryFile(file);
      if (!entry) {
        if (await this.removeFile(file)) {
          unreadableRemoved++;
        }
        continue;
      }
      if (this.isExpired(entry) && (await this.removeFile(file))) {
        expiredUrls.add(entry.url);
      }
    }

    const cleaned = expiredUrls.size + unreadableRemoved;
    if (cleaned > 0) {
      logger.info(`Cleaned up ${cleaned} expired cache entries`);
    }
    return cleaned;
  }

  /** Empties both tiers; the count is the larger of the two tiers' sizes. */
  private async clearAll(): Promise<number> {
    const memoryCount = this.memoryCache.size;
    this.memoryCache.clear();
    const fileCount = (await this.getAllCacheFiles()).length;
    try {
      await fs.rm(this.cacheDir, { recursive: true, force: true });
      await this.ensureCacheDir();
      logger.info('Cleared entire cache');
      return Math.max(memoryCount, fileCount);
    } catch (error) {
      logger.error('Failed to clear cache directory:', error);
      // Only the memory tier is known to have been cleared.
      return memoryCount;
    }
  }

  /** Removes entries whose URL matches the pattern, counting each URL once. */
  private async clearMatching(urlPattern: string): Promise<number> {
    const matcher = FileCache.patternToRegExp(urlPattern);
    const clearedUrls = new Set<string>();

    for (const [key, entry] of this.memoryCache.entries()) {
      if (matcher.test(entry.url)) {
        this.memoryCache.delete(key);
        clearedUrls.add(entry.url);
      }
    }

    for (const file of await this.getAllCacheFiles()) {
      const entry = await this.readEntryFile(file);
      if (!entry || !matcher.test(entry.url)) {
        continue;
      }
      if (await this.removeFile(file)) {
        clearedUrls.add(entry.url);
      }
    }

    logger.info(`Cleared ${clearedUrls.size} cache entries matching pattern: ${urlPattern}`);
    return clearedUrls.size;
  }

  /**
   * Turns a `*` glob into an unanchored RegExp. Everything between the
   * wildcards is escaped so URL characters such as `.`, `?` and `+` match
   * themselves and an unbalanced `(` can no longer make the constructor throw.
   */
  private static patternToRegExp(urlPattern: string): RegExp {
    const source = urlPattern
      .split('*')
      .map((literal) => literal.replace(/[.+?^${}()|[\]\\]/g, '\\$&'))
      .join('.*');
    return new RegExp(source);
  }

  /** Type guard for data parsed from a cache file. */
  private static isCacheEntry(value: unknown): value is CacheEntry {
    if (typeof value !== 'object' || value === null) {
      return false;
    }
    const candidate = value as Record<string, unknown>;
    return (
      typeof candidate.url === 'string' &&
      typeof candidate.timestamp === 'number' &&
      typeof candidate.ttl === 'number' &&
      candidate.result !== undefined &&
      candidate.result !== null
    );
  }

  /**
   * Reads and validates one cache file.
   *
   * @returns The entry, or `null` when the file is missing, unreadable, not
   *   valid JSON, or does not have the shape of a cache entry.
   */
  private async readEntryFile(filePath: string): Promise<CacheEntry | null> {
    try {
      const parsed: unknown = JSON.parse(await fs.readFile(filePath, 'utf-8'));
      return FileCache.isCacheEntry(parsed) ? parsed : null;
    } catch {
      return null;
    }
  }

  /**
   * Derives the map key / file base name for a URL.
   *
   * The key is the URL's base64 form with `/`, `+` and `=` replaced by `_`.
   * That mapping is lossy, which is why readers compare the stored `url`
   * with the requested one. URLs whose encoded form would not fit into a
   * file name get a SHA-256 hex digest instead; shorter URLs keep the
   * original key format so files written earlier are still found.
   */
  private getKey(url: string): string {
    const encoded = Buffer.from(url).toString('base64').replace(/[/+=]/g, '_');
    if (encoded.length <= FileCache.MAX_KEY_LENGTH) {
      return encoded;
    }
    return createHash('sha256').update(url).digest('hex');
  }

  /** Builds `<cacheDir>/<domain>/<key>.json` for a URL. */
  private getFilePath(url: string, key: string = this.getKey(url)): string {
    return join(this.cacheDir, this.getDomainDir(url), `${key}.json`);
  }

  /**
   * Picks the per-domain directory name. The domain comes from the URL
   * itself; the key cannot be decoded back into the URL because `getKey`
   * is lossy.
   */
  private getDomainDir(url: string): string {
    let domain: string;
    try {
      domain = extractDomain(url) || 'unknown';
    } catch {
      return 'unknown';
    }
    // extractDomain is defined elsewhere; refuse anything path-like so a
    // crafted URL cannot place files outside the cache directory.
    if (domain === '.' || domain === '..' || /[\\/]/.test(domain)) {
      return 'unknown';
    }
    return domain;
  }

  /** True once the entry has outlived its TTL. */
  private isExpired(entry: CacheEntry): boolean {
    return Date.now() - entry.timestamp > entry.ttl;
  }

  /**
   * Inserts or refreshes an entry as the most recently used one, evicting
   * the least recently used entries when the map is full. A Map iterates in
   * insertion order, so its first key is always the least recently used.
   */
  private setMemoryCache(key: string, entry: CacheEntry): void {
    if (this.maxMemoryEntries <= 0) {
      return;
    }
    // Remove first: overwriting must not evict a neighbour, and re-inserting
    // moves the key to the most-recent end of the iteration order.
    this.memoryCache.delete(key);
    while (this.memoryCache.size >= this.maxMemoryEntries) {
      const oldestKey = this.memoryCache.keys().next().value;
      if (oldestKey === undefined) {
        break;
      }
      this.memoryCache.delete(oldestKey);
    }
    this.memoryCache.set(key, entry);
  }

  /** Creates the cache root directory; failures are logged, not thrown. */
  private async ensureCacheDir(): Promise<void> {
    try {
      await fs.mkdir(this.cacheDir, { recursive: true });
    } catch (error) {
      logger.error(`Failed to create cache directory ${this.cacheDir}:`, error);
    }
  }

  /**
   * Recursively lists every `.json` file below the cache root. Directories
   * that are missing or unreadable are skipped, so this never throws.
   */
  private async getAllCacheFiles(): Promise<string[]> {
    const files: string[] = [];
    const scanDir = async (dir: string): Promise<void> => {
      try {
        const entries = await fs.readdir(dir, { withFileTypes: true });
        for (const entry of entries) {
          const fullPath = join(dir, entry.name);
          if (entry.isDirectory()) {
            await scanDir(fullPath);
          } else if (entry.name.endsWith('.json')) {
            files.push(fullPath);
          }
        }
      } catch {
        // Directory doesn't exist or is inaccessible
      }
    };
    await scanDir(this.cacheDir);
    return files;
  }

  /**
   * Deletes a file, ignoring failures.
   *
   * @returns `true` when the file was actually removed.
   */
  private async removeFile(filePath: string): Promise<boolean> {
    try {
      await fs.unlink(filePath);
      return true;
    } catch {
      // File already doesn't exist (or cannot be removed)
      return false;
    }
  }
}