/**
* GPU-Safe VRAM Manager
*
* CRITICAL PRINCIPLES:
* 1. Protect the GPU above all else (including service disruption)
* 2. Think strategically about long-term GPU health
* 3. Handle edge cases gracefully without GPU damage
* 4. Trust but verify all operations
* 5. Do no harm - refuse operations rather than risk hardware
*
* This manager prioritizes GPU safety over service availability.
* It will refuse requests, force cleanups, or even recommend system
* shutdown before allowing conditions that could damage the GPU.
*/
import fetch from 'node-fetch';
import { runMaintenanceWorkflow, MaintenanceScheduler } from '../workflows/maintenance-workflow.js';
/**
 * GPU safety thresholds shared by every manager instance.
 *
 * Frozen so that no code path can loosen a limit at runtime by accident;
 * callers that need different values should pass overrides that are merged
 * into a copy of this table.
 */
const GPU_SAFETY = Object.freeze({
  // Temperature thresholds (Celsius)
  TEMP_WARNING: 75, // Start monitoring closely
  TEMP_CRITICAL: 83, // Force cleanup
  TEMP_EMERGENCY: 87, // Refuse all operations
  TEMP_SHUTDOWN: 90, // Recommend immediate shutdown

  // VRAM thresholds (fraction of total VRAM in use)
  VRAM_SAFE: 0.70, // Safe operating range
  VRAM_WARNING: 0.75, // Monitor closely
  VRAM_CRITICAL: 0.85, // Force cleanup
  VRAM_EMERGENCY: 0.92, // Refuse new operations
  VRAM_SHUTDOWN: 0.98, // Emergency shutdown recommended

  // Sustained load thresholds
  SUSTAINED_HIGH_MINUTES: 10, // Minutes of high usage before intervention
  COOLDOWN_REQUIRED: 5 * 60 * 1000, // 5 min minimum between cleanups (milliseconds)

  // Failure thresholds
  MAX_CLEANUP_FAILURES: 3, // Max consecutive cleanup failures
  MAX_STAT_FAILURES: 5, // Max consecutive stat check failures
});
/**
 * Safety-first VRAM / temperature gatekeeper for a ComfyUI-backed GPU.
 *
 * Polls the server's `/system_stats` endpoint, compares the readings with the
 * configured thresholds and reacts by forcing cleanups, refusing operations
 * (emergency mode) or invoking the emergency-shutdown callback.
 *
 * NOTE(review): emergency mode is never left automatically and there is no
 * public method to clear it; once entered, operations are refused until the
 * manager is re-created. Confirm this is the intended policy.
 */
export class GPUSafeVRAMManager {
  /**
   * @param {object} comfyuiClient - Client object; `serverAddress` is read to
   *   build the stats URL, and the object is handed to the maintenance
   *   scheduler and cleanup workflow.
   * @param {object} [options]
   * @param {object} [options.safetyThresholds] - Overrides merged over `GPU_SAFETY`.
   * @param {number} [options.checkInterval=30000] - Normal polling period (ms).
   * @param {number} [options.idleTimeout=900000] - Passed to the scheduler (ms).
   * @param {boolean} [options.autoProtection=true]
   * @param {boolean} [options.refuseOnHighVRAM=true]
   * @param {boolean} [options.verifyCleanup=true]
   * @param {boolean} [options.historicalTracking=true]
   * @param {boolean} [options.logging=true]
   * @param {boolean} [options.alerting=true]
   * @param {string} [options.emergencyLogPath='gpu-emergency.log'] - File that
   *   receives one JSON line per emergency shutdown.
   * @param {Function} [options.onEmergencyShutdown] - Called with
   *   `(reason, stats)` when a shutdown threshold is crossed.
   */
  constructor(comfyuiClient, options = {}) {
    this.client = comfyuiClient;

    // Configuration with safety-first defaults
    this.config = {
      // Safety thresholds (conservative defaults)
      safetyThresholds: {
        ...GPU_SAFETY,
        ...(options.safetyThresholds || {}),
      },

      // Operational parameters
      checkInterval: options.checkInterval || 30000, // Check every 30 seconds
      emergencyCheckInterval: 5000, // Check every 5 seconds in emergency
      idleTimeout: options.idleTimeout || 15 * 60 * 1000, // 15 minutes idle

      // Features (all enabled unless explicitly set to false)
      autoProtection: options.autoProtection !== false,
      refuseOnHighVRAM: options.refuseOnHighVRAM !== false,
      verifyCleanup: options.verifyCleanup !== false,
      historicalTracking: options.historicalTracking !== false,

      // Logging
      logging: options.logging !== false,
      alerting: options.alerting !== false,
      emergencyLogPath: options.emergencyLogPath || 'gpu-emergency.log',
    };

    // State tracking with history
    this.state = {
      // Current state
      currentUsage: 0,
      currentTemp: null,
      lastCheck: null,
      lastCleanup: null,
      isMonitoring: false,
      emergencyMode: false,

      // Failure tracking
      consecutiveCleanupFailures: 0,
      consecutiveStatFailures: 0,

      // Historical data (for trend analysis)
      history: [],
      maxHistorySize: 100,

      // Sustained load tracking
      highUsageStartTime: null,

      // Service health
      serviceHealthy: true,
      lastHealthCheck: null,
    };

    // Initialize scheduler with safety parameters
    this.scheduler = new MaintenanceScheduler(comfyuiClient, {
      idleTimeout: this.config.idleTimeout,
      periodicInterval: 30 * 60 * 1000,
      strategy: 'sd15', // Default to SD1.5 strategy
      enabled: this.config.autoProtection,
    });

    // Track operations in progress
    this.operationsInProgress = new Set();
    // Monotonic counter that keeps operation ids unique within one millisecond.
    this.operationSequence = 0;

    // Handle of the active polling timer (null while no timer is running).
    this.checkInterval = null;

    // Emergency shutdown callback
    this.emergencyShutdownCallback = options.onEmergencyShutdown || null;
  }

  /**
   * Fetch and parse GPU stats from `/system_stats`.
   *
   * Implausible readings are returned with `reliable: false` rather than
   * dropped. Any failure (network, HTTP status, parsing) is counted; reaching
   * `MAX_STAT_FAILURES` consecutive failures enters emergency mode.
   *
   * @returns {Promise<object|null>} Parsed stats with `reliable` and
   *   `temperature` added, or `null` when stats could not be obtained.
   */
  async getGPUStats() {
    try {
      const response = await fetch(
        `http://${this.client.serverAddress}/system_stats`
      );
      if (!response.ok) {
        throw new Error(`Stats endpoint returned ${response.status}`);
      }

      const data = await response.json();
      const stats = this.parseVRAMStats(data);
      if (!stats) {
        throw new Error('Unable to parse VRAM stats');
      }

      // VERIFY: a zero/negative total or a usage outside [0, 1] cannot be real.
      const plausible = stats.total > 0 && stats.usage >= 0 && stats.usage <= 1;
      if (!plausible) {
        this.log('⚠️ Suspicious stats detected, marking as unreliable', 'warn');
      }
      stats.reliable = plausible;

      // Add temperature if available
      stats.temperature = this.parseTemperature(data);

      // Track successful stat check
      this.state.consecutiveStatFailures = 0;
      return stats;
    } catch (error) {
      this.state.consecutiveStatFailures++;
      this.log(
        `⚠️ GPU stats unavailable (${this.state.consecutiveStatFailures} consecutive): ${error.message}`,
        'warn'
      );

      // EDGE CASE: Multiple stat failures
      if (this.state.consecutiveStatFailures >= this.config.safetyThresholds.MAX_STAT_FAILURES) {
        this.log('🚨 CRITICAL: Unable to monitor GPU stats!', 'error');
        this.enterEmergencyMode('stat_failures');
      }
      return null;
    }
  }

  /**
   * Extract the temperature of the first CUDA device that reports one.
   *
   * A reading of 0 is accepted; numeric strings are converted to numbers.
   *
   * @param {object} data - Decoded `/system_stats` payload.
   * @returns {number|null} Temperature, or `null` when none is reported.
   */
  parseTemperature(data) {
    const devices = Array.isArray(data?.devices) ? data.devices : [];
    for (const device of devices) {
      if (device?.type !== 'cuda') continue;

      const raw = device.temperature;
      const reading = typeof raw === 'string' ? Number.parseFloat(raw) : raw;
      if (typeof reading === 'number' && Number.isFinite(reading)) {
        return reading;
      }
    }
    return null;
  }

  /**
   * Build VRAM figures from the first CUDA device in the payload.
   *
   * @param {object} data - Decoded `/system_stats` payload.
   * @returns {{total: number, used: number, free: number, usage: number,
   *   totalGB: number, usedGB: number, freeGB: number}|null} Byte counts,
   *   usage as a 0-1 fraction and GiB conversions, or `null` when no CUDA
   *   device is present.
   */
  parseVRAMStats(data) {
    const devices = Array.isArray(data?.devices) ? data.devices : [];
    const gpu = devices.find((device) => device?.type === 'cuda');
    if (!gpu) return null;

    const BYTES_PER_GIB = 1024 ** 3;
    // Missing or non-numeric fields collapse to 0, which the caller flags as
    // unreliable.
    const total = Number(gpu.vram_total) || 0;
    const free = Number(gpu.vram_free) || 0;
    const used = total - free;

    return {
      total,
      used,
      free,
      usage: total > 0 ? used / total : 0,
      totalGB: total / BYTES_PER_GIB,
      usedGB: used / BYTES_PER_GIB,
      freeGB: free / BYTES_PER_GIB,
    };
  }

  /**
   * CRITICAL: Check GPU safety and take protective actions.
   *
   * Temperature thresholds are evaluated before VRAM thresholds. Missing or
   * unreliable stats are treated as unsafe.
   *
   * @returns {Promise<{safe: boolean, reason: string, action: string,
   *   stats?: object}>} `stats` is only included when no protective threshold
   *   was crossed.
   */
  async checkGPUSafety() {
    const limits = this.config.safetyThresholds;
    const stats = await this.getGPUStats();

    if (!stats) {
      // NO STATS = ASSUME WORST CASE
      this.log('⚠️ No GPU stats available, assuming high usage', 'warn');
      return {
        safe: false,
        reason: 'no_stats',
        action: 'refuse_operations',
      };
    }

    if (stats.reliable === false) {
      // A reading such as "0 bytes total" computes to 0% usage; trusting it
      // would report the GPU as safe on garbage data.
      this.log('⚠️ GPU stats unreliable, refusing to treat them as safe', 'warn');
      return {
        safe: false,
        reason: 'unreliable_stats',
        action: 'refuse_operations',
      };
    }

    // Update state
    this.state.currentUsage = stats.usage;
    this.state.currentTemp = stats.temperature;
    this.state.lastCheck = Date.now();

    // Add to history for trend analysis
    this.addToHistory(stats);

    const temperature = stats.temperature;
    const usagePercent = (stats.usage * 100).toFixed(1);

    // Temperature checks (highest priority)
    if (temperature != null) {
      if (temperature >= limits.TEMP_SHUTDOWN) {
        this.log(`🔥 EMERGENCY: GPU temp ${temperature}°C - SHUTDOWN REQUIRED!`, 'error');
        this.triggerEmergencyShutdown('temperature_critical', stats);
        return { safe: false, reason: 'temp_shutdown', action: 'shutdown' };
      }
      if (temperature >= limits.TEMP_EMERGENCY) {
        this.log(`🚨 GPU temp critical: ${temperature}°C`, 'error');
        this.enterEmergencyMode('temperature_high');
        return { safe: false, reason: 'temp_emergency', action: 'refuse_all' };
      }
      if (temperature >= limits.TEMP_CRITICAL) {
        this.log(`⚠️ GPU temp high: ${temperature}°C - forcing cleanup`, 'warn');
        await this.emergencyCleanup(stats);
        return { safe: false, reason: 'temp_critical', action: 'cleanup' };
      }
    }

    // VRAM checks
    if (stats.usage >= limits.VRAM_SHUTDOWN) {
      this.log(`💀 EMERGENCY: VRAM at ${usagePercent}% - SHUTDOWN REQUIRED!`, 'error');
      this.triggerEmergencyShutdown('vram_critical', stats);
      return { safe: false, reason: 'vram_shutdown', action: 'shutdown' };
    }
    if (stats.usage >= limits.VRAM_EMERGENCY) {
      this.log(`🚨 VRAM emergency: ${usagePercent}% - refusing operations`, 'error');
      this.enterEmergencyMode('vram_emergency');
      await this.emergencyCleanup(stats);
      return { safe: false, reason: 'vram_emergency', action: 'refuse_new' };
    }
    if (stats.usage >= limits.VRAM_CRITICAL) {
      this.log(`⚠️ VRAM critical: ${usagePercent}% - cleanup required`, 'warn');

      // Only force a cleanup once usage has stayed critical long enough.
      if (!this.state.highUsageStartTime) {
        this.state.highUsageStartTime = Date.now();
      } else {
        const sustainedMinutes = (Date.now() - this.state.highUsageStartTime) / 60000;
        if (sustainedMinutes >= limits.SUSTAINED_HIGH_MINUTES) {
          this.log(`⏱️ Sustained high VRAM for ${sustainedMinutes.toFixed(1)} minutes`, 'warn');
          await this.emergencyCleanup(stats);
        }
      }
      return { safe: false, reason: 'vram_critical', action: 'cleanup_needed' };
    }

    // Usage is below critical here, so the sustained-load clock restarts.
    this.state.highUsageStartTime = null;

    // Check if safe for operations
    const safe = stats.usage < limits.VRAM_SAFE;
    this.log(
      `GPU: ${stats.usedGB.toFixed(1)}/${stats.totalGB.toFixed(1)}GB ` +
        `(${usagePercent}%) ` +
        (temperature != null ? `| Temp: ${temperature}°C ` : '') +
        `| Status: ${safe ? '✅ SAFE' : '⚠️ CAUTION'}`
    );

    return {
      safe,
      stats,
      reason: safe ? 'within_limits' : 'approaching_limits',
      action: safe ? 'allow' : 'monitor',
    };
  }

  /**
   * Emergency cleanup with verification.
   * DO NO HARM: Verify cleanup worked, escalate if not.
   *
   * Runs the `'empty'` maintenance workflow, waits 3 s, re-reads the stats and
   * counts the cleanup as successful only if usage dropped and more than
   * 0.5 GB was freed. `MAX_CLEANUP_FAILURES` consecutive failures enter
   * emergency mode.
   *
   * NOTE(review): `COOLDOWN_REQUIRED` is configured but not enforced here;
   * `state.lastCleanup` is recorded so a cooldown can be added if intended.
   *
   * @param {object} stats - Stats snapshot taken before the cleanup.
   * @returns {Promise<{success: boolean, freedGB?: number, newUsage?: number,
   *   error?: string}>}
   */
  async emergencyCleanup(stats) {
    this.log('🚨 EMERGENCY CLEANUP INITIATED', 'error');

    try {
      // Stop all operations
      this.pauseAllOperations();

      // Run aggressive cleanup
      const result = await runMaintenanceWorkflow(this.client, 'empty');
      if (!result.success) {
        throw new Error('Cleanup workflow failed');
      }

      // VERIFY: Wait and check if cleanup worked
      await new Promise((resolve) => setTimeout(resolve, 3000));
      const newStats = await this.getGPUStats();
      if (!newStats) {
        throw new Error('Cannot verify cleanup - no stats');
      }

      const freedGB = stats.usedGB - newStats.usedGB;
      const effective = newStats.usage < stats.usage && freedGB > 0.5; // At least 500MB freed
      if (!effective) {
        throw new Error(`Cleanup ineffective: only freed ${freedGB.toFixed(1)}GB`);
      }

      this.log(
        `✅ Emergency cleanup successful: freed ${freedGB.toFixed(1)}GB`,
        'success'
      );
      this.state.consecutiveCleanupFailures = 0;
      return { success: true, freedGB, newUsage: newStats.usage };
    } catch (error) {
      this.state.consecutiveCleanupFailures++;
      this.log(`❌ Emergency cleanup failed: ${error.message}`, 'error');

      // ESCALATE: Multiple failures = emergency mode
      if (this.state.consecutiveCleanupFailures >= this.config.safetyThresholds.MAX_CLEANUP_FAILURES) {
        this.log('💀 CRITICAL: Multiple cleanup failures - entering emergency mode', 'error');
        this.enterEmergencyMode('cleanup_failures');
      }
      return { success: false, error: error.message };
    } finally {
      this.state.lastCleanup = Date.now();
    }
  }

  /**
   * Enter emergency mode - refuse all non-critical operations.
   * PROTECT THE GPU: Better to deny service than damage hardware.
   *
   * Idempotent: while emergency mode is already active, further calls only
   * log the reason. They do not re-send alerts or restart the polling timer.
   *
   * @param {string} reason - Short machine-readable cause.
   */
  enterEmergencyMode(reason) {
    if (this.state.emergencyMode) {
      this.log(`🚨 Emergency condition persists: ${reason}`, 'error');
      return;
    }

    this.state.emergencyMode = true;
    this.state.serviceHealthy = false;
    this.log(`🚨 EMERGENCY MODE ACTIVATED: ${reason}`, 'error');
    this.log('⛔ All non-critical operations will be refused', 'error');

    // Switch to rapid monitoring
    this.scheduleSafetyChecks(this.config.emergencyCheckInterval);

    // Alert if configured
    if (this.config.alerting) {
      this.sendAlert('emergency_mode', { reason, timestamp: Date.now() });
    }
  }

  /**
   * Trigger emergency shutdown.
   * LAST RESORT: Protect hardware at all costs.
   *
   * A failing shutdown callback is logged and does not prevent the remaining
   * steps (pause, alert, file log) from running.
   *
   * @param {string} reason - Short machine-readable cause.
   * @param {object} stats - Stats snapshot that crossed the threshold.
   */
  triggerEmergencyShutdown(reason, stats) {
    this.log('🆘🆘🆘 EMERGENCY SHUTDOWN TRIGGERED 🆘🆘🆘', 'error');
    this.log(`Reason: ${reason}`, 'error');
    this.log(`Stats: VRAM ${(stats.usage * 100).toFixed(1)}%, Temp: ${stats.temperature}°C`, 'error');

    // Call emergency shutdown callback if provided
    if (this.emergencyShutdownCallback) {
      const reportFailure = (error) => {
        this.log(`❌ Emergency shutdown callback failed: ${error?.message ?? error}`, 'error');
      };
      try {
        // Promise.resolve also catches rejections from async callbacks.
        Promise.resolve(this.emergencyShutdownCallback(reason, stats)).catch(reportFailure);
      } catch (error) {
        reportFailure(error);
      }
    }

    // Stop all operations immediately
    this.pauseAllOperations();

    // Alert
    if (this.config.alerting) {
      this.sendAlert('emergency_shutdown', { reason, stats, timestamp: Date.now() });
    }

    // Log to file if possible (the method handles its own errors)
    void this.logEmergencyToFile(reason, stats);
  }

  /**
   * Pre-operation safety check.
   * THINK STRATEGICALLY: Prevent problems before they occur.
   *
   * When the requested VRAM (plus a 20% margin) is not free, a cleanup is
   * attempted and the check is repeated.
   *
   * @param {string} [operationType='unknown'] - Label used in logs and ids.
   * @param {{vramGB?: number}} [requirements] - Expected VRAM need in GB.
   * @returns {Promise<{allowed: boolean, reason?: string, message: string,
   *   operationId?: string, stats?: object}>} On approval, `operationId` must
   *   later be passed to `operationComplete`.
   */
  async canSafelyExecute(operationType = 'unknown', requirements = {}) {
    // Check if in emergency mode
    if (this.state.emergencyMode) {
      this.log(`❌ Operation '${operationType}' refused - emergency mode active`, 'error');
      return {
        allowed: false,
        reason: 'emergency_mode',
        message: 'System in emergency mode - operations refused for GPU safety',
      };
    }

    // Get current GPU state
    const safety = await this.checkGPUSafety();
    if (!safety.safe) {
      this.log(`❌ Operation '${operationType}' refused - GPU unsafe: ${safety.reason}`, 'warn');
      return {
        allowed: false,
        reason: safety.reason,
        message: `GPU safety check failed: ${safety.reason}`,
      };
    }

    // Check operation requirements
    const requiredVRAM = requirements?.vramGB || 0;
    const { stats } = safety;
    if (stats && requiredVRAM > 0) {
      const availableGB = stats.freeGB;
      if (availableGB < requiredVRAM * 1.2) { // 20% safety margin
        this.log(
          `⚠️ Operation '${operationType}' needs ${requiredVRAM}GB, only ${availableGB.toFixed(1)}GB free`,
          'warn'
        );

        // Try cleanup first
        const cleanup = await this.emergencyCleanup(stats);
        if (!cleanup.success) {
          return {
            allowed: false,
            reason: 'insufficient_vram',
            message: `Insufficient VRAM: need ${requiredVRAM}GB, have ${availableGB.toFixed(1)}GB`,
          };
        }

        // Re-check after cleanup
        return this.canSafelyExecute(operationType, requirements);
      }
    }

    // Track operation. The sequence suffix keeps ids distinct when two
    // operations of the same type are approved in the same millisecond.
    this.operationSequence = (this.operationSequence || 0) + 1;
    const operationId = `${operationType}_${Date.now()}_${this.operationSequence}`;
    this.operationsInProgress.add(operationId);

    return {
      allowed: true,
      operationId,
      stats,
      message: 'Operation approved - GPU within safe parameters',
    };
  }

  /**
   * Mark operation as complete and re-check the GPU in the background.
   *
   * @param {string} operationId - Id returned by `canSafelyExecute`.
   */
  operationComplete(operationId) {
    this.operationsInProgress.delete(operationId);
    // Check GPU state after operation
    this.runBackgroundCheck();
  }

  /**
   * Pause all operations for safety.
   *
   * Only clears the in-progress bookkeeping; no running work is cancelled here.
   */
  pauseAllOperations() {
    this.log('⏸️ Pausing all operations for GPU safety', 'warn');
    this.operationsInProgress.clear();
  }

  /**
   * Add stats to history for trend analysis.
   * PLAN FOR LONG TERM: Track patterns to predict issues.
   *
   * No-op when `historicalTracking` is disabled. The history is capped at
   * `state.maxHistorySize` entries, oldest dropped first.
   *
   * @param {object} stats - Stats snapshot to record.
   */
  addToHistory(stats) {
    if (!this.config.historicalTracking) return;

    const { history, maxHistorySize } = this.state;
    history.push({
      timestamp: Date.now(),
      usage: stats.usage,
      temperature: stats.temperature,
      usedGB: stats.usedGB,
    });

    // Maintain history size limit
    if (history.length > maxHistorySize) {
      history.shift();
    }

    // Analyze trends
    this.analyzeTrends();
  }

  /**
   * Analyze the 10 most recent samples and log warnings for a rising or
   * persistently high usage pattern. Does nothing with fewer than 10 samples.
   */
  analyzeTrends() {
    const WINDOW = 10;
    if (this.state.history.length < WINDOW) return;

    const recent = this.state.history.slice(-WINDOW);
    const avgUsage = recent.reduce((sum, sample) => sum + sample.usage, 0) / recent.length;
    const trend = recent[recent.length - 1].usage - recent[0].usage; // Change across the window

    if (trend > 0.1 && avgUsage > 0.7) {
      this.log('📈 Warning: VRAM usage trending upward', 'warn');
    }
    if (recent.every((sample) => sample.usage > 0.8)) {
      this.log('⚠️ Sustained high VRAM usage detected', 'warn');
    }
  }

  /**
   * Send alert (implement based on your alerting system).
   *
   * @param {string} type - Alert category.
   * @param {object} data - Alert payload.
   */
  sendAlert(type, data) {
    console.error(`[ALERT] ${type}:`, data);
    // Implement: email, webhook, logging service, etc.
  }

  /**
   * Append one JSON line describing an emergency to the emergency log file.
   *
   * Best effort: failures are reported on stderr and never thrown.
   *
   * @param {string} reason - Short machine-readable cause.
   * @param {object} stats - Stats snapshot at the time of the emergency.
   * @returns {Promise<void>} Settles once the write was attempted.
   */
  async logEmergencyToFile(reason, stats) {
    try {
      const logData = {
        timestamp: new Date().toISOString(),
        reason,
        stats,
        history: this.state.history.slice(-20), // Last 20 entries
      };

      // `require` does not exist in ES modules, so the fs API is loaded with a
      // dynamic import instead.
      const { appendFile } = await import('node:fs/promises');
      await appendFile(
        this.config.emergencyLogPath || 'gpu-emergency.log',
        `${JSON.stringify(logData)}\n`
      );
    } catch (err) {
      console.error('Failed to log emergency:', err);
    }
  }

  /**
   * Run a safety check and summarise the manager's health.
   *
   * @returns {Promise<{healthy: boolean, emergencyMode: boolean,
   *   currentUsage: number, temperature: number|null,
   *   operationsInProgress: number,
   *   consecutiveFailures: {cleanup: number, stats: number}}>}
   */
  async healthCheck() {
    const safety = await this.checkGPUSafety();

    this.state.lastHealthCheck = Date.now();
    this.state.serviceHealthy = safety.safe && !this.state.emergencyMode;

    return {
      healthy: this.state.serviceHealthy,
      emergencyMode: this.state.emergencyMode,
      currentUsage: this.state.currentUsage,
      temperature: this.state.currentTemp,
      operationsInProgress: this.operationsInProgress.size,
      consecutiveFailures: {
        cleanup: this.state.consecutiveCleanupFailures,
        stats: this.state.consecutiveStatFailures,
      },
    };
  }

  /**
   * Run `checkGPUSafety` without awaiting it, logging any rejection so it
   * cannot surface as an unhandled promise rejection.
   */
  runBackgroundCheck() {
    Promise.resolve(this.checkGPUSafety()).catch((error) => {
      this.log(`❌ Background GPU safety check failed: ${error?.message ?? error}`, 'error');
    });
  }

  /**
   * (Re)start the polling timer, clearing any timer that is already running
   * so that only one is ever active.
   *
   * @param {number} intervalMs - Polling period in milliseconds.
   */
  scheduleSafetyChecks(intervalMs) {
    if (this.checkInterval) {
      clearInterval(this.checkInterval);
    }
    this.checkInterval = setInterval(() => this.runBackgroundCheck(), intervalMs);
  }

  /**
   * Start monitoring. No-op when monitoring is already active.
   */
  start() {
    if (this.state.isMonitoring) return;

    this.state.isMonitoring = true;
    this.log('🛡️ GPU-Safe VRAM Monitor starting...', 'info');

    // Initial safety check
    this.runBackgroundCheck();

    // Regular monitoring; keep the rapid cadence if an emergency is active.
    const period = this.state.emergencyMode
      ? this.config.emergencyCheckInterval
      : this.config.checkInterval;
    this.scheduleSafetyChecks(period);

    this.log('✅ GPU protection active', 'success');
  }

  /**
   * Stop monitoring.
   *
   * The polling timer is always cleared, because emergency mode can create
   * one even when `start()` was never called. The scheduler is only stopped
   * when monitoring was active.
   */
  stop() {
    if (this.checkInterval) {
      clearInterval(this.checkInterval);
      this.checkInterval = null;
    }
    if (!this.state.isMonitoring) return;

    this.state.isMonitoring = false;
    this.scheduler.stop();
    this.log('🛑 GPU-Safe VRAM Monitor stopped', 'info');
  }

  /**
   * Logging with safety context.
   *
   * @param {string} message - Text to log.
   * @param {'info'|'warn'|'error'|'success'} [level='info'] - Selects the
   *   prefix icon; unknown levels get a generic icon.
   */
  log(message, level = 'info') {
    if (!this.config.logging) return;

    const icons = {
      info: 'ℹ️',
      warn: '⚠️',
      error: '🚨',
      success: '✅',
    };
    const timestamp = new Date().toISOString();
    const prefix = icons[level] || '📝';
    const safety = this.state.emergencyMode ? '[EMERGENCY]' : '[NORMAL]';
    console.log(`[${timestamp}] ${safety} ${prefix} ${message}`);
  }
}
/**
 * Setup GPU-safe VRAM management with MCP integration.
 *
 * Creates a manager, starts it, wraps the client's `generateImage` and
 * `upscaleImage` methods (when present) with a pre-flight safety check, and
 * runs a health check every minute. The health-check timer is cleared when
 * `manager.stop()` is called.
 *
 * @param {object} comfyuiClient - Client whose methods are wrapped in place.
 * @param {object} [options] - Passed through to the manager. A supplied
 *   `onEmergencyShutdown(reason, stats)` is invoked after the built-in
 *   shutdown logging.
 * @returns {GPUSafeVRAMManager} The started manager.
 */
export function setupGPUSafeVRAMManagement(comfyuiClient, options = {}) {
  const userShutdownHandler = options.onEmergencyShutdown;

  const manager = new GPUSafeVRAMManager(comfyuiClient, {
    ...options,
    onEmergencyShutdown: (reason, stats) => {
      console.error('🚨🚨🚨 EMERGENCY GPU SHUTDOWN REQUIRED 🚨🚨🚨');
      console.error(`Reason: ${reason}`);
      console.error('Action: Manual intervention required');
      // Implement: actual shutdown commands, notifications, etc.
      // For Docker: docker-compose down
      // For system: shutdown -h now (with proper permissions)

      // Chain to the caller's handler instead of silently replacing it.
      if (typeof userShutdownHandler === 'function') {
        return userShutdownHandler(reason, stats);
      }
      return undefined;
    },
  });

  // Start protection
  manager.start();

  // Replace client[methodName] with a version that asks the manager for
  // approval first and always reports completion afterwards.
  const guardMethod = (methodName, operationType, estimateVramGB) => {
    const original = comfyuiClient[methodName];
    if (typeof original !== 'function') return;

    comfyuiClient[methodName] = async function guardedOperation(...args) {
      const safety = await manager.canSafelyExecute(operationType, {
        vramGB: estimateVramGB(...args),
      });
      if (!safety.allowed) {
        throw new Error(`GPU Safety: ${safety.message}`);
      }

      try {
        return await original.apply(this, args);
      } finally {
        // Runs on success and on failure, so the operation is never leaked.
        manager.operationComplete(safety.operationId);
      }
    };
  };

  // Hook into operations with safety checks
  guardMethod('generateImage', 'generate_image', (params) => {
    const model = params?.model;
    const isFlux = typeof model === 'string' && model.toLowerCase().includes('flux');
    return isFlux ? 11 : 2;
  });

  // Hook into upscaling with safety
  guardMethod('upscaleImage', 'upscale', () => 4);

  // Periodic health checks
  const healthTimer = setInterval(async () => {
    try {
      const health = await manager.healthCheck();
      if (!health.healthy) {
        console.error('⚠️ GPU health check failed:', health);
      }
    } catch (error) {
      console.error('⚠️ GPU health check errored:', error);
    }
  }, 60000); // Every minute

  // Stopping the manager also stops the health checks; otherwise the timer
  // would keep polling (and keep the process alive) forever.
  const stopManager = manager.stop.bind(manager);
  manager.stop = (...args) => {
    clearInterval(healthTimer);
    return stopManager(...args);
  };

  return manager;
}
// Default export of the manager class, which is also available as a named export.
export default GPUSafeVRAMManager;