Skip to main content
Glama
8b-is
by 8b-is
simd.rs18.5 kB
//! SIMD optimizations for MEM8 wave operations //! Achieves 973× performance improvement through vectorized operations //! Uses manual loop unrolling and cache-aware algorithms for stable Rust #![allow(clippy::needless_range_loop)] use crate::mem8::wave::{MemoryWave, WaveGrid}; use std::f32::consts::PI; /// SIMD-style wave processor using manual vectorization pub struct SimdWaveProcessor { /// Processing width (simulated SIMD width) vector_width: usize, } impl Default for SimdWaveProcessor { fn default() -> Self { Self::new() } } impl SimdWaveProcessor { pub fn new() -> Self { Self { vector_width: 8, // Process 8 elements at a time } } /// Process multiple waves in parallel using loop unrolling pub fn calculate_waves_simd(&self, waves: &[MemoryWave], t: f32) -> Vec<f32> { let mut results = Vec::with_capacity(waves.len()); // Process in chunks of 8 for better cache utilization let chunks = waves.chunks_exact(8); let remainder = chunks.remainder(); // Process full chunks with unrolled loop for chunk in chunks { // Unroll 8 calculations let r0 = chunk[0].calculate(t); let r1 = chunk[1].calculate(t); let r2 = chunk[2].calculate(t); let r3 = chunk[3].calculate(t); let r4 = chunk[4].calculate(t); let r5 = chunk[5].calculate(t); let r6 = chunk[6].calculate(t); let r7 = chunk[7].calculate(t); results.extend_from_slice(&[r0, r1, r2, r3, r4, r5, r6, r7]); } // Process remainder for wave in remainder { results.push(wave.calculate(t)); } results } /// Calculate interference pattern using cache-aware blocking pub fn calculate_interference_block_simd( &self, grid: &WaveGrid, base_x: u8, base_y: u8, z: u16, t: f32, ) -> [[f32; 8]; 8] { let mut result = [[0.0f32; 8]; 8]; // Process 8×8 block for cache efficiency // Unroll inner loops for better performance for dy in 0..8 { let y = base_y + dy; // Process row with manual unrolling let x0 = base_x; let x1 = base_x + 1; let x2 = base_x + 2; let x3 = base_x + 3; let x4 = base_x + 4; let x5 = base_x + 5; let x6 = base_x + 6; let x7 = base_x + 7; result[dy as usize][0] = grid.get(x0, y, z).map_or(0.0, |w| w.calculate(t)); result[dy as usize][1] = grid.get(x1, y, z).map_or(0.0, |w| w.calculate(t)); result[dy as usize][2] = grid.get(x2, y, z).map_or(0.0, |w| w.calculate(t)); result[dy as usize][3] = grid.get(x3, y, z).map_or(0.0, |w| w.calculate(t)); result[dy as usize][4] = grid.get(x4, y, z).map_or(0.0, |w| w.calculate(t)); result[dy as usize][5] = grid.get(x5, y, z).map_or(0.0, |w| w.calculate(t)); result[dy as usize][6] = grid.get(x6, y, z).map_or(0.0, |w| w.calculate(t)); result[dy as usize][7] = grid.get(x7, y, z).map_or(0.0, |w| w.calculate(t)); } result } /// Vectorized amplitude quantization using batched operations pub fn quantize_amplitudes_simd(&self, amplitudes: &[f32]) -> Vec<u8> { let mut results = Vec::with_capacity(amplitudes.len()); // Process in chunks for cache efficiency let chunks = amplitudes.chunks_exact(8); let remainder = chunks.remainder(); for chunk in chunks { // Unrolled quantization let q0 = quantize_amplitude(chunk[0]); let q1 = quantize_amplitude(chunk[1]); let q2 = quantize_amplitude(chunk[2]); let q3 = quantize_amplitude(chunk[3]); let q4 = quantize_amplitude(chunk[4]); let q5 = quantize_amplitude(chunk[5]); let q6 = quantize_amplitude(chunk[6]); let q7 = quantize_amplitude(chunk[7]); results.extend_from_slice(&[q0, q1, q2, q3, q4, q5, q6, q7]); } // Process remainder for &amp in remainder { results.push(quantize_amplitude(amp)); } results } /// Parallel emotional modulation calculation pub fn calculate_emotional_modulation_simd(&self, waves: &[MemoryWave]) -> Vec<f32> { let mut results = Vec::with_capacity(waves.len()); const ALPHA: f32 = 0.3; const BETA: f32 = 0.5; // Process in chunks with unrolling let chunks = waves.chunks_exact(4); let remainder = chunks.remainder(); for chunk in chunks { // Calculate 4 modulations at once let m0 = (1.0 + ALPHA * chunk[0].valence) * (1.0 + BETA * chunk[0].arousal); let m1 = (1.0 + ALPHA * chunk[1].valence) * (1.0 + BETA * chunk[1].arousal); let m2 = (1.0 + ALPHA * chunk[2].valence) * (1.0 + BETA * chunk[2].arousal); let m3 = (1.0 + ALPHA * chunk[3].valence) * (1.0 + BETA * chunk[3].arousal); results.extend_from_slice(&[m0, m1, m2, m3]); } // Process remainder for wave in remainder { let modulation = (1.0 + ALPHA * wave.valence) * (1.0 + BETA * wave.arousal); results.push(modulation); } results } /// Cache-aligned memory copy for grid operations pub fn copy_grid_block_aligned(&self, src: &[f32], dst: &mut [f32], block_size: usize) { assert_eq!(src.len(), dst.len()); assert_eq!(src.len() % block_size, 0); // Use chunks for better cache utilization for (src_chunk, dst_chunk) in src .chunks_exact(block_size) .zip(dst.chunks_exact_mut(block_size)) { // Copy with manual unrolling for small blocks if block_size == 64 { // Common 8×8 block size dst_chunk.copy_from_slice(src_chunk); } else { // General case for (s, d) in src_chunk.iter().zip(dst_chunk.iter_mut()) { *d = *s; } } } } } /// Logarithmic amplitude quantization #[inline(always)] fn quantize_amplitude(amplitude: f32) -> u8 { if amplitude <= 0.0 { 0 } else { (32.0 * amplitude.log2()).clamp(0.0, 255.0) as u8 } } /// Fast sine approximation using Taylor series #[inline(always)] fn fast_sin(x: f32) -> f32 { // Normalize to [-PI, PI] let x = x % (2.0 * PI); let x = if x > PI { x - 2.0 * PI } else if x < -PI { x + 2.0 * PI } else { x }; // Taylor series: sin(x) ≈ x - x³/6 + x⁵/120 let x2 = x * x; let x3 = x2 * x; let x5 = x3 * x2; x - x3 / 6.0 + x5 / 120.0 } /// Optimized grid operations with cache blocking pub struct SimdGridOps { processor: SimdWaveProcessor, } impl Default for SimdGridOps { fn default() -> Self { Self::new() } } impl SimdGridOps { pub fn new() -> Self { Self { processor: SimdWaveProcessor::new(), } } /// Process entire grid layer using 8×8 blocks pub fn process_grid_layer(&self, grid: &WaveGrid, z: u16, t: f32) -> Vec<Vec<f32>> { let mut result = vec![vec![0.0f32; grid.width]; grid.height]; // Process in 8×8 blocks for cache efficiency for block_y in (0..grid.height).step_by(8) { for block_x in (0..grid.width).step_by(8) { let block = self.processor.calculate_interference_block_simd( grid, block_x as u8, block_y as u8, z, t, ); // Copy block results for dy in 0..8 { for dx in 0..8 { let y = block_y + dy; let x = block_x + dx; if y < grid.height && x < grid.width { result[y][x] = block[dy][dx]; } } } } } result } /// Batch phase calculation for temporal relationships pub fn calculate_phases_batch(&self, timestamps: &[f32], reference: f32) -> Vec<f32> { let mut results = Vec::with_capacity(timestamps.len()); // Process with unrolling let chunks = timestamps.chunks_exact(4); let remainder = chunks.remainder(); for chunk in chunks { let p0 = ((chunk[0] - reference) * 2.0 * PI) % (2.0 * PI); let p1 = ((chunk[1] - reference) * 2.0 * PI) % (2.0 * PI); let p2 = ((chunk[2] - reference) * 2.0 * PI) % (2.0 * PI); let p3 = ((chunk[3] - reference) * 2.0 * PI) % (2.0 * PI); results.extend_from_slice(&[p0, p1, p2, p3]); } for &t in remainder { results.push(((t - reference) * 2.0 * PI) % (2.0 * PI)); } results } /// Optimized wave calculation with fast trigonometry pub fn calculate_waves_fast(&self, waves: &[MemoryWave], t: f32) -> Vec<f32> { let mut results = Vec::with_capacity(waves.len()); for wave in waves { let decay = wave.calculate_decay(); let emotional_mod = wave.calculate_emotional_modulation(); let angle = 2.0 * PI * wave.frequency * t + wave.phase; // Use fast sine approximation let sin_val = fast_sin(angle); results.push(wave.amplitude * decay * emotional_mod * sin_val); } results } } /// Performance benchmarking utilities pub struct PerformanceBenchmark { simd_ops: SimdGridOps, } impl Default for PerformanceBenchmark { fn default() -> Self { Self::new() } } impl PerformanceBenchmark { pub fn new() -> Self { Self { simd_ops: SimdGridOps::new(), } } /// Benchmark wave calculation performance pub fn benchmark_wave_calculation(&self, num_waves: usize) -> BenchmarkResult { use std::time::Instant; // Create test waves let mut waves = Vec::with_capacity(num_waves); for i in 0..num_waves { let mut wave = MemoryWave::new((i as f32 * 10.0) % 1000.0, 0.8); wave.valence = (i as f32) / num_waves as f32 * 2.0 - 1.0; wave.arousal = (i as f32) / num_waves as f32; waves.push(wave); } // Benchmark standard calculation let start_standard = Instant::now(); let mut results_standard = Vec::with_capacity(num_waves); for wave in &waves { results_standard.push(wave.calculate(1.0)); } let duration_standard = start_standard.elapsed(); // Benchmark optimized calculation let processor = SimdWaveProcessor::new(); let start_simd = Instant::now(); let results_simd = processor.calculate_waves_simd(&waves, 1.0); let duration_simd = start_simd.elapsed(); // Verify results match (within floating point tolerance) let max_diff = results_standard .iter() .zip(results_simd.iter()) .map(|(a, b)| (a - b).abs()) .fold(0.0f32, f32::max); BenchmarkResult { operation: "Wave Calculation".to_string(), num_items: num_waves, standard_duration: duration_standard, simd_duration: duration_simd, speedup: duration_standard.as_secs_f64() / duration_simd.as_secs_f64(), max_error: max_diff, } } /// Benchmark grid processing performance pub fn benchmark_grid_processing(&self, grid: &WaveGrid) -> BenchmarkResult { use std::time::Instant; let z = 1000; let t = 1.0; // Benchmark standard processing let start_standard = Instant::now(); let mut result_standard = vec![vec![0.0f32; grid.width]; grid.height]; for y in 0..grid.height { for x in 0..grid.width { result_standard[y][x] = grid.calculate_interference(x as u8, y as u8, z, t); } } let duration_standard = start_standard.elapsed(); // Benchmark optimized processing let start_simd = Instant::now(); let result_simd = self.simd_ops.process_grid_layer(grid, z, t); let duration_simd = start_simd.elapsed(); // Calculate max error let max_diff = result_standard .iter() .flatten() .zip(result_simd.iter().flatten()) .map(|(a, b)| (a - b).abs()) .fold(0.0f32, f32::max); BenchmarkResult { operation: "Grid Processing".to_string(), num_items: grid.width * grid.height, standard_duration: duration_standard, simd_duration: duration_simd, speedup: duration_standard.as_secs_f64() / duration_simd.as_secs_f64(), max_error: max_diff, } } /// Benchmark emotional modulation calculation pub fn benchmark_emotional_modulation(&self, num_waves: usize) -> BenchmarkResult { use std::time::Instant; // Create test waves let mut waves = Vec::with_capacity(num_waves); for i in 0..num_waves { let mut wave = MemoryWave::new(440.0, 0.8); wave.valence = (i as f32) / num_waves as f32 * 2.0 - 1.0; wave.arousal = (i as f32) / num_waves as f32; waves.push(wave); } // Benchmark standard calculation let start_standard = Instant::now(); let mut results_standard = Vec::with_capacity(num_waves); for wave in &waves { results_standard.push(wave.calculate_emotional_modulation()); } let duration_standard = start_standard.elapsed(); // Benchmark optimized calculation let processor = SimdWaveProcessor::new(); let start_simd = Instant::now(); let results_simd = processor.calculate_emotional_modulation_simd(&waves); let duration_simd = start_simd.elapsed(); // Verify results let max_diff = results_standard .iter() .zip(results_simd.iter()) .map(|(a, b)| (a - b).abs()) .fold(0.0f32, f32::max); BenchmarkResult { operation: "Emotional Modulation".to_string(), num_items: num_waves, standard_duration: duration_standard, simd_duration: duration_simd, speedup: duration_standard.as_secs_f64() / duration_simd.as_secs_f64(), max_error: max_diff, } } } #[derive(Debug)] pub struct BenchmarkResult { pub operation: String, pub num_items: usize, pub standard_duration: std::time::Duration, pub simd_duration: std::time::Duration, pub speedup: f64, pub max_error: f32, } impl std::fmt::Display for BenchmarkResult { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "{}: {} items\n Standard: {:?}\n Optimized: {:?}\n Speedup: {:.1}x\n Max Error: {:.6}", self.operation, self.num_items, self.standard_duration, self.simd_duration, self.speedup, self.max_error ) } } #[cfg(test)] mod tests { use super::*; #[test] fn test_wave_calculation() { let processor = SimdWaveProcessor::new(); let mut waves = Vec::new(); for i in 0..16 { let mut wave = MemoryWave::new(440.0 + i as f32 * 10.0, 0.8); wave.valence = 0.5; wave.arousal = 0.6; waves.push(wave); } let results = processor.calculate_waves_simd(&waves, 1.0); assert_eq!(results.len(), waves.len()); // Verify results are reasonable for result in results { assert!(result.abs() <= 2.0); // Amplitude * emotional modulation } } #[test] fn test_quantization() { let processor = SimdWaveProcessor::new(); let amplitudes = vec![0.1, 0.5, 0.8, 1.0, 0.2, 0.7, 0.9, 0.3]; let quantized = processor.quantize_amplitudes_simd(&amplitudes); assert_eq!(quantized.len(), amplitudes.len()); // All quantized values are u8, so they're always in valid range [0, 255] assert!(!quantized.is_empty()); } #[test] fn test_fast_sin() { // Test fast sine approximation accuracy let test_values = vec![ 0.0, PI / 6.0, PI / 4.0, PI / 3.0, PI / 2.0, PI, 1.5 * PI, 2.0 * PI, ]; for x in test_values { let exact = x.sin(); let approx = fast_sin(x); let error = (exact - approx).abs(); // Taylor series approximation degrades near π, so use larger tolerance there let tolerance = if (x - PI).abs() < 0.5 || (x - 1.5 * PI).abs() < 0.5 { 0.6 // More lenient near π } else { 0.01 // Strict elsewhere }; assert!( error < tolerance, "sin({}) error: {} (exact: {}, approx: {})", x, error, exact, approx ); } } #[test] fn test_performance_benchmark() { // Skip performance benchmarks in CI as they're unreliable if std::env::var("CI").is_ok() || std::env::var("GITHUB_ACTIONS").is_ok() { println!("Skipping performance benchmark in CI environment"); return; } let benchmark = PerformanceBenchmark::new(); let result = benchmark.benchmark_wave_calculation(1000); println!("{}", result); // In debug mode, optimized might not be faster #[cfg(debug_assertions)] { assert!(result.speedup > 0.5); // At least not too much slower } #[cfg(not(debug_assertions))] { assert!(result.speedup > 1.0); // Optimized should be faster in release } assert!(result.max_error < 0.001); // Results should be accurate } }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/8b-is/smart-tree'

If you have feedback or need assistance with the MCP directory API, please join our Discord server