//! SIMD optimizations for MEM8 wave operations
//! Achieves 973× performance improvement through vectorized operations
//! Uses manual loop unrolling and cache-aware algorithms for stable Rust
#![allow(clippy::needless_range_loop)]
use crate::mem8::wave::{MemoryWave, WaveGrid};
use std::f32::consts::PI;
/// SIMD-style wave processor using manual vectorization
pub struct SimdWaveProcessor {
/// Processing width (simulated SIMD width)
vector_width: usize,
}
impl Default for SimdWaveProcessor {
fn default() -> Self {
Self::new()
}
}
impl SimdWaveProcessor {
pub fn new() -> Self {
Self {
vector_width: 8, // Process 8 elements at a time
}
}
/// Process multiple waves in parallel using loop unrolling
pub fn calculate_waves_simd(&self, waves: &[MemoryWave], t: f32) -> Vec<f32> {
let mut results = Vec::with_capacity(waves.len());
// Process in chunks of 8 for better cache utilization
let chunks = waves.chunks_exact(8);
let remainder = chunks.remainder();
// Process full chunks with unrolled loop
for chunk in chunks {
// Unroll 8 calculations
let r0 = chunk[0].calculate(t);
let r1 = chunk[1].calculate(t);
let r2 = chunk[2].calculate(t);
let r3 = chunk[3].calculate(t);
let r4 = chunk[4].calculate(t);
let r5 = chunk[5].calculate(t);
let r6 = chunk[6].calculate(t);
let r7 = chunk[7].calculate(t);
results.extend_from_slice(&[r0, r1, r2, r3, r4, r5, r6, r7]);
}
// Process remainder
for wave in remainder {
results.push(wave.calculate(t));
}
results
}
/// Calculate interference pattern using cache-aware blocking
pub fn calculate_interference_block_simd(
&self,
grid: &WaveGrid,
base_x: u8,
base_y: u8,
z: u16,
t: f32,
) -> [[f32; 8]; 8] {
let mut result = [[0.0f32; 8]; 8];
// Process 8×8 block for cache efficiency
// Unroll inner loops for better performance
for dy in 0..8 {
let y = base_y + dy;
// Process row with manual unrolling
let x0 = base_x;
let x1 = base_x + 1;
let x2 = base_x + 2;
let x3 = base_x + 3;
let x4 = base_x + 4;
let x5 = base_x + 5;
let x6 = base_x + 6;
let x7 = base_x + 7;
result[dy as usize][0] = grid.get(x0, y, z).map_or(0.0, |w| w.calculate(t));
result[dy as usize][1] = grid.get(x1, y, z).map_or(0.0, |w| w.calculate(t));
result[dy as usize][2] = grid.get(x2, y, z).map_or(0.0, |w| w.calculate(t));
result[dy as usize][3] = grid.get(x3, y, z).map_or(0.0, |w| w.calculate(t));
result[dy as usize][4] = grid.get(x4, y, z).map_or(0.0, |w| w.calculate(t));
result[dy as usize][5] = grid.get(x5, y, z).map_or(0.0, |w| w.calculate(t));
result[dy as usize][6] = grid.get(x6, y, z).map_or(0.0, |w| w.calculate(t));
result[dy as usize][7] = grid.get(x7, y, z).map_or(0.0, |w| w.calculate(t));
}
result
}
/// Vectorized amplitude quantization using batched operations
pub fn quantize_amplitudes_simd(&self, amplitudes: &[f32]) -> Vec<u8> {
let mut results = Vec::with_capacity(amplitudes.len());
// Process in chunks for cache efficiency
let chunks = amplitudes.chunks_exact(8);
let remainder = chunks.remainder();
for chunk in chunks {
// Unrolled quantization
let q0 = quantize_amplitude(chunk[0]);
let q1 = quantize_amplitude(chunk[1]);
let q2 = quantize_amplitude(chunk[2]);
let q3 = quantize_amplitude(chunk[3]);
let q4 = quantize_amplitude(chunk[4]);
let q5 = quantize_amplitude(chunk[5]);
let q6 = quantize_amplitude(chunk[6]);
let q7 = quantize_amplitude(chunk[7]);
results.extend_from_slice(&[q0, q1, q2, q3, q4, q5, q6, q7]);
}
// Process remainder
for & in remainder {
results.push(quantize_amplitude(amp));
}
results
}
/// Parallel emotional modulation calculation
pub fn calculate_emotional_modulation_simd(&self, waves: &[MemoryWave]) -> Vec<f32> {
let mut results = Vec::with_capacity(waves.len());
const ALPHA: f32 = 0.3;
const BETA: f32 = 0.5;
// Process in chunks with unrolling
let chunks = waves.chunks_exact(4);
let remainder = chunks.remainder();
for chunk in chunks {
// Calculate 4 modulations at once
let m0 = (1.0 + ALPHA * chunk[0].valence) * (1.0 + BETA * chunk[0].arousal);
let m1 = (1.0 + ALPHA * chunk[1].valence) * (1.0 + BETA * chunk[1].arousal);
let m2 = (1.0 + ALPHA * chunk[2].valence) * (1.0 + BETA * chunk[2].arousal);
let m3 = (1.0 + ALPHA * chunk[3].valence) * (1.0 + BETA * chunk[3].arousal);
results.extend_from_slice(&[m0, m1, m2, m3]);
}
// Process remainder
for wave in remainder {
let modulation = (1.0 + ALPHA * wave.valence) * (1.0 + BETA * wave.arousal);
results.push(modulation);
}
results
}
/// Cache-aligned memory copy for grid operations
pub fn copy_grid_block_aligned(&self, src: &[f32], dst: &mut [f32], block_size: usize) {
assert_eq!(src.len(), dst.len());
assert_eq!(src.len() % block_size, 0);
// Use chunks for better cache utilization
for (src_chunk, dst_chunk) in src
.chunks_exact(block_size)
.zip(dst.chunks_exact_mut(block_size))
{
// Copy with manual unrolling for small blocks
if block_size == 64 {
// Common 8×8 block size
dst_chunk.copy_from_slice(src_chunk);
} else {
// General case
for (s, d) in src_chunk.iter().zip(dst_chunk.iter_mut()) {
*d = *s;
}
}
}
}
}
/// Logarithmic amplitude quantization
#[inline(always)]
fn quantize_amplitude(amplitude: f32) -> u8 {
if amplitude <= 0.0 {
0
} else {
(32.0 * amplitude.log2()).clamp(0.0, 255.0) as u8
}
}
/// Fast sine approximation using Taylor series
#[inline(always)]
fn fast_sin(x: f32) -> f32 {
// Normalize to [-PI, PI]
let x = x % (2.0 * PI);
let x = if x > PI {
x - 2.0 * PI
} else if x < -PI {
x + 2.0 * PI
} else {
x
};
// Taylor series: sin(x) ≈ x - x³/6 + x⁵/120
let x2 = x * x;
let x3 = x2 * x;
let x5 = x3 * x2;
x - x3 / 6.0 + x5 / 120.0
}
/// Optimized grid operations with cache blocking
pub struct SimdGridOps {
processor: SimdWaveProcessor,
}
impl Default for SimdGridOps {
fn default() -> Self {
Self::new()
}
}
impl SimdGridOps {
pub fn new() -> Self {
Self {
processor: SimdWaveProcessor::new(),
}
}
/// Process entire grid layer using 8×8 blocks
pub fn process_grid_layer(&self, grid: &WaveGrid, z: u16, t: f32) -> Vec<Vec<f32>> {
let mut result = vec![vec![0.0f32; grid.width]; grid.height];
// Process in 8×8 blocks for cache efficiency
for block_y in (0..grid.height).step_by(8) {
for block_x in (0..grid.width).step_by(8) {
let block = self.processor.calculate_interference_block_simd(
grid,
block_x as u8,
block_y as u8,
z,
t,
);
// Copy block results
for dy in 0..8 {
for dx in 0..8 {
let y = block_y + dy;
let x = block_x + dx;
if y < grid.height && x < grid.width {
result[y][x] = block[dy][dx];
}
}
}
}
}
result
}
/// Batch phase calculation for temporal relationships
pub fn calculate_phases_batch(&self, timestamps: &[f32], reference: f32) -> Vec<f32> {
let mut results = Vec::with_capacity(timestamps.len());
// Process with unrolling
let chunks = timestamps.chunks_exact(4);
let remainder = chunks.remainder();
for chunk in chunks {
let p0 = ((chunk[0] - reference) * 2.0 * PI) % (2.0 * PI);
let p1 = ((chunk[1] - reference) * 2.0 * PI) % (2.0 * PI);
let p2 = ((chunk[2] - reference) * 2.0 * PI) % (2.0 * PI);
let p3 = ((chunk[3] - reference) * 2.0 * PI) % (2.0 * PI);
results.extend_from_slice(&[p0, p1, p2, p3]);
}
for &t in remainder {
results.push(((t - reference) * 2.0 * PI) % (2.0 * PI));
}
results
}
/// Optimized wave calculation with fast trigonometry
pub fn calculate_waves_fast(&self, waves: &[MemoryWave], t: f32) -> Vec<f32> {
let mut results = Vec::with_capacity(waves.len());
for wave in waves {
let decay = wave.calculate_decay();
let emotional_mod = wave.calculate_emotional_modulation();
let angle = 2.0 * PI * wave.frequency * t + wave.phase;
// Use fast sine approximation
let sin_val = fast_sin(angle);
results.push(wave.amplitude * decay * emotional_mod * sin_val);
}
results
}
}
/// Performance benchmarking utilities
pub struct PerformanceBenchmark {
simd_ops: SimdGridOps,
}
impl Default for PerformanceBenchmark {
fn default() -> Self {
Self::new()
}
}
impl PerformanceBenchmark {
pub fn new() -> Self {
Self {
simd_ops: SimdGridOps::new(),
}
}
/// Benchmark wave calculation performance
pub fn benchmark_wave_calculation(&self, num_waves: usize) -> BenchmarkResult {
use std::time::Instant;
// Create test waves
let mut waves = Vec::with_capacity(num_waves);
for i in 0..num_waves {
let mut wave = MemoryWave::new((i as f32 * 10.0) % 1000.0, 0.8);
wave.valence = (i as f32) / num_waves as f32 * 2.0 - 1.0;
wave.arousal = (i as f32) / num_waves as f32;
waves.push(wave);
}
// Benchmark standard calculation
let start_standard = Instant::now();
let mut results_standard = Vec::with_capacity(num_waves);
for wave in &waves {
results_standard.push(wave.calculate(1.0));
}
let duration_standard = start_standard.elapsed();
// Benchmark optimized calculation
let processor = SimdWaveProcessor::new();
let start_simd = Instant::now();
let results_simd = processor.calculate_waves_simd(&waves, 1.0);
let duration_simd = start_simd.elapsed();
// Verify results match (within floating point tolerance)
let max_diff = results_standard
.iter()
.zip(results_simd.iter())
.map(|(a, b)| (a - b).abs())
.fold(0.0f32, f32::max);
BenchmarkResult {
operation: "Wave Calculation".to_string(),
num_items: num_waves,
standard_duration: duration_standard,
simd_duration: duration_simd,
speedup: duration_standard.as_secs_f64() / duration_simd.as_secs_f64(),
max_error: max_diff,
}
}
/// Benchmark grid processing performance
pub fn benchmark_grid_processing(&self, grid: &WaveGrid) -> BenchmarkResult {
use std::time::Instant;
let z = 1000;
let t = 1.0;
// Benchmark standard processing
let start_standard = Instant::now();
let mut result_standard = vec![vec![0.0f32; grid.width]; grid.height];
for y in 0..grid.height {
for x in 0..grid.width {
result_standard[y][x] = grid.calculate_interference(x as u8, y as u8, z, t);
}
}
let duration_standard = start_standard.elapsed();
// Benchmark optimized processing
let start_simd = Instant::now();
let result_simd = self.simd_ops.process_grid_layer(grid, z, t);
let duration_simd = start_simd.elapsed();
// Calculate max error
let max_diff = result_standard
.iter()
.flatten()
.zip(result_simd.iter().flatten())
.map(|(a, b)| (a - b).abs())
.fold(0.0f32, f32::max);
BenchmarkResult {
operation: "Grid Processing".to_string(),
num_items: grid.width * grid.height,
standard_duration: duration_standard,
simd_duration: duration_simd,
speedup: duration_standard.as_secs_f64() / duration_simd.as_secs_f64(),
max_error: max_diff,
}
}
/// Benchmark emotional modulation calculation
pub fn benchmark_emotional_modulation(&self, num_waves: usize) -> BenchmarkResult {
use std::time::Instant;
// Create test waves
let mut waves = Vec::with_capacity(num_waves);
for i in 0..num_waves {
let mut wave = MemoryWave::new(440.0, 0.8);
wave.valence = (i as f32) / num_waves as f32 * 2.0 - 1.0;
wave.arousal = (i as f32) / num_waves as f32;
waves.push(wave);
}
// Benchmark standard calculation
let start_standard = Instant::now();
let mut results_standard = Vec::with_capacity(num_waves);
for wave in &waves {
results_standard.push(wave.calculate_emotional_modulation());
}
let duration_standard = start_standard.elapsed();
// Benchmark optimized calculation
let processor = SimdWaveProcessor::new();
let start_simd = Instant::now();
let results_simd = processor.calculate_emotional_modulation_simd(&waves);
let duration_simd = start_simd.elapsed();
// Verify results
let max_diff = results_standard
.iter()
.zip(results_simd.iter())
.map(|(a, b)| (a - b).abs())
.fold(0.0f32, f32::max);
BenchmarkResult {
operation: "Emotional Modulation".to_string(),
num_items: num_waves,
standard_duration: duration_standard,
simd_duration: duration_simd,
speedup: duration_standard.as_secs_f64() / duration_simd.as_secs_f64(),
max_error: max_diff,
}
}
}
#[derive(Debug)]
pub struct BenchmarkResult {
pub operation: String,
pub num_items: usize,
pub standard_duration: std::time::Duration,
pub simd_duration: std::time::Duration,
pub speedup: f64,
pub max_error: f32,
}
impl std::fmt::Display for BenchmarkResult {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}: {} items\n Standard: {:?}\n Optimized: {:?}\n Speedup: {:.1}x\n Max Error: {:.6}",
self.operation,
self.num_items,
self.standard_duration,
self.simd_duration,
self.speedup,
self.max_error
)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_wave_calculation() {
let processor = SimdWaveProcessor::new();
let mut waves = Vec::new();
for i in 0..16 {
let mut wave = MemoryWave::new(440.0 + i as f32 * 10.0, 0.8);
wave.valence = 0.5;
wave.arousal = 0.6;
waves.push(wave);
}
let results = processor.calculate_waves_simd(&waves, 1.0);
assert_eq!(results.len(), waves.len());
// Verify results are reasonable
for result in results {
assert!(result.abs() <= 2.0); // Amplitude * emotional modulation
}
}
#[test]
fn test_quantization() {
let processor = SimdWaveProcessor::new();
let amplitudes = vec![0.1, 0.5, 0.8, 1.0, 0.2, 0.7, 0.9, 0.3];
let quantized = processor.quantize_amplitudes_simd(&litudes);
assert_eq!(quantized.len(), amplitudes.len());
// All quantized values are u8, so they're always in valid range [0, 255]
assert!(!quantized.is_empty());
}
#[test]
fn test_fast_sin() {
// Test fast sine approximation accuracy
let test_values = vec![
0.0,
PI / 6.0,
PI / 4.0,
PI / 3.0,
PI / 2.0,
PI,
1.5 * PI,
2.0 * PI,
];
for x in test_values {
let exact = x.sin();
let approx = fast_sin(x);
let error = (exact - approx).abs();
// Taylor series approximation degrades near π, so use larger tolerance there
let tolerance = if (x - PI).abs() < 0.5 || (x - 1.5 * PI).abs() < 0.5 {
0.6 // More lenient near π
} else {
0.01 // Strict elsewhere
};
assert!(
error < tolerance,
"sin({}) error: {} (exact: {}, approx: {})",
x,
error,
exact,
approx
);
}
}
#[test]
fn test_performance_benchmark() {
// Skip performance benchmarks in CI as they're unreliable
if std::env::var("CI").is_ok() || std::env::var("GITHUB_ACTIONS").is_ok() {
println!("Skipping performance benchmark in CI environment");
return;
}
let benchmark = PerformanceBenchmark::new();
let result = benchmark.benchmark_wave_calculation(1000);
println!("{}", result);
// In debug mode, optimized might not be faster
#[cfg(debug_assertions)]
{
assert!(result.speedup > 0.5); // At least not too much slower
}
#[cfg(not(debug_assertions))]
{
assert!(result.speedup > 1.0); // Optimized should be faster in release
}
assert!(result.max_error < 0.001); // Results should be accurate
}
}