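//! Quantization and optimization utilities for embedding vectors: per-vector
//! linear, asymmetric, and symmetric quantization; batch (de)quantization;
//! rayon-based parallel quantization; and a simple end-to-end pipeline that
//! reports memory reduction and accuracy preservation.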
use codegraph_core::{CodeGraphError, Result};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::Instant;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum QuantizationMethod {
Linear,
Asymmetric,
Symmetric,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuantizationConfig {
pub bits: u8,
pub method: QuantizationMethod,
pub calibration_samples: usize,
pub symmetric: bool,
pub preserve_accuracy: bool,
}
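/// Quantized vectors stored row-major as `u8` (each `i8` level shifted by
/// +128), with per-vector `scales` and `zero_points` and a
/// `[batch, dimension]` shape.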
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QuantizedBatch {
pub data: Vec<u8>,
pub scales: Vec<f32>,
pub zero_points: Vec<i32>,
pub shape: Vec<usize>,
}
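/// Tuning knobs for parallel quantization. `num_threads == 0` falls back to
/// rayon's default; `enable_simd` and `memory_prefetch` are not consumed in
/// this module.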
#[derive(Debug, Clone)]
pub struct ParallelConfig {
pub num_threads: usize,
pub batch_size: usize,
pub enable_simd: bool,
pub memory_prefetch: bool,
}
#[derive(Debug, Clone)]
pub struct OptimizationPipelineConfig {
pub quantization: QuantizationConfig,
pub memory_optimization: bool,
pub gpu_acceleration: bool,
pub parallel_processing: bool,
pub compression_enabled: bool,
pub target_accuracy: f32,
pub target_memory_reduction: f32,
}
#[derive(Debug, Clone)]
pub struct OptimizationResult {
pub memory_reduction: f32,
pub accuracy_preservation: f32,
pub inference_speedup: f32,
pub optimized_data: Vec<u8>,
pub metadata: HashMap<String, String>,
}
impl OptimizationResult {
    pub fn search_optimized(&self, _query: &[f32], limit: usize) -> Result<Vec<usize>> {
        // Placeholder: a real implementation would decode `optimized_data`
        // and run the search over the dequantized vectors.
        Ok((0..limit).collect())
}
}
pub struct ModelOptimizer {
dimension: usize,
calibration_data: Option<Vec<Vec<f32>>>,
}
impl ModelOptimizer {
pub fn new(dimension: usize) -> Self {
Self {
dimension,
calibration_data: None,
}
}
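    /// Quantizes a single vector, dispatching on `config.method`.
    ///
    /// A minimal usage sketch (the field values are illustrative, not
    /// recommended defaults):
    /// ```ignore
    /// let optimizer = ModelOptimizer::new(4);
    /// let config = QuantizationConfig {
    ///     bits: 8,
    ///     method: QuantizationMethod::Symmetric,
    ///     calibration_samples: 0,
    ///     symmetric: true,
    ///     preserve_accuracy: true,
    /// };
    /// let quantized = optimizer.quantize_vector(&[0.5, -0.25, 1.0, 0.0], &config)?;
    /// ```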
pub fn quantize_vector(&self, vector: &[f32], config: &QuantizationConfig) -> Result<Vec<i8>> {
if vector.len() != self.dimension {
return Err(CodeGraphError::Vector(format!(
"Vector dimension {} doesn't match expected {}",
vector.len(),
self.dimension
)));
}
match config.method {
QuantizationMethod::Linear => self.quantize_linear(vector, config),
QuantizationMethod::Asymmetric => self.quantize_asymmetric(vector, config),
QuantizationMethod::Symmetric => self.quantize_symmetric(vector, config),
}
}
    fn quantize_linear(&self, vector: &[f32], config: &QuantizationConfig) -> Result<Vec<i8>> {
        let min_val = vector.iter().fold(f32::INFINITY, |a, &b| a.min(b));
        let max_val = vector.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
        let range = max_val - min_val;
        if range == 0.0 {
            return Ok(vec![0; vector.len()]);
        }
        // Map [min_val, max_val] onto the unsigned levels [0, 2^bits - 1],
        // then shift by 2^(bits - 1) into the signed range so the values fit
        // in i8 without saturating the upper half of the spectrum.
        let scale = range / ((1 << config.bits) - 1) as f32;
        let half = 1i32 << (config.bits - 1);
        let quantized: Vec<i8> = vector
            .iter()
            .map(|&val| {
                let level = ((val - min_val) / scale).round() as i32 - half;
                level.clamp(-half, half - 1) as i8
            })
            .collect();
        Ok(quantized)
    }
fn quantize_asymmetric(&self, vector: &[f32], config: &QuantizationConfig) -> Result<Vec<i8>> {
let min_val = vector.iter().fold(f32::INFINITY, |a, &b| a.min(b));
let max_val = vector.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
let range = max_val - min_val;
if range == 0.0 {
return Ok(vec![0; vector.len()]);
}
        let q_min = -(1i32 << (config.bits - 1));
        let q_max = (1i32 << (config.bits - 1)) - 1;
        let scale = range / (q_max - q_min) as f32;
        // Affine mapping: q = round(val / scale + zero_point), with the zero
        // point rounded so min_val maps to q_min and real zero lands on an
        // integer level.
        let zero_point = (q_min as f32 - min_val / scale).round();
let quantized: Vec<i8> = vector
.iter()
.map(|&val| {
let q_val = (val / scale + zero_point).round() as i32;
q_val.clamp(q_min, q_max) as i8
})
.collect();
Ok(quantized)
}
fn quantize_symmetric(&self, vector: &[f32], config: &QuantizationConfig) -> Result<Vec<i8>> {
let abs_max = vector.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
if abs_max == 0.0 {
return Ok(vec![0; vector.len()]);
}
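        // Symmetric mapping: zero maps exactly to zero and the scale spans
        // [-abs_max, abs_max] over [-q_max, q_max]; the extra level at
        // -2^(bits - 1) is deliberately left unused.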
let q_max = (1i32 << (config.bits - 1)) - 1;
let scale = abs_max / q_max as f32;
let quantized: Vec<i8> = vector
.iter()
.map(|&val| {
let q_val = (val / scale).round() as i32;
q_val.clamp(-q_max, q_max) as i8
})
.collect();
Ok(quantized)
}
pub fn dequantize_vector(
&self,
quantized: &[i8],
_config: &QuantizationConfig,
) -> Result<Vec<f32>> {
        // Simplified dequantization: a faithful inverse needs the scale and
        // zero point captured at quantization time (QuantizedBatch stores
        // them per vector; this standalone path does not).
        let scale = 0.1f32; // Placeholder; should come from the quantization step.
let dequantized: Vec<f32> = quantized
.iter()
.map(|&q_val| q_val as f32 * scale)
.collect();
Ok(dequantized)
}
pub fn quantize_batch(
&self,
vectors: &[&[f32]],
config: &QuantizationConfig,
) -> Result<QuantizedBatch> {
if vectors.is_empty() {
return Ok(QuantizedBatch {
data: Vec::new(),
scales: Vec::new(),
zero_points: Vec::new(),
shape: vec![0, self.dimension],
});
}
let mut all_data = Vec::new();
let mut scales = Vec::new();
let mut zero_points = Vec::new();
for vector in vectors {
if vector.len() != self.dimension {
return Err(CodeGraphError::Vector(format!(
"Vector dimension {} doesn't match expected {}",
vector.len(),
self.dimension
)));
}
            let quantized = self.quantize_vector(vector, config)?;
            // Record per-vector affine parameters so dequantize_batch can invert
            // the mapping (linear scheme; the round trip is faithful for
            // QuantizationMethod::Linear).
            let min_val = vector.iter().fold(f32::INFINITY, |a, &b| a.min(b));
            let max_val = vector.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
            let range = max_val - min_val;
            let half = 1i32 << (config.bits - 1);
            let (scale, zero_point) = if range == 0.0 {
                // Constant vector: fold the constant into the scale so the
                // fixed dequantized level (`half`) reproduces it exactly.
                (min_val, 1 - half)
            } else {
                let scale = range / ((1 << config.bits) - 1) as f32;
                (scale, (min_val / scale).round() as i32)
            };
            scales.push(scale);
            zero_points.push(zero_point);
            // Shift each i8 level by +128 so it fits in u8 storage.
            let data_u8: Vec<u8> = quantized.iter().map(|&x| (x as i16 + 128) as u8).collect();
            all_data.extend(data_u8);
}
Ok(QuantizedBatch {
data: all_data,
scales,
zero_points,
shape: vec![vectors.len(), self.dimension],
})
}
    pub fn dequantize_batch(
        &self,
        batch: &QuantizedBatch,
        config: &QuantizationConfig,
    ) -> Result<Vec<Vec<f32>>> {
        if batch.shape.len() != 2 {
            return Err(CodeGraphError::Vector(
                "QuantizedBatch shape must be [batch, dimension]".to_string(),
            ));
        }
        let batch_size = batch.shape[0];
        let dimension = batch.shape[1];
        let half = 1i32 << (config.bits - 1);
        let mut result = Vec::with_capacity(batch_size);
        for i in 0..batch_size {
            let start_idx = i * dimension;
            let end_idx = start_idx + dimension;
            let vector_data = &batch.data[start_idx..end_idx];
            let scale = batch.scales[i];
            let zero_point = batch.zero_points[i];
            let dequantized: Vec<f32> = vector_data
                .iter()
                .map(|&byte| {
                    // Undo the +128 storage shift, recover the unsigned level,
                    // then apply the per-vector affine parameters.
                    let level = (byte as i32) - 128 + half;
                    (level + zero_point) as f32 * scale
                })
                .collect();
            result.push(dequantized);
        }
        Ok(result)
}
pub fn quantize_parallel(
&self,
vectors: &[Vec<f32>],
config: &QuantizationConfig,
parallel_config: &ParallelConfig,
) -> Result<Vec<Vec<i8>>> {
        // Use a local thread pool: the global rayon pool can only be
        // initialized once per process, so build_global() would fail on any
        // call after the first.
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(parallel_config.num_threads)
            .build()
            .map_err(|e| CodeGraphError::Vector(e.to_string()))?;
        // Guard against a zero batch size, which would make par_chunks panic.
        let batch_size = parallel_config.batch_size.max(1);
        pool.install(|| {
            vectors
                .par_chunks(batch_size)
                .map(|chunk| {
                    chunk
                        .iter()
                        .map(|vector| self.quantize_vector(vector, config))
                        .collect::<Result<Vec<_>>>()
                })
                .collect::<Result<Vec<_>>>()
                .map(|batches| batches.into_iter().flatten().collect())
        })
}
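    /// Brute-force top-`limit` nearest neighbors by cosine distance; serves
    /// as the unoptimized reference for comparing search quality.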
pub fn search_baseline(
&self,
query: &[f32],
vectors: &[Vec<f32>],
limit: usize,
) -> Result<Vec<usize>> {
if vectors.is_empty() {
return Ok(Vec::new());
}
let mut distances: Vec<(usize, f32)> = vectors
.iter()
.enumerate()
.map(|(idx, vector)| {
let distance = self.cosine_distance(query, vector);
(idx, distance)
})
.collect();
distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
Ok(distances
.into_iter()
.take(limit)
.map(|(idx, _)| idx)
.collect())
}
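    /// Cosine distance (1 - cosine similarity). Mismatched lengths and
    /// zero-norm inputs return infinity so they sort after valid results.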
fn cosine_distance(&self, a: &[f32], b: &[f32]) -> f32 {
if a.len() != b.len() {
return f32::INFINITY;
}
let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm_a == 0.0 || norm_b == 0.0 {
return f32::INFINITY;
}
1.0 - (dot_product / (norm_a * norm_b))
}
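    /// End-to-end pipeline: quantize the batch, compute the memory reduction,
    /// and validate accuracy by dequantizing and comparing to the originals.
    /// Declared async for API compatibility; the current body has no await
    /// points.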
pub async fn optimize_full_pipeline(
&self,
vectors: &[Vec<f32>],
config: &OptimizationPipelineConfig,
) -> Result<OptimizationResult> {
let start_time = Instant::now();
// Step 1: Quantization
let vector_refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
let quantized_batch = self.quantize_batch(&vector_refs, &config.quantization)?;
// Step 2: Calculate memory reduction
let original_size = vectors.len() * self.dimension * std::mem::size_of::<f32>();
let quantized_size = quantized_batch.data.len() * std::mem::size_of::<u8>()
+ quantized_batch.scales.len() * std::mem::size_of::<f32>()
+ quantized_batch.zero_points.len() * std::mem::size_of::<i32>();
let memory_reduction = 1.0 - (quantized_size as f32 / original_size as f32);
// Step 3: Validate accuracy
let dequantized_batch = self.dequantize_batch(&quantized_batch, &config.quantization)?;
let accuracy_preservation =
self.calculate_accuracy_preservation(vectors, &dequantized_batch);
        // Step 4: Estimated inference speedup (fixed heuristic, not measured)
        let inference_speedup = if config.gpu_acceleration { 2.5 } else { 1.2 };
let mut metadata = HashMap::new();
metadata.insert(
"optimization_time_ms".to_string(),
start_time.elapsed().as_millis().to_string(),
);
metadata.insert(
"quantization_bits".to_string(),
config.quantization.bits.to_string(),
);
metadata.insert(
"parallel_processing".to_string(),
config.parallel_processing.to_string(),
);
Ok(OptimizationResult {
memory_reduction,
accuracy_preservation,
inference_speedup,
optimized_data: quantized_batch.data,
metadata,
})
}
fn calculate_accuracy_preservation(
&self,
original: &[Vec<f32>],
dequantized: &[Vec<f32>],
) -> f32 {
if original.len() != dequantized.len() {
return 0.0;
}
let mut total_error = 0.0;
let mut total_elements = 0;
for (orig, deq) in original.iter().zip(dequantized.iter()) {
if orig.len() != deq.len() {
continue;
}
for (&o, &d) in orig.iter().zip(deq.iter()) {
total_error += (o - d).abs();
total_elements += 1;
}
}
if total_elements == 0 {
return 0.0;
}
        let mean_absolute_error = total_error / total_elements as f32;
        // Heuristic normalization: treat an MAE of 1.0 as complete accuracy
        // loss, which is reasonable for embeddings roughly in [-1, 1].
        let max_error = 1.0;
        (1.0 - (mean_absolute_error / max_error)).max(0.0)
}
}
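#[cfg(test)]
mod tests {
    use super::*;

    // A minimal round-trip sketch: the tolerance below is an assumption for
    // 8-bit linear quantization of small-range vectors, not a documented
    // guarantee of the module.
    fn linear_config() -> QuantizationConfig {
        QuantizationConfig {
            bits: 8,
            method: QuantizationMethod::Linear,
            calibration_samples: 0,
            symmetric: false,
            preserve_accuracy: true,
        }
    }

    #[test]
    fn batch_round_trip_is_approximately_lossless() {
        let optimizer = ModelOptimizer::new(4);
        let vectors: Vec<Vec<f32>> =
            vec![vec![0.1, -0.5, 0.9, 0.0], vec![1.0, -1.0, 0.25, -0.25]];
        let refs: Vec<&[f32]> = vectors.iter().map(|v| v.as_slice()).collect();
        let config = linear_config();
        let batch = optimizer.quantize_batch(&refs, &config).unwrap();
        let restored = optimizer.dequantize_batch(&batch, &config).unwrap();
        for (orig, deq) in vectors.iter().zip(restored.iter()) {
            for (&o, &d) in orig.iter().zip(deq.iter()) {
                // 8-bit linear quantization error is bounded by about one scale step.
                assert!((o - d).abs() < 0.05, "round-trip error too large: {o} vs {d}");
            }
        }
    }
}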