M.I.M.I.R - Multi-agent Intelligent Memory & Insight Repository

by orneryd
cuda_bridge.go (18.2 kB)
//go:build cuda && (linux || windows)
// +build cuda
// +build linux windows

// Package cuda provides NVIDIA GPU acceleration using CUDA and cuBLAS.
package cuda

/*
#cgo linux CFLAGS: -I/usr/local/cuda/include
#cgo linux LDFLAGS: -L/usr/local/cuda/lib64 -lcudart -lcublas -lcuda
#cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/include"
#cgo windows LDFLAGS: -L${SRCDIR}/../../../lib/cuda -lcudart -lcublas -lcuda

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
#include <stdlib.h>
#include <string.h>

// Error handling
static char cuda_last_error[256] = {0};

void cuda_set_error(const char* msg) {
    strncpy(cuda_last_error, msg, sizeof(cuda_last_error) - 1);
}

const char* cuda_get_last_error() {
    return cuda_last_error;
}

void cuda_clear_error() {
    cuda_last_error[0] = 0;
}

// Device management
typedef struct {
    int device_id;
    cublasHandle_t cublas_handle;
    cudaStream_t stream;
} CudaDevice;

int cuda_get_device_count() {
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        cuda_set_error(cudaGetErrorString(err));
        return -1;
    }
    return count;
}

int cuda_is_available() {
    return cuda_get_device_count() > 0 ? 1 : 0;
}

CudaDevice* cuda_create_device(int device_id) {
    cudaError_t err = cudaSetDevice(device_id);
    if (err != cudaSuccess) {
        cuda_set_error(cudaGetErrorString(err));
        return NULL;
    }

    CudaDevice* dev = (CudaDevice*)malloc(sizeof(CudaDevice));
    if (!dev) {
        cuda_set_error("Failed to allocate device struct");
        return NULL;
    }
    dev->device_id = device_id;

    // Create cuBLAS handle
    cublasStatus_t status = cublasCreate(&dev->cublas_handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        cuda_set_error("Failed to create cuBLAS handle");
        free(dev);
        return NULL;
    }

    // Create CUDA stream
    err = cudaStreamCreate(&dev->stream);
    if (err != cudaSuccess) {
        cuda_set_error(cudaGetErrorString(err));
        cublasDestroy(dev->cublas_handle);
        free(dev);
        return NULL;
    }

    // Associate stream with cuBLAS
    cublasSetStream(dev->cublas_handle, dev->stream);

    return dev;
}

void cuda_release_device(CudaDevice* dev) {
    if (dev) {
        if (dev->stream) cudaStreamDestroy(dev->stream);
        if (dev->cublas_handle) cublasDestroy(dev->cublas_handle);
        free(dev);
    }
}

const char* cuda_device_name(int device_id) {
    static char name[256];
    struct cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, device_id);
    if (err != cudaSuccess) {
        return "Unknown";
    }
    strncpy(name, prop.name, sizeof(name) - 1);
    return name;
}

size_t cuda_device_memory(int device_id) {
    struct cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, device_id);
    if (err != cudaSuccess) {
        return 0;
    }
    return prop.totalGlobalMem;
}

int cuda_device_compute_capability(int device_id) {
    struct cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, device_id);
    if (err != cudaSuccess) {
        return 0;
    }
    return prop.major * 10 + prop.minor;
}

// Buffer management
typedef struct {
    float* data;
    size_t size;
    int memory_type; // 0 = device, 1 = host pinned
} CudaBuffer;

CudaBuffer* cuda_create_buffer(CudaDevice* dev, float* host_data, size_t count, int memory_type) {
    CudaBuffer* buf = (CudaBuffer*)malloc(sizeof(CudaBuffer));
    if (!buf) {
        cuda_set_error("Failed to allocate buffer struct");
        return NULL;
    }
    buf->size = count * sizeof(float);
    buf->memory_type = memory_type;

    cudaError_t err;
    if (memory_type == 0) {
        // Device memory
        err = cudaMalloc((void**)&buf->data, buf->size);
        if (err != cudaSuccess) {
            cuda_set_error(cudaGetErrorString(err));
            free(buf);
            return NULL;
        }
        if (host_data) {
            err = cudaMemcpy(buf->data, host_data, buf->size, cudaMemcpyHostToDevice);
            if (err != cudaSuccess) {
                cuda_set_error(cudaGetErrorString(err));
                cudaFree(buf->data);
                free(buf);
                return NULL;
            }
        }
    } else {
        // Host pinned memory (for faster transfers)
        err = cudaMallocHost((void**)&buf->data, buf->size);
        if (err != cudaSuccess) {
            cuda_set_error(cudaGetErrorString(err));
            free(buf);
            return NULL;
        }
        if (host_data) {
            memcpy(buf->data, host_data, buf->size);
        }
    }
    return buf;
}

void cuda_release_buffer(CudaBuffer* buf) {
    if (buf) {
        if (buf->data) {
            if (buf->memory_type == 0) {
                cudaFree(buf->data);
            } else {
                cudaFreeHost(buf->data);
            }
        }
        free(buf);
    }
}

void* cuda_buffer_data(CudaBuffer* buf) {
    return buf ? buf->data : NULL;
}

size_t cuda_buffer_size(CudaBuffer* buf) {
    return buf ? buf->size : 0;
}

int cuda_buffer_copy_to_host(CudaBuffer* buf, float* host_data, size_t count) {
    if (!buf || !host_data) return -1;
    size_t copy_size = count * sizeof(float);
    if (copy_size > buf->size) copy_size = buf->size;

    cudaError_t err;
    if (buf->memory_type == 0) {
        err = cudaMemcpy(host_data, buf->data, copy_size, cudaMemcpyDeviceToHost);
    } else {
        memcpy(host_data, buf->data, copy_size);
        err = cudaSuccess;
    }
    if (err != cudaSuccess) {
        cuda_set_error(cudaGetErrorString(err));
        return -1;
    }
    return 0;
}

// Vector operations using cuBLAS

// Compute L2 norms for each vector (for normalization)
// vectors: n x dims matrix (row-major)
// norms: output array of n floats
int cuda_compute_norms(CudaDevice* dev, CudaBuffer* vectors, CudaBuffer* norms, unsigned int n, unsigned int dims) {
    cublasStatus_t status;
    // The norm results are written directly into device memory (norms->data),
    // so cuBLAS must be in device pointer mode for these calls; restore host
    // pointer mode afterwards so later calls can pass scalars from the host.
    cublasSetPointerMode(dev->cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
    for (unsigned int i = 0; i < n; i++) {
        float* vec = vectors->data + i * dims;
        status = cublasSnrm2(dev->cublas_handle, dims, vec, 1, norms->data + i);
        if (status != CUBLAS_STATUS_SUCCESS) {
            cublasSetPointerMode(dev->cublas_handle, CUBLAS_POINTER_MODE_HOST);
            cuda_set_error("cuBLAS norm computation failed");
            return -1;
        }
    }
    cublasSetPointerMode(dev->cublas_handle, CUBLAS_POINTER_MODE_HOST);
    cudaStreamSynchronize(dev->stream);
    return 0;
}

// Normalize vectors in-place
int cuda_normalize_vectors(CudaDevice* dev, CudaBuffer* vectors, unsigned int n, unsigned int dims) {
    // Allocate norms buffer
    CudaBuffer* norms = cuda_create_buffer(dev, NULL, n, 0);
    if (!norms) return -1;

    // Compute norms
    if (cuda_compute_norms(dev, vectors, norms, n, dims) != 0) {
        cuda_release_buffer(norms);
        return -1;
    }

    // Copy norms to host for scaling
    float* host_norms = (float*)malloc(n * sizeof(float));
    if (!host_norms) {
        cuda_set_error("Failed to allocate host norms");
        cuda_release_buffer(norms);
        return -1;
    }
    if (cuda_buffer_copy_to_host(norms, host_norms, n) != 0) {
        free(host_norms);
        cuda_release_buffer(norms);
        return -1;
    }

    // Scale each vector by 1/norm
    for (unsigned int i = 0; i < n; i++) {
        if (host_norms[i] > 1e-10f) {
            float scale = 1.0f / host_norms[i];
            cublasStatus_t status = cublasSscal(dev->cublas_handle, dims, &scale, vectors->data + i * dims, 1);
            if (status != CUBLAS_STATUS_SUCCESS) {
                cuda_set_error("cuBLAS scale failed");
                free(host_norms);
                cuda_release_buffer(norms);
                return -1;
            }
        }
    }
    cudaStreamSynchronize(dev->stream);

    free(host_norms);
    cuda_release_buffer(norms);
    return 0;
}

// Compute cosine similarity: scores = embeddings @ query (all normalized)
// embeddings: n x dims (row-major on device)
// query: dims x 1 (column vector on device)
// scores: n x 1 output
int cuda_cosine_similarity(CudaDevice* dev, CudaBuffer* embeddings, CudaBuffer* query, CudaBuffer* scores, unsigned int n, unsigned int dims, int normalized) {
    // If not normalized, we'd need to normalize first
    // For now, assume normalized (dot product = cosine similarity)
    float alpha = 1.0f;
    float beta = 0.0f;

    // Matrix-vector multiply: scores = embeddings * query
    // embeddings is n x dims (row-major)
    // In cuBLAS (column-major), we treat it as dims x n and transpose
    cublasStatus_t status = cublasSgemv(dev->cublas_handle,
        CUBLAS_OP_T,            // Transpose because row-major
        dims, n,                // Matrix dimensions
        &alpha,
        embeddings->data, dims, // A matrix
        query->data, 1,         // x vector
        &beta,
        scores->data, 1);       // y vector
    if (status != CUBLAS_STATUS_SUCCESS) {
        cuda_set_error("cuBLAS gemv failed");
        return -1;
    }
    cudaStreamSynchronize(dev->stream);
    return 0;
}

// Simple top-k selection (CPU implementation for now)
// For production, use thrust::sort or custom CUDA kernel
int cuda_topk(CudaDevice* dev, CudaBuffer* scores, unsigned int* out_indices, float* out_scores, unsigned int n, unsigned int k) {
    // Copy scores to host
    float* host_scores = (float*)malloc(n * sizeof(float));
    if (cuda_buffer_copy_to_host(scores, host_scores, n) != 0) {
        free(host_scores);
        return -1;
    }

    // Simple selection sort for top-k
    unsigned int* indices = (unsigned int*)malloc(n * sizeof(unsigned int));
    for (unsigned int i = 0; i < n; i++) indices[i] = i;

    for (unsigned int i = 0; i < k && i < n; i++) {
        unsigned int max_idx = i;
        for (unsigned int j = i + 1; j < n; j++) {
            if (host_scores[indices[j]] > host_scores[indices[max_idx]]) {
                max_idx = j;
            }
        }
        // Swap
        unsigned int tmp = indices[i];
        indices[i] = indices[max_idx];
        indices[max_idx] = tmp;
    }

    // Copy top-k results
    for (unsigned int i = 0; i < k && i < n; i++) {
        out_indices[i] = indices[i];
        out_scores[i] = host_scores[indices[i]];
    }

    free(host_scores);
    free(indices);
    return 0;
}
*/
import "C"

import (
	"errors"
	"fmt"
	"sync"
	"unsafe"
)

// Errors
var (
	ErrCUDANotAvailable = errors.New("cuda: CUDA is not available on this system")
	ErrDeviceCreation   = errors.New("cuda: failed to create CUDA device")
	ErrBufferCreation   = errors.New("cuda: failed to create buffer")
	ErrKernelExecution  = errors.New("cuda: kernel execution failed")
	ErrInvalidBuffer    = errors.New("cuda: invalid buffer")
)

// MemoryType defines how buffer memory is managed.
type MemoryType int

const (
	// MemoryDevice allocates memory on GPU device.
	MemoryDevice MemoryType = 0
	// MemoryPinned allocates page-locked host memory for faster transfers.
	MemoryPinned MemoryType = 1
)

// Device represents a CUDA GPU device.
type Device struct {
	ptr     *C.CudaDevice
	id      int
	name    string
	memory  uint64
	ccMajor int
	ccMinor int
	mu      sync.Mutex
}

// Buffer represents a CUDA memory buffer.
type Buffer struct {
	ptr    *C.CudaBuffer
	size   uint64
	device *Device
}

// SearchResult holds a similarity search result.
type SearchResult struct {
	Index uint32
	Score float32
}

// IsAvailable checks if CUDA is available on this system.
func IsAvailable() bool {
	return C.cuda_is_available() != 0
}

// DeviceCount returns the number of CUDA devices.
func DeviceCount() int {
	count := C.cuda_get_device_count()
	if count < 0 {
		return 0
	}
	return int(count)
}

// NewDevice creates a new CUDA device handle.
func NewDevice(deviceID int) (*Device, error) {
	if !IsAvailable() {
		return nil, ErrCUDANotAvailable
	}
	ptr := C.cuda_create_device(C.int(deviceID))
	if ptr == nil {
		errMsg := C.GoString(C.cuda_get_last_error())
		C.cuda_clear_error()
		return nil, fmt.Errorf("%w: %s", ErrDeviceCreation, errMsg)
	}
	cc := int(C.cuda_device_compute_capability(C.int(deviceID)))
	return &Device{
		ptr:     ptr,
		id:      deviceID,
		name:    C.GoString(C.cuda_device_name(C.int(deviceID))),
		memory:  uint64(C.cuda_device_memory(C.int(deviceID))),
		ccMajor: cc / 10,
		ccMinor: cc % 10,
	}, nil
}

// Release frees the CUDA device resources.
func (d *Device) Release() {
	d.mu.Lock()
	defer d.mu.Unlock()
	if d.ptr != nil {
		C.cuda_release_device(d.ptr)
		d.ptr = nil
	}
}

// ID returns the device ID.
func (d *Device) ID() int { return d.id }

// Name returns the GPU device name.
func (d *Device) Name() string { return d.name }

// MemoryBytes returns the GPU memory size in bytes.
func (d *Device) MemoryBytes() uint64 { return d.memory }

// MemoryMB returns the GPU memory size in megabytes.
func (d *Device) MemoryMB() int { return int(d.memory / (1024 * 1024)) }

// ComputeCapability returns the CUDA compute capability (major, minor).
func (d *Device) ComputeCapability() (int, int) { return d.ccMajor, d.ccMinor }

// NewBuffer creates a new GPU buffer with data.
func (d *Device) NewBuffer(data []float32, memType MemoryType) (*Buffer, error) {
	if len(data) == 0 {
		return nil, errors.New("cuda: cannot create empty buffer")
	}
	d.mu.Lock()
	defer d.mu.Unlock()
	ptr := C.cuda_create_buffer(
		d.ptr,
		(*C.float)(unsafe.Pointer(&data[0])),
		C.size_t(len(data)),
		C.int(memType),
	)
	if ptr == nil {
		errMsg := C.GoString(C.cuda_get_last_error())
		C.cuda_clear_error()
		return nil, fmt.Errorf("%w: %s", ErrBufferCreation, errMsg)
	}
	return &Buffer{
		ptr:    ptr,
		size:   uint64(len(data) * 4),
		device: d,
	}, nil
}

// NewEmptyBuffer creates an uninitialized GPU buffer.
func (d *Device) NewEmptyBuffer(count uint64, memType MemoryType) (*Buffer, error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	ptr := C.cuda_create_buffer(
		d.ptr,
		nil,
		C.size_t(count),
		C.int(memType),
	)
	if ptr == nil {
		errMsg := C.GoString(C.cuda_get_last_error())
		C.cuda_clear_error()
		return nil, fmt.Errorf("%w: %s", ErrBufferCreation, errMsg)
	}
	return &Buffer{
		ptr:    ptr,
		size:   count * 4,
		device: d,
	}, nil
}

// Release frees the buffer resources.
func (b *Buffer) Release() {
	if b.ptr != nil {
		C.cuda_release_buffer(b.ptr)
		b.ptr = nil
	}
}

// Size returns the buffer size in bytes.
func (b *Buffer) Size() uint64 { return b.size }

// ReadFloat32 reads float32 values from the buffer.
func (b *Buffer) ReadFloat32(count int) []float32 {
	if count <= 0 || uint64(count*4) > b.size {
		return nil
	}
	result := make([]float32, count)
	ret := C.cuda_buffer_copy_to_host(b.ptr, (*C.float)(unsafe.Pointer(&result[0])), C.size_t(count))
	if ret != 0 {
		return nil
	}
	return result
}

// NormalizeVectors normalizes vectors in-place to unit length.
func (d *Device) NormalizeVectors(vectors *Buffer, n, dimensions uint32) error {
	d.mu.Lock()
	defer d.mu.Unlock()
	ret := C.cuda_normalize_vectors(d.ptr, vectors.ptr, C.uint(n), C.uint(dimensions))
	if ret != 0 {
		errMsg := C.GoString(C.cuda_get_last_error())
		C.cuda_clear_error()
		return fmt.Errorf("%w: %s", ErrKernelExecution, errMsg)
	}
	return nil
}

// CosineSimilarity computes cosine similarity between query and all embeddings.
func (d *Device) CosineSimilarity(embeddings, query, scores *Buffer, n, dimensions uint32, normalized bool) error {
	d.mu.Lock()
	defer d.mu.Unlock()
	normalizedInt := 0
	if normalized {
		normalizedInt = 1
	}
	ret := C.cuda_cosine_similarity(d.ptr, embeddings.ptr, query.ptr, scores.ptr,
		C.uint(n), C.uint(dimensions), C.int(normalizedInt))
	if ret != 0 {
		errMsg := C.GoString(C.cuda_get_last_error())
		C.cuda_clear_error()
		return fmt.Errorf("%w: %s", ErrKernelExecution, errMsg)
	}
	return nil
}

// TopK finds the k highest scoring indices.
func (d *Device) TopK(scores *Buffer, n, k uint32) ([]uint32, []float32, error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	indices := make([]uint32, k)
	topkScores := make([]float32, k)
	ret := C.cuda_topk(d.ptr, scores.ptr,
		(*C.uint)(unsafe.Pointer(&indices[0])),
		(*C.float)(unsafe.Pointer(&topkScores[0])),
		C.uint(n), C.uint(k))
	if ret != 0 {
		errMsg := C.GoString(C.cuda_get_last_error())
		C.cuda_clear_error()
		return nil, nil, fmt.Errorf("%w: %s", ErrKernelExecution, errMsg)
	}
	return indices, topkScores, nil
}

// Search performs a complete similarity search.
func (d *Device) Search(embeddings *Buffer, query []float32, n, dimensions uint32, k int, normalized bool) ([]SearchResult, error) {
	if k <= 0 {
		return nil, nil
	}
	if k > int(n) {
		k = int(n)
	}

	// Create query buffer
	queryBuf, err := d.NewBuffer(query, MemoryDevice)
	if err != nil {
		return nil, err
	}
	defer queryBuf.Release()

	// Create scores buffer
	scoresBuf, err := d.NewEmptyBuffer(uint64(n), MemoryDevice)
	if err != nil {
		return nil, err
	}
	defer scoresBuf.Release()

	// Compute similarities
	if err := d.CosineSimilarity(embeddings, queryBuf, scoresBuf, n, dimensions, normalized); err != nil {
		return nil, err
	}

	// Find top-k
	indices, scores, err := d.TopK(scoresBuf, n, uint32(k))
	if err != nil {
		return nil, err
	}

	// Build results
	results := make([]SearchResult, k)
	for i := 0; i < k; i++ {
		results[i] = SearchResult{
			Index: indices[i],
			Score: scores[i],
		}
	}
	return results, nil
}

// HasGPUHardware returns true if CUDA GPU hardware is available.
func HasGPUHardware() bool {
	return IsAvailable()
}

// IsCUDACapable returns true if CUDA operations are available.
// In the real CUDA build, this is always true if IsAvailable() is true.
func IsCUDACapable() bool {
	return IsAvailable()
}

// GPUName returns the name of the first CUDA device.
func GPUName() string {
	if !IsAvailable() {
		return ""
	}
	device, err := NewDevice(0)
	if err != nil {
		return ""
	}
	defer device.Release()
	return device.Name()
}

// GPUMemoryMB returns the memory of the first CUDA device in MB.
func GPUMemoryMB() int {
	if !IsAvailable() {
		return 0
	}
	device, err := NewDevice(0)
	if err != nil {
		return 0
	}
	defer device.Release()
	return device.MemoryMB()
}
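
Below is a minimal usage sketch of the package above. It assumes a build with the cuda tag, a hypothetical import path (the actual path inside the Mimir repository may differ), and toy embedding data; it only exercises the exported API declared in cuda_bridge.go (IsAvailable, NewDevice, NewBuffer, NormalizeVectors, Search).

//go:build cuda

package main

import (
	"fmt"
	"log"

	// Hypothetical import path; adjust to where this package lives in the repo.
	"example.com/mimir/internal/cuda"
)

func main() {
	if !cuda.IsAvailable() {
		log.Fatal("no CUDA device found")
	}

	dev, err := cuda.NewDevice(0)
	if err != nil {
		log.Fatal(err)
	}
	defer dev.Release()

	// Two toy 4-dimensional embeddings, flattened row-major (n = 2, dims = 4).
	embeddings := []float32{
		1, 0, 0, 0,
		0, 1, 0, 0,
	}
	buf, err := dev.NewBuffer(embeddings, cuda.MemoryDevice)
	if err != nil {
		log.Fatal(err)
	}
	defer buf.Release()

	// Normalize on the GPU so the gemv result equals cosine similarity.
	if err := dev.NormalizeVectors(buf, 2, 4); err != nil {
		log.Fatal(err)
	}

	query := []float32{1, 0, 0, 0} // already unit length
	results, err := dev.Search(buf, query, 2, 4, 1, true)
	if err != nil {
		log.Fatal(err)
	}
	for _, r := range results {
		fmt.Printf("index=%d score=%.3f\n", r.Index, r.Score)
	}
}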
