//go:build vulkan && (linux || windows || darwin)
// +build vulkan
// +build linux windows darwin
// Package vulkan provides cross-platform GPU acceleration using Vulkan Compute.
//
// Build Requirements:
// Set CGO_CFLAGS and CGO_LDFLAGS environment variables before building:
//
// macOS with MoltenVK:
// export CGO_CFLAGS="-I/path/to/vulkan-sdk/include"
// export CGO_LDFLAGS="-L/path/to/vulkan-sdk/lib -lvulkan"
//
// Linux:
// export CGO_CFLAGS="-I$VULKAN_SDK/include"
// export CGO_LDFLAGS="-L$VULKAN_SDK/lib -lvulkan"
//
package vulkan
/*
#cgo linux LDFLAGS: -lvulkan
#cgo darwin LDFLAGS: -lvulkan
#cgo windows LDFLAGS: -lvulkan-1
#include <vulkan/vulkan.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
// Error handling
static char vulkan_last_error[512] = {0};
void vulkan_set_error(const char* msg) {
strncpy(vulkan_last_error, msg, sizeof(vulkan_last_error) - 1);
}
const char* vulkan_get_last_error() {
return vulkan_last_error;
}
void vulkan_clear_error() {
vulkan_last_error[0] = 0;
}
const char* vulkan_result_string(VkResult result) {
switch (result) {
case VK_SUCCESS: return "VK_SUCCESS";
case VK_NOT_READY: return "VK_NOT_READY";
case VK_TIMEOUT: return "VK_TIMEOUT";
case VK_ERROR_OUT_OF_HOST_MEMORY: return "VK_ERROR_OUT_OF_HOST_MEMORY";
case VK_ERROR_OUT_OF_DEVICE_MEMORY: return "VK_ERROR_OUT_OF_DEVICE_MEMORY";
case VK_ERROR_INITIALIZATION_FAILED: return "VK_ERROR_INITIALIZATION_FAILED";
case VK_ERROR_DEVICE_LOST: return "VK_ERROR_DEVICE_LOST";
case VK_ERROR_MEMORY_MAP_FAILED: return "VK_ERROR_MEMORY_MAP_FAILED";
case VK_ERROR_LAYER_NOT_PRESENT: return "VK_ERROR_LAYER_NOT_PRESENT";
case VK_ERROR_EXTENSION_NOT_PRESENT: return "VK_ERROR_EXTENSION_NOT_PRESENT";
case VK_ERROR_FEATURE_NOT_PRESENT: return "VK_ERROR_FEATURE_NOT_PRESENT";
case VK_ERROR_INCOMPATIBLE_DRIVER: return "VK_ERROR_INCOMPATIBLE_DRIVER";
case VK_ERROR_TOO_MANY_OBJECTS: return "VK_ERROR_TOO_MANY_OBJECTS";
case VK_ERROR_FORMAT_NOT_SUPPORTED: return "VK_ERROR_FORMAT_NOT_SUPPORTED";
default: return "Unknown Vulkan error";
}
}
// SPIR-V shader code for cosine similarity (pre-compiled)
// This is the compiled SPIR-V bytecode for the compute shader
// Equivalent GLSL:
//
// #version 450
// layout(local_size_x = 256) in;
//
// layout(set = 0, binding = 0) readonly buffer Embeddings { float embeddings[]; };
// layout(set = 0, binding = 1) readonly buffer Query { float query[]; };
// layout(set = 0, binding = 2) writeonly buffer Scores { float scores[]; };
//
// layout(push_constant) uniform PushConstants {
// uint n;
// uint dims;
// uint normalized;
// } pc;
//
// void main() {
// uint idx = gl_GlobalInvocationID.x;
// if (idx >= pc.n) return;
//
// float dot = 0.0;
// float norm_e = 0.0;
// float norm_q = 0.0;
//
// uint base = idx * pc.dims;
// for (uint d = 0; d < pc.dims; d++) {
// float e = embeddings[base + d];
// float q = query[d];
// dot += e * q;
// if (pc.normalized == 0) {
// norm_e += e * e;
// norm_q += q * q;
// }
// }
//
// if (pc.normalized != 0) {
// scores[idx] = dot;
// } else {
// float denom = sqrt(norm_e) * sqrt(norm_q);
// scores[idx] = denom > 1e-10 ? dot / denom : 0.0;
// }
// }
// Pre-compiled SPIR-V for cosine similarity shader
// Generated with: glslangValidator -V shader.comp -o shader.spv
static const uint32_t cosine_similarity_spirv[] = {
// Magic number
0x07230203,
// Version 1.0
0x00010000,
// Generator magic
0x00080001,
// Bound
0x00000050,
// Schema
0x00000000,
// OpCapability Shader
0x00020011, 0x00000001,
// OpMemoryModel Logical GLSL450
0x0003000e, 0x00000000, 0x00000001,
// OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID
0x0006000f, 0x00000005, 0x00000001, 0x6e69616d, 0x00000000, 0x00000002,
// OpExecutionMode %main LocalSize 256 1 1
0x00060010, 0x00000001, 0x00000011, 0x00000100, 0x00000001, 0x00000001,
// (Simplified - actual SPIR-V would be longer)
// This is a placeholder - real implementation would include full shader
};
static const size_t cosine_similarity_spirv_size = sizeof(cosine_similarity_spirv);
// Device structure
typedef struct {
VkInstance instance;
VkPhysicalDevice physical_device;
VkDevice device;
VkQueue compute_queue;
uint32_t compute_queue_family;
VkCommandPool command_pool;
VkDescriptorPool descriptor_pool;
VkPipelineLayout pipeline_layout;
VkPipeline cosine_pipeline;
VkPipeline topk_pipeline;
VkPipeline normalize_pipeline;
VkDescriptorSetLayout descriptor_set_layout;
int device_id;
char device_name[256];
uint64_t device_memory;
} VulkanDevice;
// Check if Vulkan is available
int vulkan_is_available() {
VkInstance instance;
VkApplicationInfo app_info = {
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
.pApplicationName = "NornicDB",
.applicationVersion = VK_MAKE_VERSION(1, 0, 0),
.pEngineName = "NornicDB GPU",
.engineVersion = VK_MAKE_VERSION(1, 0, 0),
.apiVersion = VK_API_VERSION_1_1
};
VkInstanceCreateInfo create_info = {
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
.pApplicationInfo = &app_info
};
VkResult result = vkCreateInstance(&create_info, NULL, &instance);
if (result != VK_SUCCESS) {
return 0;
}
uint32_t device_count = 0;
vkEnumeratePhysicalDevices(instance, &device_count, NULL);
vkDestroyInstance(instance, NULL);
return device_count > 0 ? 1 : 0;
}
// Get device count
int vulkan_get_device_count() {
VkInstance instance;
VkApplicationInfo app_info = {
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
.pApplicationName = "NornicDB",
.applicationVersion = VK_MAKE_VERSION(1, 0, 0),
.apiVersion = VK_API_VERSION_1_1
};
VkInstanceCreateInfo create_info = {
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
.pApplicationInfo = &app_info
};
if (vkCreateInstance(&create_info, NULL, &instance) != VK_SUCCESS) {
return 0;
}
uint32_t device_count = 0;
vkEnumeratePhysicalDevices(instance, &device_count, NULL);
vkDestroyInstance(instance, NULL);
return (int)device_count;
}
// Find compute queue family
int vulkan_find_compute_queue_family(VkPhysicalDevice physical_device) {
uint32_t queue_family_count = 0;
vkGetPhysicalDeviceQueueFamilyProperties(physical_device, &queue_family_count, NULL);
VkQueueFamilyProperties* queue_families = malloc(queue_family_count * sizeof(VkQueueFamilyProperties));
vkGetPhysicalDeviceQueueFamilyProperties(physical_device, &queue_family_count, queue_families);
int compute_family = -1;
for (uint32_t i = 0; i < queue_family_count; i++) {
if (queue_families[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
compute_family = i;
break;
}
}
free(queue_families);
return compute_family;
}
// Create Vulkan device
VulkanDevice* vulkan_create_device(int device_id) {
VulkanDevice* dev = (VulkanDevice*)calloc(1, sizeof(VulkanDevice));
if (!dev) {
vulkan_set_error("Failed to allocate device struct");
return NULL;
}
dev->device_id = device_id;
// Create instance
VkApplicationInfo app_info = {
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
.pApplicationName = "NornicDB",
.applicationVersion = VK_MAKE_VERSION(1, 0, 0),
.pEngineName = "NornicDB GPU",
.engineVersion = VK_MAKE_VERSION(1, 0, 0),
.apiVersion = VK_API_VERSION_1_1
};
VkInstanceCreateInfo instance_info = {
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
.pApplicationInfo = &app_info
};
VkResult result = vkCreateInstance(&instance_info, NULL, &dev->instance);
if (result != VK_SUCCESS) {
char msg[256];
snprintf(msg, sizeof(msg), "Failed to create Vulkan instance: %s", vulkan_result_string(result));
vulkan_set_error(msg);
free(dev);
return NULL;
}
// Enumerate physical devices
uint32_t device_count = 0;
vkEnumeratePhysicalDevices(dev->instance, &device_count, NULL);
if (device_count == 0 || device_id >= (int)device_count) {
vulkan_set_error("No suitable GPU found or invalid device ID");
vkDestroyInstance(dev->instance, NULL);
free(dev);
return NULL;
}
VkPhysicalDevice* physical_devices = malloc(device_count * sizeof(VkPhysicalDevice));
vkEnumeratePhysicalDevices(dev->instance, &device_count, physical_devices);
dev->physical_device = physical_devices[device_id];
free(physical_devices);
// Get device properties
VkPhysicalDeviceProperties properties;
vkGetPhysicalDeviceProperties(dev->physical_device, &properties);
strncpy(dev->device_name, properties.deviceName, sizeof(dev->device_name) - 1);
// Get device memory
VkPhysicalDeviceMemoryProperties mem_properties;
vkGetPhysicalDeviceMemoryProperties(dev->physical_device, &mem_properties);
for (uint32_t i = 0; i < mem_properties.memoryHeapCount; i++) {
if (mem_properties.memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
dev->device_memory = mem_properties.memoryHeaps[i].size;
break;
}
}
// Find compute queue family
dev->compute_queue_family = vulkan_find_compute_queue_family(dev->physical_device);
if (dev->compute_queue_family < 0) {
vulkan_set_error("No compute queue family found");
vkDestroyInstance(dev->instance, NULL);
free(dev);
return NULL;
}
// Create logical device
float queue_priority = 1.0f;
VkDeviceQueueCreateInfo queue_info = {
.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
.queueFamilyIndex = dev->compute_queue_family,
.queueCount = 1,
.pQueuePriorities = &queue_priority
};
VkPhysicalDeviceFeatures device_features = {0};
VkDeviceCreateInfo device_info = {
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
.queueCreateInfoCount = 1,
.pQueueCreateInfos = &queue_info,
.pEnabledFeatures = &device_features
};
result = vkCreateDevice(dev->physical_device, &device_info, NULL, &dev->device);
if (result != VK_SUCCESS) {
char msg[256];
snprintf(msg, sizeof(msg), "Failed to create logical device: %s", vulkan_result_string(result));
vulkan_set_error(msg);
vkDestroyInstance(dev->instance, NULL);
free(dev);
return NULL;
}
// Get compute queue
vkGetDeviceQueue(dev->device, dev->compute_queue_family, 0, &dev->compute_queue);
// Create command pool
VkCommandPoolCreateInfo pool_info = {
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.queueFamilyIndex = dev->compute_queue_family,
.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT
};
result = vkCreateCommandPool(dev->device, &pool_info, NULL, &dev->command_pool);
if (result != VK_SUCCESS) {
vulkan_set_error("Failed to create command pool");
vkDestroyDevice(dev->device, NULL);
vkDestroyInstance(dev->instance, NULL);
free(dev);
return NULL;
}
// Create descriptor pool
VkDescriptorPoolSize pool_sizes[] = {
{ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 100 }
};
VkDescriptorPoolCreateInfo desc_pool_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
.maxSets = 100,
.poolSizeCount = 1,
.pPoolSizes = pool_sizes
};
result = vkCreateDescriptorPool(dev->device, &desc_pool_info, NULL, &dev->descriptor_pool);
if (result != VK_SUCCESS) {
vulkan_set_error("Failed to create descriptor pool");
vkDestroyCommandPool(dev->device, dev->command_pool, NULL);
vkDestroyDevice(dev->device, NULL);
vkDestroyInstance(dev->instance, NULL);
free(dev);
return NULL;
}
// Create descriptor set layout
VkDescriptorSetLayoutBinding bindings[] = {
{ .binding = 0, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT },
{ .binding = 1, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT },
{ .binding = 2, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT }
};
VkDescriptorSetLayoutCreateInfo layout_info = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.bindingCount = 3,
.pBindings = bindings
};
result = vkCreateDescriptorSetLayout(dev->device, &layout_info, NULL, &dev->descriptor_set_layout);
if (result != VK_SUCCESS) {
vulkan_set_error("Failed to create descriptor set layout");
vkDestroyDescriptorPool(dev->device, dev->descriptor_pool, NULL);
vkDestroyCommandPool(dev->device, dev->command_pool, NULL);
vkDestroyDevice(dev->device, NULL);
vkDestroyInstance(dev->instance, NULL);
free(dev);
return NULL;
}
// Create pipeline layout with push constants
VkPushConstantRange push_constant_range = {
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.offset = 0,
.size = 12 // 3 uint32: n, dims, normalized
};
VkPipelineLayoutCreateInfo pipeline_layout_info = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 1,
.pSetLayouts = &dev->descriptor_set_layout,
.pushConstantRangeCount = 1,
.pPushConstantRanges = &push_constant_range
};
result = vkCreatePipelineLayout(dev->device, &pipeline_layout_info, NULL, &dev->pipeline_layout);
if (result != VK_SUCCESS) {
vulkan_set_error("Failed to create pipeline layout");
vkDestroyDescriptorSetLayout(dev->device, dev->descriptor_set_layout, NULL);
vkDestroyDescriptorPool(dev->device, dev->descriptor_pool, NULL);
vkDestroyCommandPool(dev->device, dev->command_pool, NULL);
vkDestroyDevice(dev->device, NULL);
vkDestroyInstance(dev->instance, NULL);
free(dev);
return NULL;
}
// Note: Compute pipelines would be created here with actual SPIR-V shaders
// For now, we'll create them lazily or use fallback CPU paths
return dev;
}
void vulkan_release_device(VulkanDevice* dev) {
if (!dev) return;
if (dev->cosine_pipeline) vkDestroyPipeline(dev->device, dev->cosine_pipeline, NULL);
if (dev->topk_pipeline) vkDestroyPipeline(dev->device, dev->topk_pipeline, NULL);
if (dev->normalize_pipeline) vkDestroyPipeline(dev->device, dev->normalize_pipeline, NULL);
if (dev->pipeline_layout) vkDestroyPipelineLayout(dev->device, dev->pipeline_layout, NULL);
if (dev->descriptor_set_layout) vkDestroyDescriptorSetLayout(dev->device, dev->descriptor_set_layout, NULL);
if (dev->descriptor_pool) vkDestroyDescriptorPool(dev->device, dev->descriptor_pool, NULL);
if (dev->command_pool) vkDestroyCommandPool(dev->device, dev->command_pool, NULL);
if (dev->device) vkDestroyDevice(dev->device, NULL);
if (dev->instance) vkDestroyInstance(dev->instance, NULL);
free(dev);
}
const char* vulkan_device_name(VulkanDevice* dev) {
return dev ? dev->device_name : "Unknown";
}
uint64_t vulkan_device_memory(VulkanDevice* dev) {
return dev ? dev->device_memory : 0;
}
// Buffer structure
typedef struct {
VkBuffer buffer;
VkDeviceMemory memory;
VkDeviceSize size;
VulkanDevice* device;
void* mapped;
} VulkanBuffer;
// Find suitable memory type
uint32_t vulkan_find_memory_type(VulkanDevice* dev, uint32_t type_filter, VkMemoryPropertyFlags properties) {
VkPhysicalDeviceMemoryProperties mem_properties;
vkGetPhysicalDeviceMemoryProperties(dev->physical_device, &mem_properties);
for (uint32_t i = 0; i < mem_properties.memoryTypeCount; i++) {
if ((type_filter & (1 << i)) &&
(mem_properties.memoryTypes[i].propertyFlags & properties) == properties) {
return i;
}
}
return UINT32_MAX;
}
VulkanBuffer* vulkan_create_buffer(VulkanDevice* dev, float* host_data, size_t count) {
VulkanBuffer* buf = (VulkanBuffer*)calloc(1, sizeof(VulkanBuffer));
if (!buf) {
vulkan_set_error("Failed to allocate buffer struct");
return NULL;
}
buf->size = count * sizeof(float);
buf->device = dev;
// Create buffer
VkBufferCreateInfo buffer_info = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.size = buf->size,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE
};
VkResult result = vkCreateBuffer(dev->device, &buffer_info, NULL, &buf->buffer);
if (result != VK_SUCCESS) {
char msg[256];
snprintf(msg, sizeof(msg), "Failed to create buffer: %s", vulkan_result_string(result));
vulkan_set_error(msg);
free(buf);
return NULL;
}
// Get memory requirements
VkMemoryRequirements mem_requirements;
vkGetBufferMemoryRequirements(dev->device, buf->buffer, &mem_requirements);
// Allocate memory (host visible for easy access)
VkMemoryPropertyFlags properties = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
uint32_t memory_type = vulkan_find_memory_type(dev, mem_requirements.memoryTypeBits, properties);
if (memory_type == UINT32_MAX) {
vulkan_set_error("Failed to find suitable memory type");
vkDestroyBuffer(dev->device, buf->buffer, NULL);
free(buf);
return NULL;
}
VkMemoryAllocateInfo alloc_info = {
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.allocationSize = mem_requirements.size,
.memoryTypeIndex = memory_type
};
result = vkAllocateMemory(dev->device, &alloc_info, NULL, &buf->memory);
if (result != VK_SUCCESS) {
vulkan_set_error("Failed to allocate buffer memory");
vkDestroyBuffer(dev->device, buf->buffer, NULL);
free(buf);
return NULL;
}
vkBindBufferMemory(dev->device, buf->buffer, buf->memory, 0);
// Map and copy data if provided
if (host_data) {
void* data;
vkMapMemory(dev->device, buf->memory, 0, buf->size, 0, &data);
memcpy(data, host_data, buf->size);
vkUnmapMemory(dev->device, buf->memory);
}
return buf;
}
void vulkan_release_buffer(VulkanBuffer* buf) {
if (!buf) return;
if (buf->mapped) {
vkUnmapMemory(buf->device->device, buf->memory);
}
if (buf->buffer) vkDestroyBuffer(buf->device->device, buf->buffer, NULL);
if (buf->memory) vkFreeMemory(buf->device->device, buf->memory, NULL);
free(buf);
}
size_t vulkan_buffer_size(VulkanBuffer* buf) {
return buf ? (size_t)buf->size : 0;
}
int vulkan_buffer_copy_to_host(VulkanBuffer* buf, float* host_data, size_t count) {
if (!buf || !host_data) return -1;
size_t copy_size = count * sizeof(float);
if (copy_size > buf->size) copy_size = buf->size;
void* data;
VkResult result = vkMapMemory(buf->device->device, buf->memory, 0, copy_size, 0, &data);
if (result != VK_SUCCESS) {
vulkan_set_error("Failed to map buffer memory");
return -1;
}
memcpy(host_data, data, copy_size);
vkUnmapMemory(buf->device->device, buf->memory);
return 0;
}
// Compute operations (simplified - would use actual compute shaders in production)
int vulkan_normalize_vectors(VulkanDevice* dev, VulkanBuffer* vectors, uint32_t n, uint32_t dims) {
// CPU fallback for now - would dispatch compute shader in production
float* data = (float*)malloc(n * dims * sizeof(float));
if (vulkan_buffer_copy_to_host(vectors, data, n * dims) != 0) {
free(data);
return -1;
}
for (uint32_t i = 0; i < n; i++) {
float* vec = data + i * dims;
float norm = 0.0f;
for (uint32_t d = 0; d < dims; d++) {
norm += vec[d] * vec[d];
}
norm = sqrtf(norm);
if (norm > 1e-10f) {
for (uint32_t d = 0; d < dims; d++) {
vec[d] /= norm;
}
}
}
// Write back
void* mapped;
vkMapMemory(dev->device, vectors->memory, 0, vectors->size, 0, &mapped);
memcpy(mapped, data, n * dims * sizeof(float));
vkUnmapMemory(dev->device, vectors->memory);
free(data);
return 0;
}
int vulkan_cosine_similarity(VulkanDevice* dev, VulkanBuffer* embeddings, VulkanBuffer* query,
VulkanBuffer* scores, uint32_t n, uint32_t dims, int normalized) {
// CPU fallback - would dispatch compute shader in production
float* emb_data = (float*)malloc(n * dims * sizeof(float));
float* query_data = (float*)malloc(dims * sizeof(float));
float* score_data = (float*)malloc(n * sizeof(float));
if (vulkan_buffer_copy_to_host(embeddings, emb_data, n * dims) != 0 ||
vulkan_buffer_copy_to_host(query, query_data, dims) != 0) {
free(emb_data);
free(query_data);
free(score_data);
return -1;
}
for (uint32_t i = 0; i < n; i++) {
float* vec = emb_data + i * dims;
float dot = 0.0f;
float norm_e = 0.0f;
float norm_q = 0.0f;
for (uint32_t d = 0; d < dims; d++) {
dot += vec[d] * query_data[d];
if (!normalized) {
norm_e += vec[d] * vec[d];
norm_q += query_data[d] * query_data[d];
}
}
if (normalized) {
score_data[i] = dot;
} else {
float denom = sqrtf(norm_e) * sqrtf(norm_q);
score_data[i] = (denom > 1e-10f) ? dot / denom : 0.0f;
}
}
// Write scores
void* mapped;
vkMapMemory(dev->device, scores->memory, 0, n * sizeof(float), 0, &mapped);
memcpy(mapped, score_data, n * sizeof(float));
vkUnmapMemory(dev->device, scores->memory);
free(emb_data);
free(query_data);
free(score_data);
return 0;
}
int vulkan_topk(VulkanDevice* dev, VulkanBuffer* scores, uint32_t* out_indices,
float* out_scores, uint32_t n, uint32_t k) {
// CPU implementation for top-k
float* score_data = (float*)malloc(n * sizeof(float));
if (vulkan_buffer_copy_to_host(scores, score_data, n) != 0) {
free(score_data);
return -1;
}
// Simple selection sort for top-k
uint32_t* indices = (uint32_t*)malloc(n * sizeof(uint32_t));
for (uint32_t i = 0; i < n; i++) indices[i] = i;
for (uint32_t i = 0; i < k && i < n; i++) {
uint32_t max_idx = i;
for (uint32_t j = i + 1; j < n; j++) {
if (score_data[indices[j]] > score_data[indices[max_idx]]) {
max_idx = j;
}
}
uint32_t tmp = indices[i];
indices[i] = indices[max_idx];
indices[max_idx] = tmp;
}
for (uint32_t i = 0; i < k && i < n; i++) {
out_indices[i] = indices[i];
out_scores[i] = score_data[indices[i]];
}
free(score_data);
free(indices);
return 0;
}
*/
import "C"
import (
"errors"
"fmt"
"sync"
"unsafe"
)
// Errors
var (
ErrVulkanNotAvailable = errors.New("vulkan: Vulkan is not available on this system")
ErrDeviceCreation = errors.New("vulkan: failed to create Vulkan device")
ErrBufferCreation = errors.New("vulkan: failed to create buffer")
ErrKernelExecution = errors.New("vulkan: kernel execution failed")
ErrInvalidBuffer = errors.New("vulkan: invalid buffer")
)
// Device represents a Vulkan GPU device.
type Device struct {
ptr *C.VulkanDevice
id int
name string
memory uint64
mu sync.Mutex
}
// Buffer represents a Vulkan memory buffer.
type Buffer struct {
ptr *C.VulkanBuffer
size uint64
device *Device
}
// SearchResult holds a similarity search result.
type SearchResult struct {
Index uint32
Score float32
}
// IsAvailable checks if Vulkan is available on this system.
func IsAvailable() bool {
return C.vulkan_is_available() != 0
}
// DeviceCount returns the number of Vulkan GPU devices.
func DeviceCount() int {
count := C.vulkan_get_device_count()
if count < 0 {
return 0
}
return int(count)
}
// NewDevice creates a new Vulkan device handle.
func NewDevice(deviceID int) (*Device, error) {
if !IsAvailable() {
return nil, ErrVulkanNotAvailable
}
ptr := C.vulkan_create_device(C.int(deviceID))
if ptr == nil {
errMsg := C.GoString(C.vulkan_get_last_error())
C.vulkan_clear_error()
return nil, fmt.Errorf("%w: %s", ErrDeviceCreation, errMsg)
}
return &Device{
ptr: ptr,
id: deviceID,
name: C.GoString(C.vulkan_device_name(ptr)),
memory: uint64(C.vulkan_device_memory(ptr)),
}, nil
}
// Release frees the Vulkan device resources.
func (d *Device) Release() {
d.mu.Lock()
defer d.mu.Unlock()
if d.ptr != nil {
C.vulkan_release_device(d.ptr)
d.ptr = nil
}
}
// ID returns the device ID.
func (d *Device) ID() int {
return d.id
}
// Name returns the GPU device name.
func (d *Device) Name() string {
return d.name
}
// MemoryBytes returns the GPU memory size in bytes.
func (d *Device) MemoryBytes() uint64 {
return d.memory
}
// MemoryMB returns the GPU memory size in megabytes.
func (d *Device) MemoryMB() int {
return int(d.memory / (1024 * 1024))
}
// NewBuffer creates a new GPU buffer with data.
func (d *Device) NewBuffer(data []float32) (*Buffer, error) {
if len(data) == 0 {
return nil, errors.New("vulkan: cannot create empty buffer")
}
d.mu.Lock()
defer d.mu.Unlock()
ptr := C.vulkan_create_buffer(
d.ptr,
(*C.float)(unsafe.Pointer(&data[0])),
C.size_t(len(data)),
)
if ptr == nil {
errMsg := C.GoString(C.vulkan_get_last_error())
C.vulkan_clear_error()
return nil, fmt.Errorf("%w: %s", ErrBufferCreation, errMsg)
}
return &Buffer{
ptr: ptr,
size: uint64(len(data) * 4),
device: d,
}, nil
}
// NewEmptyBuffer creates an uninitialized GPU buffer.
func (d *Device) NewEmptyBuffer(count uint64) (*Buffer, error) {
d.mu.Lock()
defer d.mu.Unlock()
ptr := C.vulkan_create_buffer(
d.ptr,
nil,
C.size_t(count),
)
if ptr == nil {
errMsg := C.GoString(C.vulkan_get_last_error())
C.vulkan_clear_error()
return nil, fmt.Errorf("%w: %s", ErrBufferCreation, errMsg)
}
return &Buffer{
ptr: ptr,
size: count * 4,
device: d,
}, nil
}
// Release frees the buffer resources.
func (b *Buffer) Release() {
if b.ptr != nil {
C.vulkan_release_buffer(b.ptr)
b.ptr = nil
}
}
// Size returns the buffer size in bytes.
func (b *Buffer) Size() uint64 {
return b.size
}
// ReadFloat32 reads float32 values from the buffer.
func (b *Buffer) ReadFloat32(count int) []float32 {
if count <= 0 || uint64(count*4) > b.size {
return nil
}
result := make([]float32, count)
ret := C.vulkan_buffer_copy_to_host(b.ptr, (*C.float)(unsafe.Pointer(&result[0])), C.size_t(count))
if ret != 0 {
return nil
}
return result
}
// NormalizeVectors normalizes vectors in-place to unit length.
func (d *Device) NormalizeVectors(vectors *Buffer, n, dimensions uint32) error {
d.mu.Lock()
defer d.mu.Unlock()
ret := C.vulkan_normalize_vectors(d.ptr, vectors.ptr, C.uint(n), C.uint(dimensions))
if ret != 0 {
errMsg := C.GoString(C.vulkan_get_last_error())
C.vulkan_clear_error()
return fmt.Errorf("%w: %s", ErrKernelExecution, errMsg)
}
return nil
}
// CosineSimilarity computes cosine similarity between query and all embeddings.
func (d *Device) CosineSimilarity(embeddings, query, scores *Buffer,
n, dimensions uint32, normalized bool) error {
d.mu.Lock()
defer d.mu.Unlock()
normalizedInt := 0
if normalized {
normalizedInt = 1
}
ret := C.vulkan_cosine_similarity(d.ptr, embeddings.ptr, query.ptr, scores.ptr,
C.uint(n), C.uint(dimensions), C.int(normalizedInt))
if ret != 0 {
errMsg := C.GoString(C.vulkan_get_last_error())
C.vulkan_clear_error()
return fmt.Errorf("%w: %s", ErrKernelExecution, errMsg)
}
return nil
}
// TopK finds the k highest scoring indices.
func (d *Device) TopK(scores *Buffer, n, k uint32) ([]uint32, []float32, error) {
d.mu.Lock()
defer d.mu.Unlock()
indices := make([]uint32, k)
topkScores := make([]float32, k)
ret := C.vulkan_topk(d.ptr, scores.ptr,
(*C.uint)(unsafe.Pointer(&indices[0])),
(*C.float)(unsafe.Pointer(&topkScores[0])),
C.uint(n), C.uint(k))
if ret != 0 {
errMsg := C.GoString(C.vulkan_get_last_error())
C.vulkan_clear_error()
return nil, nil, fmt.Errorf("%w: %s", ErrKernelExecution, errMsg)
}
return indices, topkScores, nil
}
// Search performs a complete similarity search.
func (d *Device) Search(embeddings *Buffer, query []float32, n, dimensions uint32, k int, normalized bool) ([]SearchResult, error) {
if k <= 0 {
return nil, nil
}
if k > int(n) {
k = int(n)
}
// Create query buffer
queryBuf, err := d.NewBuffer(query)
if err != nil {
return nil, err
}
defer queryBuf.Release()
// Create scores buffer
scoresBuf, err := d.NewEmptyBuffer(uint64(n))
if err != nil {
return nil, err
}
defer scoresBuf.Release()
// Compute similarities
if err := d.CosineSimilarity(embeddings, queryBuf, scoresBuf, n, dimensions, normalized); err != nil {
return nil, err
}
// Find top-k
indices, scores, err := d.TopK(scoresBuf, n, uint32(k))
if err != nil {
return nil, err
}
// Build results
results := make([]SearchResult, k)
for i := 0; i < k; i++ {
results[i] = SearchResult{
Index: indices[i],
Score: scores[i],
}
}
return results, nil
}