//go:build cgo && windows && cuda

// Package localllm provides CGO bindings to llama.cpp for local GGUF model inference.
//
// This is the Windows CUDA implementation for GPU-accelerated embedding generation.
//
// CUDA Optimizations (NVIDIA GPU):
// - Flash attention for faster inference
// - Full model GPU offload by default
// - VRAM-optimized tensor placement
// - SIMD-optimized CPU fallback
//
// Build Requirements:
// - CUDA Toolkit installed (the cgo LDFLAGS below assume v13.0; adjust the path for your install)
// - Visual Studio 2022 Build Tools with MSVC
// - Pre-built llama.cpp CUDA static libraries (llama.lib, ggml*.lib, common.lib) in lib/llama/windows_amd64_cuda (see the build sketch below)
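//
// Build sketch (assumes the standard Go toolchain; the "cuda" tag comes from
// the //go:build line at the top of this file):
//
//	set CGO_ENABLED=1
//	go build -tags cuda ./...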
//
// Example:
//
// opts := localllm.DefaultOptions("/models/bge-m3.gguf")
// model, err := localllm.LoadModel(opts)
// if err != nil {
// log.Fatal(err)
// }
// defer model.Close()
//
// embedding, err := model.Embed(ctx, "hello world")
// // embedding is a normalized []float32
package localllm
/*
#cgo CFLAGS: -I${SRCDIR}/../../lib/llama/windows_amd64_cuda
// Windows with CUDA - link against all llama.cpp static libraries
// Libraries built with MSVC: llama.lib, ggml-cuda.lib, ggml-cpu.lib, ggml-base.lib, ggml.lib, common.lib
#cgo LDFLAGS: -L${SRCDIR}/../../lib/llama/windows_amd64_cuda
#cgo LDFLAGS: ${SRCDIR}/../../lib/llama/windows_amd64_cuda/llama.lib
#cgo LDFLAGS: ${SRCDIR}/../../lib/llama/windows_amd64_cuda/ggml-cuda.lib
#cgo LDFLAGS: ${SRCDIR}/../../lib/llama/windows_amd64_cuda/ggml-cpu.lib
#cgo LDFLAGS: ${SRCDIR}/../../lib/llama/windows_amd64_cuda/ggml-base.lib
#cgo LDFLAGS: ${SRCDIR}/../../lib/llama/windows_amd64_cuda/ggml.lib
#cgo LDFLAGS: ${SRCDIR}/../../lib/llama/windows_amd64_cuda/common.lib
// CUDA runtime and cuBLAS libraries from the CUDA Toolkit (version-specific path; update if a different toolkit version is installed)
#cgo LDFLAGS: -L"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/lib/x64"
#cgo LDFLAGS: -lcudart -lcublas -lcublasLt -lcuda
#include <stdlib.h>
#include <string.h>
#include "llama.h"
// Initialize backend once (handles GPU detection)
static int initialized = 0;
void init_backend() {
if (!initialized) {
llama_backend_init();
initialized = 1;
}
}
// Get number of layers in model (for GPU offload calculation)
int get_n_layers(struct llama_model* model) {
return llama_model_n_layer(model);
}
// Load model with optimal GPU settings
// n_gpu_layers: -1 = all layers on GPU, 0 = CPU only, N = N layers on GPU
struct llama_model* load_model(const char* path, int n_gpu_layers) {
init_backend();
struct llama_model_params params = llama_model_default_params();
// Memory mapping for low memory usage
params.use_mmap = 1;
// Device selection - NULL means use all available devices
// (new in b7285, explicit for clarity)
params.devices = NULL;
// GPU layer offloading
// -1 means offload all layers (determined after loading)
// For now, use a high number that will be clamped by llama.cpp
if (n_gpu_layers < 0) {
params.n_gpu_layers = 999; // Will be clamped to actual layer count
} else {
params.n_gpu_layers = n_gpu_layers;
}
return llama_model_load_from_file(path, params);
}
// Create embedding context with CUDA optimizations
struct llama_context* create_context(struct llama_model* model, int n_ctx, int n_batch, int n_threads) {
struct llama_context_params params = llama_context_default_params();
// Context size for tokenization
params.n_ctx = n_ctx;
// Batch sizes for processing
params.n_batch = n_batch; // Logical batch size
params.n_ubatch = n_batch; // Physical batch size (same for embeddings)
// CPU threading (used for CPU-only layers or fallback)
params.n_threads = n_threads;
params.n_threads_batch = n_threads;
// Enable embeddings mode
params.embeddings = 1;
params.pooling_type = LLAMA_POOLING_TYPE_MEAN;
// Set attention type for embedding models (non-causal, BERT-style)
// Embedding models use bidirectional attention unlike causal LLMs
params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL;
// Flash attention - major speedup on CUDA
// Auto-detect best setting based on hardware and model
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
// logits_all removed in newer llama.cpp - controlled per-batch now
// We set batch.logits[i] = 1 in embed() function instead
return llama_init_from_model(model, params);
}
// Tokenize using model's vocab
int tokenize(struct llama_model* model, const char* text, int text_len, int32_t* tokens, int max_tokens) {
const struct llama_vocab* vocab = llama_model_get_vocab(model);
// add_special=1, parse_special=1 so BOS/EOS and special tokens are handled for embedding input
return llama_tokenize(vocab, text, text_len, tokens, max_tokens, 1, 1);
}
// Generate embedding with GPU acceleration
int embed(struct llama_context* ctx, int32_t* tokens, int n_tokens, float* out, int n_embd) {
// Clear context memory so state from a previous call does not leak into this embedding
// KV cache API renamed to "memory" in b7285
// Second arg (true) clears the data
llama_memory_clear(llama_get_memory(ctx), 1);
// Create batch
struct llama_batch batch = llama_batch_init(n_tokens, 0, 1);
for (int i = 0; i < n_tokens; i++) {
batch.token[i] = tokens[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = 0;
batch.logits[i] = 1; // Enable output for embedding extraction
}
batch.n_tokens = n_tokens;
// Decode (this is where GPU compute happens)
if (llama_decode(ctx, batch) != 0) {
llama_batch_free(batch);
return -1;
}
// Get pooled embedding
float* embd = llama_get_embeddings_seq(ctx, 0);
if (!embd) {
llama_batch_free(batch);
return -2;
}
// Copy to output
memcpy(out, embd, n_embd * sizeof(float));
llama_batch_free(batch);
return 0;
}
// Get embedding dimensions
int get_n_embd(struct llama_model* model) {
return llama_model_n_embd(model);
}
// Free resources
void free_ctx(struct llama_context* ctx) { if (ctx) llama_free(ctx); }
void free_model(struct llama_model* model) { if (model) llama_model_free(model); }
*/
import "C"
import (
"context"
"fmt"
"runtime"
"sync"
"unsafe"
"github.com/orneryd/nornicdb/pkg/math/vector"
)
// Model wraps a GGUF model for embedding generation.
//
// Thread-safe: The Embed and EmbedBatch methods can be called concurrently,
// but operations are serialized internally via mutex to prevent race conditions
// with the underlying C context.
type Model struct {
model *C.struct_llama_model
ctx *C.struct_llama_context
dims int
modelDesc string
mu sync.Mutex
}
// Options configures model loading and inference.
//
// Fields:
// - ModelPath: Path to .gguf model file
// - ContextSize: Max context size for tokenization (default: 512)
// - BatchSize: Batch size for processing (default: 512)
// - Threads: CPU threads for inference (default: NumCPU/2, min 4)
// - GPULayers: GPU layer offload (-1=auto/all, 0=CPU only, N=N layers)
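//
// Example (manual configuration; the values shown are illustrative):
//
//	opts := localllm.Options{
//		ModelPath:   "/models/bge-m3.gguf",
//		ContextSize: 512,
//		BatchSize:   512,
//		Threads:     8,
//		GPULayers:   -1, // offload all layers
//	}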
type Options struct {
ModelPath string
ContextSize int
BatchSize int
Threads int
GPULayers int
}
// DefaultOptions returns options optimized for embedding generation.
//
// GPU is enabled by default (-1 = auto-detect and use all layers).
// Set GPULayers to 0 to force CPU-only mode.
//
// For NVIDIA GPUs, this enables full CUDA acceleration with:
// - Flash attention
// - Full model offload to GPU
// - Optimized tensor operations
//
// Example:
//
// opts := localllm.DefaultOptions("/models/bge-m3.gguf")
// opts.GPULayers = 0 // Force CPU mode
// model, err := localllm.LoadModel(opts)
func DefaultOptions(modelPath string) Options {
// Heuristic thread count for hybrid CPU/GPU workloads
threads := runtime.NumCPU() / 2
if threads < 4 {
threads = 4
}
if threads > 8 {
threads = 8 // Diminishing returns beyond 8 for embeddings
}
return Options{
ModelPath: modelPath,
ContextSize: 512, // Enough for most embedding inputs
BatchSize: 512, // Matches context for efficient processing
Threads: threads,
GPULayers: -1, // Auto: offload all layers to GPU
}
}
// LoadModel loads a GGUF model for embedding generation.
//
// The model is memory-mapped for low memory footprint. GPU layers are
// automatically offloaded based on Options.GPULayers:
// - -1: Auto-detect GPU and offload all layers (recommended)
// - 0: CPU only (no GPU offload)
// - N: Offload N layers to GPU
//
// CUDA Optimization (NVIDIA GPU):
//
// When running with CUDA support:
// - All model layers are offloaded to GPU by default
// - Flash attention is enabled for faster inference
// - VRAM is used for tensor storage
// - Typical speedup: 5-10x over CPU-only
//
// Example:
//
// opts := localllm.DefaultOptions("/models/bge-m3.gguf")
// model, err := localllm.LoadModel(opts)
// if err != nil {
// log.Fatalf("Failed to load model: %v", err)
// }
// defer model.Close()
//
// fmt.Printf("Model loaded: %d dimensions\n", model.Dimensions())
func LoadModel(opts Options) (*Model, error) {
cPath := C.CString(opts.ModelPath)
defer C.free(unsafe.Pointer(cPath))
model := C.load_model(cPath, C.int(opts.GPULayers))
if model == nil {
return nil, fmt.Errorf("failed to load model: %s", opts.ModelPath)
}
ctx := C.create_context(model, C.int(opts.ContextSize), C.int(opts.BatchSize), C.int(opts.Threads))
if ctx == nil {
C.free_model(model)
return nil, fmt.Errorf("failed to create context for: %s", opts.ModelPath)
}
return &Model{
model: model,
ctx: ctx,
dims: int(C.get_n_embd(model)),
modelDesc: opts.ModelPath, // Use path as description
}, nil
}
// Embed generates a normalized embedding vector for the given text.
//
// The returned vector is L2-normalized (unit length), suitable for
// cosine similarity calculations. An empty input returns (nil, nil)
// without invoking the model.
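//
// For reference, L2 normalization divides each component by the vector's
// Euclidean length. A minimal sketch of what vector.NormalizeInPlace is
// expected to compute (the real implementation lives in pkg/math/vector):
//
//	var sum float64
//	for _, v := range emb {
//		sum += float64(v) * float64(v)
//	}
//	if norm := float32(math.Sqrt(sum)); norm > 0 {
//		for i := range emb {
//			emb[i] /= norm
//		}
//	}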
//
// Concurrency:
//
// Operations are serialized via mutex because llama.cpp contexts are NOT
// thread-safe. The C.embed call holds the lock for the duration of GPU/CPU
// inference (~5-50ms depending on text length and hardware).
//
// For higher throughput under concurrent load, create multiple Model instances
// (each with its own GPU context). The GPU can process multiple contexts
// efficiently via kernel scheduling.
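//
// A minimal pool sketch (illustrative only; the pool size and variable names
// are not part of this package):
//
//	pool := make(chan *localllm.Model, 2)
//	for i := 0; i < 2; i++ {
//		m, err := localllm.LoadModel(opts)
//		if err != nil {
//			log.Fatal(err)
//		}
//		pool <- m
//	}
//
//	m := <-pool // borrow an instance
//	vec, err := m.Embed(ctx, text)
//	pool <- m   // return it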
//
// GPU Acceleration:
//
// On NVIDIA GPUs with CUDA, the embedding is computed on the GPU:
// 1. Tokenization (CPU)
// 2. Model inference (GPU - flash attention enabled)
// 3. Pooling (GPU)
// 4. Normalization (CPU)
//
// Example:
//
// vec, err := model.Embed(ctx, "graph database")
// if err != nil {
// return err
// }
// fmt.Printf("Embedding: %d dimensions\n", len(vec))
func (m *Model) Embed(ctx context.Context, text string) ([]float32, error) {
if text == "" {
return nil, nil
}
// Lock required: llama.cpp contexts are not thread-safe.
// For higher concurrency, use multiple Model instances.
m.mu.Lock()
defer m.mu.Unlock()
// Check context cancellation
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}
// Tokenize
cText := C.CString(text)
defer C.free(unsafe.Pointer(cText))
// Token buffer is fixed at 512, matching the default ContextSize; longer
// inputs make the C tokenize helper return a negative count and fail below.
tokens := make([]C.int, 512)
n := C.tokenize(m.model, cText, C.int(len(text)), &tokens[0], 512)
if n < 0 {
return nil, fmt.Errorf("tokenization failed for text of length %d", len(text))
}
if n == 0 {
return nil, fmt.Errorf("text produced no tokens")
}
// Generate embedding (GPU-accelerated on CUDA)
emb := make([]float32, m.dims)
result := C.embed(m.ctx, (*C.int)(&tokens[0]), n, (*C.float)(&emb[0]), C.int(m.dims))
if result != 0 {
return nil, fmt.Errorf("embedding generation failed (code: %d)", result)
}
// Normalize to unit vector for cosine similarity
vector.NormalizeInPlace(emb)
return emb, nil
}
// EmbedBatch generates normalized embeddings for multiple texts.
//
// Each text is processed sequentially through the GPU. For maximum throughput
// with many texts, consider parallel processing with multiple Model instances.
//
// Note: True batch processing (multiple texts in a single llama_batch, each
// with its own sequence id) is supported by llama.cpp but would require
// changes to the C embed() helper above. The current per-text loop is still
// reasonable for moderate batch sizes because the model weights and GPU
// context stay resident between calls.
//
// Example:
//
// texts := []string{"hello", "world", "test"}
// vecs, err := model.EmbedBatch(ctx, texts)
// if err != nil {
// return err
// }
// for i, vec := range vecs {
// fmt.Printf("Text %d: %d dims\n", i, len(vec))
// }
func (m *Model) EmbedBatch(ctx context.Context, texts []string) ([][]float32, error) {
results := make([][]float32, len(texts))
for i, t := range texts {
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}
emb, err := m.Embed(ctx, t)
if err != nil {
return nil, fmt.Errorf("text %d: %w", i, err)
}
results[i] = emb
}
return results, nil
}
// Dimensions returns the embedding vector size.
//
// This is determined by the model architecture:
// - BGE-M3: 1024 dimensions
// - E5-large: 1024 dimensions
// - Jina-v2-base-code: 768 dimensions
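//
// A typical guard when a vector store expects a fixed size (1024 is only an
// example value):
//
//	if model.Dimensions() != 1024 {
//		return fmt.Errorf("model produces %d-dim vectors, expected 1024", model.Dimensions())
//	}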
func (m *Model) Dimensions() int { return m.dims }
// ModelDescription returns a human-readable description of the loaded model.
func (m *Model) ModelDescription() string { return m.modelDesc }
// Close releases all resources associated with the model.
//
// After Close is called, the Model must not be used.
// This properly releases GPU memory on CUDA.
func (m *Model) Close() error {
m.mu.Lock()
defer m.mu.Unlock()
if m.ctx != nil {
C.free_ctx(m.ctx)
m.ctx = nil
}
if m.model != nil {
C.free_model(m.model)
m.model = nil
}
return nil
}