train-qwen-standard.py
#!/usr/bin/env python3
"""
ZigNet Fine-Tuning Script - QLoRA Standard (NO Unsloth)
Optimized for RTX 3090 (24GB VRAM)

Usage:
    python scripts/train-qwen-standard.py

Environment:
    - CUDA 12.7
    - PyTorch 2.4.1
    - transformers 4.45.2
    - trl 0.11.4
    - peft 0.13.2
    - bitsandbytes 0.44.1
"""

import os
import json
import torch
from datetime import datetime
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

# ============================================================================
# CONFIGURATION
# ============================================================================

# Model settings
MODEL_NAME = "Qwen/Qwen2.5-Coder-7B-Instruct"
MAX_SEQ_LENGTH = 2048  # Zig code can be long
USE_FLASH_ATTENTION = False  # Flash Attention 2 (requires flash-attn package)

# LoRA hyperparameters
LORA_R = 16  # Rank (higher = more parameters, better quality, slower)
LORA_ALPHA = 32  # Scaling factor (typically 2x rank)
LORA_DROPOUT = 0.05  # Prevent overfitting
LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]  # All attention + MLP projections

# Training hyperparameters
OUTPUT_DIR = "./models/zignet-qwen-7b"
NUM_EPOCHS = 3
BATCH_SIZE = 4  # Per GPU (RTX 3090 can handle 4-8)
GRADIENT_ACCUMULATION_STEPS = 4  # Effective batch size = 4 * 4 = 16
LEARNING_RATE = 2e-4
WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01
LR_SCHEDULER_TYPE = "cosine"
SAVE_STRATEGY = "steps"
SAVE_STEPS = 500
LOGGING_STEPS = 50
EVAL_STRATEGY = "steps"
EVAL_STEPS = 500
FP16 = True  # Use mixed precision (RTX 3090 supports fp16)

# Dataset paths
TRAIN_DATA = "./data/training/dataset-train.jsonl"
VAL_DATA = "./data/training/dataset-validation.jsonl"
TEST_DATA = "./data/training/dataset-test.jsonl"

# HuggingFace upload (optional)
PUSH_TO_HUB = False
HF_REPO_NAME = "fulgidus/zignet-qwen2.5-coder-7b"

# ============================================================================
# FUNCTIONS
# ============================================================================


def load_training_data():
    """Load and format datasets"""
    print("📂 Loading datasets...")

    dataset = load_dataset(
        "json",
        data_files={
            "train": TRAIN_DATA,
            "validation": VAL_DATA,
            "test": TEST_DATA,
        },
    )

    print(f"✅ Loaded {len(dataset['train'])} training examples")
    print(f"✅ Loaded {len(dataset['validation'])} validation examples")
    print(f"✅ Loaded {len(dataset['test'])} test examples")

    return dataset


def format_prompt_alpaca(example):
    """
    Format dataset example into Alpaca-style prompt

    Expected format in JSONL:
    {
        "instruction": "Explain this Zig code",
        "input": "fn add(a: i32, b: i32) i32 { return a + b; }",
        "output": "This function adds two i32 integers..."
    }
    """
    if example.get("input"):
        prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{example['instruction']}

### Input:
{example['input']}

### Response:
{example['output']}"""
    else:
        prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{example['instruction']}

### Response:
{example['output']}"""

    return {"text": prompt}
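
# For reference, a record with an "input" field renders roughly as follows
# (a sketch based on the template above; the record values are the docstring's
# example, not real dataset content):
#
#   Below is an instruction that describes a task, paired with an input that
#   provides further context. Write a response that appropriately completes
#   the request.
#
#   ### Instruction:
#   Explain this Zig code
#
#   ### Input:
#   fn add(a: i32, b: i32) i32 { return a + b; }
#
#   ### Response:
#   This function adds two i32 integers...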

def setup_model():
    """Setup model with QLoRA (4-bit quantization + LoRA adapters)"""
    print(f"🔧 Loading model: {MODEL_NAME}")

    # BitsAndBytes config for 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,  # Nested quantization for better compression
        bnb_4bit_quant_type="nf4",  # NormalFloat4 (best for fine-tuning)
        bnb_4bit_compute_dtype=torch.float16,  # Compute in fp16 for speed
    )

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",  # Automatic GPU distribution
        trust_remote_code=True,
        attn_implementation="flash_attention_2" if USE_FLASH_ATTENTION else None,
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
    )
    tokenizer.pad_token = tokenizer.eos_token  # Qwen uses eos as pad
    tokenizer.padding_side = "right"  # For training stability

    # Prepare model for k-bit training (PEFT requirement)
    model = prepare_model_for_kbit_training(model)

    # Configure LoRA adapters
    peft_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGET_MODULES,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Add LoRA adapters to model
    model = get_peft_model(model, peft_config)

    # Print trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print("✅ Model loaded:")
    print(f"   - Total params: {total_params:,}")
    print(f"   - Trainable params: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")

    return model, tokenizer


def train_model(model, tokenizer, dataset):
    """Train model with SFTTrainer"""
    print("🚀 Starting training...")

    # Format datasets
    train_dataset = dataset["train"].map(
        format_prompt_alpaca, remove_columns=dataset["train"].column_names
    )
    eval_dataset = dataset["validation"].map(
        format_prompt_alpaca, remove_columns=dataset["validation"].column_names
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=LEARNING_RATE,
        warmup_steps=WARMUP_STEPS,
        weight_decay=WEIGHT_DECAY,
        lr_scheduler_type=LR_SCHEDULER_TYPE,
        save_strategy=SAVE_STRATEGY,
        save_steps=SAVE_STEPS,
        logging_steps=LOGGING_STEPS,
        evaluation_strategy=EVAL_STRATEGY,
        eval_steps=EVAL_STEPS,
        fp16=FP16,
        gradient_checkpointing=True,  # Save VRAM at cost of speed
        optim="paged_adamw_8bit",  # Memory-efficient optimizer
        push_to_hub=PUSH_TO_HUB,
        hub_model_id=HF_REPO_NAME if PUSH_TO_HUB else None,
        report_to="none",  # Disable wandb/tensorboard
        save_total_limit=3,  # Keep only last 3 checkpoints
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )

    # Create trainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        max_seq_length=MAX_SEQ_LENGTH,
        dataset_text_field="text",  # Column with formatted prompts
        packing=False,  # Don't pack multiple examples (for clarity)
    )
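
    # Note (a sketch, not used by default): if a run is interrupted,
    # transformers' Trainer can resume from the most recent checkpoint
    # in OUTPUT_DIR by calling
    #
    #     trainer.train(resume_from_checkpoint=True)
    #
    # instead of the plain trainer.train() below.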
print(f"📊 Training config:") print(f" - Epochs: {NUM_EPOCHS}") print(f" - Batch size: {BATCH_SIZE}") print(f" - Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}") print(f" - Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}") print(f" - Learning rate: {LEARNING_RATE}") print(f" - Total training steps: ~{len(train_dataset) * NUM_EPOCHS // (BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS)}") print(f" - Estimated time: 6-10 hours on RTX 3090") print() start_time = datetime.now() trainer.train() end_time = datetime.now() training_time = (end_time - start_time).total_seconds() / 3600 # Hours print(f"\n✅ Training complete in {training_time:.2f} hours") # Save final model print("💾 Saving model...") trainer.save_model(f"{OUTPUT_DIR}/final") # Save training stats stats = { "model": MODEL_NAME, "training_time_hours": training_time, "num_epochs": NUM_EPOCHS, "batch_size": BATCH_SIZE, "learning_rate": LEARNING_RATE, "train_examples": len(train_dataset), "eval_examples": len(eval_dataset), "lora_r": LORA_R, "lora_alpha": LORA_ALPHA, "timestamp": datetime.now().isoformat(), } with open(f"{OUTPUT_DIR}/training_stats.json", "w") as f: json.dump(stats, f, indent=2) print(f"✅ Model saved to {OUTPUT_DIR}/final") print(f"✅ Stats saved to {OUTPUT_DIR}/training_stats.json") return trainer def test_inference(model, tokenizer): """Test the fine-tuned model with sample prompts""" print("\n🧪 Testing inference...") test_prompts = [ "Explain this Zig code:\nfn factorial(n: u32) u32 { if (n <= 1) return 1; return n * factorial(n - 1); }", "Write a Zig function to calculate Fibonacci numbers using comptime", "Fix this Zig error: expected type 'i32', found '[]const u8'", ] for i, prompt in enumerate(test_prompts, 1): print(f"\n--- Test {i} ---") print(f"Prompt: {prompt}") inputs = tokenizer(prompt, return_tensors="pt").to(model.device) with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=256, temperature=0.7, top_p=0.9, do_sample=True, ) response = tokenizer.decode(outputs[0], skip_special_tokens=True) print(f"Response: {response[len(prompt):]}") # Remove prompt from output def main(): """Main training pipeline""" print("=" * 80) print("ZigNet Fine-Tuning - QLoRA (Standard, NO Unsloth)") print("=" * 80) print(f"Model: {MODEL_NAME}") print(f"Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}") print(f"CUDA: {torch.version.cuda}") print(f"PyTorch: {torch.__version__}") print("=" * 80) print() # Check CUDA if not torch.cuda.is_available(): raise RuntimeError("CUDA not available! This script requires GPU.") # Create output directory os.makedirs(OUTPUT_DIR, exist_ok=True) # Pipeline dataset = load_training_data() model, tokenizer = setup_model() trainer = train_model(model, tokenizer, dataset) test_inference(model, tokenizer) print("\n" + "=" * 80) print("✅ ALL DONE! Model ready for deployment.") print(f"📦 Next steps:") print(f" 1. Test model: python -c 'from transformers import AutoModel; ...'") print(f" 2. Convert to GGUF: python convert.py {OUTPUT_DIR}/final") print(f" 3. Quantize: ./quantize zignet-qwen-7b.gguf zignet-qwen-7b-Q4_K_M.gguf Q4_K_M") print(f" 4. Upload to HuggingFace: huggingface-cli upload {HF_REPO_NAME}") print("=" * 80) if __name__ == "__main__": main()
