# --- Configuration for RustyFlow Training and Chatting ---
# This file is sourced by train.sh and chat.sh.
# You can create multiple config files and pass the path as an argument.
# e.g., ./train.sh my_wikitext_config.env
# --- Common Settings ---
# DATASET selects the training corpus. Accepted values: tinyshakespeare,
# wikitext-2, short, or a path to a text file.
# Note: the device (CPU or GPU) is not configured in this file; it is
# determined by which script you run (e.g., train_cpu.sh vs train_gpu.sh).
DATASET='tinyshakespeare'
#DATASET='wikitext-2'
# SEQ_LEN is the sequence length for training and the context length for chat.
SEQ_LEN='64'
# --- Training Hyperparameters ---
# Number of training epochs.
NUM_EPOCHS='10'
# Training batch size.
BATCH_SIZE='16'
# Learning rate used for training.
LEARNING_RATE='0.01'
# --- Model Architecture ---
# These three values are also embedded in the MODEL_PATH file name, so
# changing any of them changes which model file is written/read.
# Embedding dimension.
EMBED_DIM='128'
# Number of heads.
# NOTE(review): multi-head models usually require EMBED_DIM to be divisible
# by NUM_HEADS -- confirm whether that constraint applies here.
NUM_HEADS='4'
# Number of layers.
NUM_LAYERS='2'
# Path the trained model is saved to, and loaded from when chatting.
# The training and chat scripts automatically append '-cpu.bin' or '-gpu.bin'
# to this path, depending on which script is run.
# NOTE(review): the value built here already ends in '.bin'; confirm how the
# scripts combine it with the device suffix.
# NOTE(review): if DATASET is a path to a text file, its slashes become part
# of MODEL_PATH -- confirm the resulting directory exists or is created.
# Built in two steps: directory + dataset first, then the architecture suffix.
MODEL_PATH="models/${DATASET}"
MODEL_PATH="${MODEL_PATH}-L${NUM_LAYERS}-H${NUM_HEADS}-E${EMBED_DIM}.bin"
# --- Chat Hyperparameters ---
# (Chat uses SEQ_LEN as its context length, but that value is loaded from
# the model file.)
# Sampling temperature: lower values are more deterministic, higher values
# are more random (e.g., 0.8).
TEMPERATURE='0.8'
# Top-p (nucleus) sampling threshold: the model samples from the smallest set
# of tokens whose cumulative probability exceeds p (e.g., 0.9).
# A value of 1.0 disables it.
TOP_P='0.9'