From e9c76a1753899c01cfa517810b0871f2819ff17f Mon Sep 17 00:00:00 2001
From: Erik Schultheis
Date: Thu, 2 May 2024 00:11:07 +0300
Subject: [PATCH 1/4] option to keep weights as fp32

---
 train_gpt2.cu | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/train_gpt2.cu b/train_gpt2.cu
index fbc068424..499d33b2c 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1159,7 +1159,7 @@ __device__ inline float lerp(float start, float end, float weight) {
 
 // Termplate type T instead of floatx
 template <typename Tp, typename Tg>
-__global__ void adamw_kernel3(Tp* params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters,
+__global__ void adamw_kernel3(Tp* params_memory, float* master_params, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters,
                               float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay,
                               unsigned int seed) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1176,10 +1176,23 @@ __global__ void adamw_kernel3(Tp* params_memory, Tg* grads_memo
     m /= beta1_correction;  // m_hat
     v /= beta2_correction;  // v_hat
     // update the parameters (weight/bias)
-    float param = (float)params_memory[i] - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * (float)params_memory[i]));
-    unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed);
-    // todo - explain stochastic rounding here
-    stochastic_rounding(param, &params_memory[i], random);
+    float old_param;
+    if(master_params) {
+        old_param = master_params[i];
+    } else {
+        old_param = (float)params_memory[i];
+    }
+    float param = old_param - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * old_param));
+    // if we have master parameters, directly update the two weight copies
+    if(master_params) {
+        params_memory[i] = (floatX)param;;
+        master_params[i] = param;
+    } else {
+        // otherwise, one update with stochastic rounding
+        // todo - explain stochastic rounding here
+        unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed);
+        stochastic_rounding(param, &params_memory[i], random);
+    }
 }
 
 struct SoftmaxParams {
@@ -1822,6 +1835,7 @@ typedef struct {
     // buffers for the AdamW optimizer
     float* m_memory;
     float* v_memory;
+    float* master_weights;     // is NULL unless fp32 weights is enabled.
     // the activations of the model, and their sizes
     ActivationTensors acts;
     size_t act_sizes[NUM_ACTIVATION_TENSORS];
@@ -1840,6 +1854,7 @@ typedef struct {
     float accumulated_mean_loss; // Mean loss after aggregating it on all GPUs
    floatX* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost
     unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc.
+    bool use_master_weights;
 } GPT2;
 
 void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
@@ -1899,6 +1914,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
     model->grads_memory = NULL;
     model->m_memory = NULL;
     model->v_memory = NULL;
+    model->master_weights = NULL;
     model->grads_acts_memory = NULL;
     model->inputs = NULL;
     model->targets = NULL;
@@ -1907,6 +1923,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
     model->seq_len = 0;
     model->mean_loss = -1.0f; // -1.0f will designate no loss
     model->rng_state = 13371337;
+    model->use_master_weights = false;
 }
 
 void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T) {
@@ -2229,6 +2246,10 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo
         cudaCheck(cudaMemset(model->v_memory, 0, model->num_parameters * sizeof(float)));
         printf0("allocated %zu MiB for AdamW optimizer state m\n", (model->num_parameters * sizeof(float)) >> 20);
         printf0("allocated %zu MiB for AdamW optimizer state v\n", (model->num_parameters * sizeof(float)) >> 20);
+        if(model->use_master_weights) {
+            // mixed precision, need to allocate one more buffer
+            cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float)));
+        }
     }
 
     int block_size = 512;
@@ -2236,7 +2257,8 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo
     float beta1_correction = 1.0f - powf(beta1, t);
     float beta2_correction = 1.0f - powf(beta2, t);
     unsigned int seed = random_u32(&model->rng_state);
-    adamw_kernel3<<<num_blocks, block_size>>>((floatX*)model->params_memory, (floatX*)model->grads_memory, model->m_memory, model->v_memory,
+    adamw_kernel3<<<num_blocks, block_size>>>((floatX*)model->params_memory, model->master_weights,
+                                              (floatX*)model->grads_memory, model->m_memory, model->v_memory,
                                               model->num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed);
     cudaCheck(cudaGetLastError());
 }
@@ -2408,6 +2430,7 @@ void error_usage() {
     fprintf(stderr, " -g genT, how many steps of inference we do (default = 64)\n");
     fprintf(stderr, " -a overfit a single batch? 0/1. useful for debugging\n");
     fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n");
+    fprintf(stderr, " -c keep f32 copy of weights for the optimizer\n");
     exit(EXIT_FAILURE);
 }
 
@@ -2429,6 +2452,7 @@ int main(int argc, char *argv[]) {
     int overfit_single_batch = 0; // useful for debugging, 1 = only load a single data batch once
     int max_steps = -1;
     int override_enable_tf32 = 1;
+    bool master_weights = false;
     for (int i = 1; i < argc; i+=2) {
         if (i + 1 >= argc) { error_usage(); } // must have arg after flag
         if (argv[i][0] != '-') { error_usage(); } // must start with dash
@@ -2446,6 +2470,7 @@ int main(int argc, char *argv[]) {
         else if (argv[i][1] == 'g') { genT = atoi(argv[i+1]); }
         else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); }
         else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); }
+        else if (argv[i][1] == 'c') { master_weights = (bool)atoi(argv[i+1]); }
         else { error_usage(); }
     }
     printf0("+-----------------------+----------------------------------------------------+\n");
@@ -2462,6 +2487,7 @@ int main(int argc, char *argv[]) {
     printf0("| sample_every          | %-50d |\n", sample_every);
     printf0("| genT                  | %-50d |\n", genT);
     printf0("| overfit_single_batch  | %-50d |\n", overfit_single_batch);
+    printf0("| master_weights        | %-50b |\n", master_weights);
     printf0("+-----------------------+----------------------------------------------------+\n");
 
     // set up the device

From df9f0efde6049c986352356a6628c8332c159c79 Mon Sep 17 00:00:00 2001
From: Erik Schultheis
Date: Thu, 2 May 2024 01:00:36 +0300
Subject: [PATCH 2/4] updated letter and properly free buffer

---
 train_gpt2.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/train_gpt2.cu b/train_gpt2.cu
index 499d33b2c..ba31fa47b 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -2269,6 +2269,7 @@ void gpt2_free(GPT2 *model) {
     cudaCheck(cudaFree(model->grads_memory));
     cudaCheck(cudaFree(model->m_memory));
     cudaCheck(cudaFree(model->v_memory));
+    cudaCheck(cudaFree(model->master_weights));
     cudaCheck(cudaFree(model->acts_memory));
     cudaCheck(cudaFree(model->grads_acts_memory));
     cudaCheck(cudaFree(model->inputs));
@@ -2430,7 +2431,7 @@ void error_usage() {
     fprintf(stderr, " -g genT, how many steps of inference we do (default = 64)\n");
     fprintf(stderr, " -a overfit a single batch? 0/1. useful for debugging\n");
     fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n");
-    fprintf(stderr, " -c keep f32 copy of weights for the optimizer\n");
+    fprintf(stderr, " -w keep f32 copy of weights for the optimizer\n");
     exit(EXIT_FAILURE);
 }
 
@@ -2470,7 +2471,7 @@ int main(int argc, char *argv[]) {
         else if (argv[i][1] == 'g') { genT = atoi(argv[i+1]); }
         else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); }
         else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); }
-        else if (argv[i][1] == 'c') { master_weights = (bool)atoi(argv[i+1]); }
+        else if (argv[i][1] == 'w') { master_weights = (bool)atoi(argv[i+1]); }
         else { error_usage(); }
     }
     printf0("+-----------------------+----------------------------------------------------+\n");

From 795f8b690cc9b3d2255a19941713b34eeff98d7b Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Wed, 1 May 2024 22:50:22 +0000
Subject: [PATCH 3/4] fixes to keep master copy in fp32 of weights optionally

---
 train_gpt2.cu | 40 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/train_gpt2.cu b/train_gpt2.cu
index ba31fa47b..79b59658b 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1176,20 +1176,15 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params, Tg* grads
     m /= beta1_correction;  // m_hat
     v /= beta2_correction;  // v_hat
     // update the parameters (weight/bias)
-    float old_param;
-    if(master_params) {
-        old_param = master_params[i];
-    } else {
-        old_param = (float)params_memory[i];
-    }
+    float old_param = master_params != NULL ? master_params[i] : (float)params_memory[i];
     float param = old_param - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * old_param));
     // if we have master parameters, directly update the two weight copies
-    if(master_params) {
-        params_memory[i] = (floatX)param;;
-        master_params[i] = param;
+    if (master_params != NULL) {
+        params_memory[i] = (floatX)param; // low-precision copy, for use in the forward pass
+        master_params[i] = param; // float copy, for use in the next parameter update
     } else {
-        // otherwise, one update with stochastic rounding
-        // todo - explain stochastic rounding here
+        // without a master copy of params in float, do a direct update in low precision
+        // and use stochastic rounding to mitigate loss of training stability
         unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed);
         stochastic_rounding(param, &params_memory[i], random);
     }
@@ -1290,6 +1285,11 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX*
     }
 }
 
+__global__ void copy_kernel(float* dst, const floatX* src, size_t n) {
+    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) { dst[i] = (float)src[i]; }
+}
+
 // ----------------------------------------------------------------------------
 // kernel launchers
 
@@ -1854,7 +1854,7 @@ typedef struct {
     float accumulated_mean_loss; // Mean loss after aggregating it on all GPUs
     floatX* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost
     unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc.
-    bool use_master_weights;
+    int use_master_weights;
 } GPT2;
 
 void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
@@ -1923,7 +1923,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
     model->seq_len = 0;
     model->mean_loss = -1.0f; // -1.0f will designate no loss
     model->rng_state = 13371337;
-    model->use_master_weights = false;
+    model->use_master_weights = 1; // keep master weights copy in float for optim update?
 }
 
 void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T) {
@@ -2246,9 +2246,10 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo
         cudaCheck(cudaMemset(model->v_memory, 0, model->num_parameters * sizeof(float)));
         printf0("allocated %zu MiB for AdamW optimizer state m\n", (model->num_parameters * sizeof(float)) >> 20);
         printf0("allocated %zu MiB for AdamW optimizer state v\n", (model->num_parameters * sizeof(float)) >> 20);
-        if(model->use_master_weights) {
-            // mixed precision, need to allocate one more buffer
+        if (model->use_master_weights == 1) {
+            // allocate one more buffer to keep the master copy of weights as float, and copy the weights over
             cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float)));
+            copy_kernel<<<CEIL_DIV(model->num_parameters, 512), 512>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters);
         }
     }
 
@@ -2431,7 +2432,7 @@ void error_usage() {
     fprintf(stderr, " -g genT, how many steps of inference we do (default = 64)\n");
     fprintf(stderr, " -a overfit a single batch? 0/1. useful for debugging\n");
     fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n");
-    fprintf(stderr, " -w keep f32 copy of weights for the optimizer\n");
+    fprintf(stderr, " -w keep f32 copy of weights for the optimizer? (default: 1)\n");
     exit(EXIT_FAILURE);
 }
 
@@ -2453,7 +2454,7 @@ int main(int argc, char *argv[]) {
     int overfit_single_batch = 0; // useful for debugging, 1 = only load a single data batch once
     int max_steps = -1;
     int override_enable_tf32 = 1;
-    bool master_weights = false;
+    int use_master_weights = 1;
     for (int i = 1; i < argc; i+=2) {
         if (i + 1 >= argc) { error_usage(); } // must have arg after flag
         if (argv[i][0] != '-') { error_usage(); } // must start with dash
@@ -2471,7 +2472,7 @@ int main(int argc, char *argv[]) {
         else if (argv[i][1] == 'g') { genT = atoi(argv[i+1]); }
         else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); }
         else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); }
-        else if (argv[i][1] == 'w') { master_weights = (bool)atoi(argv[i+1]); }
+        else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); }
         else { error_usage(); }
     }
     printf0("+-----------------------+----------------------------------------------------+\n");
@@ -2488,7 +2489,7 @@ int main(int argc, char *argv[]) {
     printf0("| sample_every          | %-50d |\n", sample_every);
     printf0("| genT                  | %-50d |\n", genT);
     printf0("| overfit_single_batch  | %-50d |\n", overfit_single_batch);
-    printf0("| master_weights        | %-50b |\n", master_weights);
+    printf0("| use_master_weights    | %-50s |\n", use_master_weights ? "enabled" : "disabled");
     printf0("+-----------------------+----------------------------------------------------+\n");
 
     // set up the device
@@ -2525,6 +2526,7 @@ int main(int argc, char *argv[]) {
     // build the GPT-2 model from a checkpoint
     GPT2 model;
     gpt2_build_from_checkpoint(&model, load_filename);
+    model.use_master_weights = use_master_weights;
     printf0("| load_filename         | %-50s |\n", load_filename);
     printf0("| max_sequence_length T | %-50d |\n", model.config.max_seq_len);
     printf0("| vocab_size V          | %-50d |\n", model.config.vocab_size);

From c177c2694cde28a7bfb22ca98f0db58cdf2c6b78 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Wed, 1 May 2024 23:06:23 +0000
Subject: [PATCH 4/4] make nice and print

---
 train_gpt2.cu | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/train_gpt2.cu b/train_gpt2.cu
index 79b59658b..8412658f1 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1285,7 +1285,8 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX*
     }
 }
 
-__global__ void copy_kernel(float* dst, const floatX* src, size_t n) {
+__global__ void copy_and_cast_kernel(float* dst, const floatX* src, size_t n) {
+    // a small kernel to copy and cast, i.e. `dst <- (float) src`
     const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n) { dst[i] = (float)src[i]; }
 }
@@ -2249,7 +2250,9 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo
         if (model->use_master_weights == 1) {
             // allocate one more buffer to keep the master copy of weights as float, and copy the weights over
             cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float)));
-            copy_kernel<<<CEIL_DIV(model->num_parameters, 512), 512>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters);
+            copy_and_cast_kernel<<<CEIL_DIV(model->num_parameters, 512), 512>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters);
+            cudaCheck(cudaGetLastError());
+            printf0("allocated %zu MiB for master copy of params\n", (model->num_parameters * sizeof(float)) >> 20);
         }
     }