From e9c76a1753899c01cfa517810b0871f2819ff17f Mon Sep 17 00:00:00 2001
From: Erik Schultheis
Date: Thu, 2 May 2024 00:11:07 +0300
Subject: [PATCH 1/4] option to keep weights as fp32

---
 train_gpt2.cu | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/train_gpt2.cu b/train_gpt2.cu
index fbc068424..499d33b2c 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1159,7 +1159,7 @@ __device__ inline float lerp(float start, float end, float weight) {
 
 // Termplate type T instead of floatx
 template <typename Tp, typename Tg>
-__global__ void adamw_kernel3(Tp* params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters,
+__global__ void adamw_kernel3(Tp* params_memory, float* master_params, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters,
                               float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay,
                               unsigned int seed) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1176,10 +1176,23 @@ __global__ void adamw_kernel3(Tp* params_memory, Tg* grads_memo
     m /= beta1_correction;  // m_hat
     v /= beta2_correction;  // v_hat
     // update the parameters (weight/bias)
-    float param = (float)params_memory[i] - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * (float)params_memory[i]));
-    unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed);
-    // todo - explain stochastic rounding here
-    stochastic_rounding(param, &params_memory[i], random);
+    float old_param;
+    if(master_params) {
+        old_param = master_params[i];
+    } else {
+        old_param = (float)params_memory[i];
+    }
+    float param = old_param - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * old_param));
+    // if we have master parameters, directly update the two weight copies
+    if(master_params) {
+        params_memory[i] = (floatX)param;;
+        master_params[i] = param;
+    } else {
+        // otherwise, one update with stochastic rounding
+        // todo - explain stochastic rounding here
+        unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed);
+        stochastic_rounding(param, &params_memory[i], random);
+    }
 }
 
 struct SoftmaxParams {
@@ -1822,6 +1835,7 @@ typedef struct {
     // buffers for the AdamW optimizer
     float* m_memory;
     float* v_memory;
+    float* master_weights;     // is NULL unless fp32 weights is enabled.
     // the activations of the model, and their sizes
     ActivationTensors acts;
     size_t act_sizes[NUM_ACTIVATION_TENSORS];
@@ -1840,6 +1854,7 @@ typedef struct {
     float accumulated_mean_loss; // Mean loss after aggregating it on all GPUs
    floatX* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost
     unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc.
+    bool use_master_weights;
 } GPT2;
 
 void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
@@ -1899,6 +1914,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
     model->grads_memory = NULL;
     model->m_memory = NULL;
     model->v_memory = NULL;
+    model->master_weights = NULL;
     model->grads_acts_memory = NULL;
     model->inputs = NULL;
     model->targets = NULL;
@@ -1907,6 +1923,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
     model->seq_len = 0;
     model->mean_loss = -1.0f; // -1.0f will designate no loss
     model->rng_state = 13371337;
+    model->use_master_weights = false;
 }
 
 void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T) {
@@ -2229,6 +2246,10 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo
         cudaCheck(cudaMemset(model->v_memory, 0, model->num_parameters * sizeof(float)));
         printf0("allocated %zu MiB for AdamW optimizer state m\n", (model->num_parameters * sizeof(float)) >> 20);
         printf0("allocated %zu MiB for AdamW optimizer state v\n", (model->num_parameters * sizeof(float)) >> 20);
+        if(model->use_master_weights) {
+            // mixed precision, need to allocate one more buffer
+            cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float)));
+        }
     }
 
     int block_size = 512;
@@ -2236,7 +2257,8 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo
     float beta1_correction = 1.0f - powf(beta1, t);
     float beta2_correction = 1.0f - powf(beta2, t);
     unsigned int seed = random_u32(&model->rng_state);
-    adamw_kernel3<<<num_blocks, block_size>>>((floatX*)model->params_memory, (floatX*)model->grads_memory, model->m_memory, model->v_memory,
+    adamw_kernel3<<<num_blocks, block_size>>>((floatX*)model->params_memory, model->master_weights,
+                                              (floatX*)model->grads_memory, model->m_memory, model->v_memory,
                                               model->num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, seed);
     cudaCheck(cudaGetLastError());
 }
@@ -2408,6 +2430,7 @@ void error_usage() {
     fprintf(stderr, " -g genT, how many steps of inference we do (default = 64)\n");
     fprintf(stderr, " -a overfit a single batch? 0/1. useful for debugging\n");
     fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n");
+    fprintf(stderr, " -c keep f32 copy of weights for the optimizer\n");
     exit(EXIT_FAILURE);
 }
 
@@ -2429,6 +2452,7 @@ int main(int argc, char *argv[]) {
     int overfit_single_batch = 0; // useful for debugging, 1 = only load a single data batch once
     int max_steps = -1;
     int override_enable_tf32 = 1;
+    bool master_weights = false;
     for (int i = 1; i < argc; i+=2) {
         if (i + 1 >= argc) { error_usage(); } // must have arg after flag
         if (argv[i][0] != '-') { error_usage(); } // must start with dash
@@ -2446,6 +2470,7 @@ int main(int argc, char *argv[]) {
         else if (argv[i][1] == 'g') { genT = atoi(argv[i+1]); }
         else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); }
         else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); }
+        else if (argv[i][1] == 'c') { master_weights = (bool)atoi(argv[i+1]); }
         else { error_usage(); }
     }
     printf0("+-----------------------+----------------------------------------------------+\n");
@@ -2462,6 +2487,7 @@ int main(int argc, char *argv[]) {
     printf0("| sample_every          | %-50d |\n", sample_every);
     printf0("| genT                  | %-50d |\n", genT);
     printf0("| overfit_single_batch  | %-50d |\n", overfit_single_batch);
+    printf0("| master_weights        | %-50b |\n", master_weights);
     printf0("+-----------------------+----------------------------------------------------+\n");
 
     // set up the device

From df9f0efde6049c986352356a6628c8332c159c79 Mon Sep 17 00:00:00 2001
From: Erik Schultheis
Date: Thu, 2 May 2024 01:00:36 +0300
Subject: [PATCH 2/4] updated letter and properly free buffer

---
 train_gpt2.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/train_gpt2.cu b/train_gpt2.cu
index 499d33b2c..ba31fa47b 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -2269,6 +2269,7 @@ void gpt2_free(GPT2 *model) {
     cudaCheck(cudaFree(model->grads_memory));
     cudaCheck(cudaFree(model->m_memory));
     cudaCheck(cudaFree(model->v_memory));
+    cudaCheck(cudaFree(model->master_weights));
     cudaCheck(cudaFree(model->acts_memory));
     cudaCheck(cudaFree(model->grads_acts_memory));
     cudaCheck(cudaFree(model->inputs));
@@ -2430,7 +2431,7 @@ void error_usage() {
     fprintf(stderr, " -g genT, how many steps of inference we do (default = 64)\n");
     fprintf(stderr, " -a overfit a single batch? 0/1. useful for debugging\n");
     fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n");
-    fprintf(stderr, " -c keep f32 copy of weights for the optimizer\n");
+    fprintf(stderr, " -w keep f32 copy of weights for the optimizer\n");
     exit(EXIT_FAILURE);
 }
 
@@ -2470,7 +2471,7 @@ int main(int argc, char *argv[]) {
         else if (argv[i][1] == 'g') { genT = atoi(argv[i+1]); }
         else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); }
         else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); }
-        else if (argv[i][1] == 'c') { master_weights = (bool)atoi(argv[i+1]); }
+        else if (argv[i][1] == 'w') { master_weights = (bool)atoi(argv[i+1]); }
         else { error_usage(); }
     }
     printf0("+-----------------------+----------------------------------------------------+\n");

From 795f8b690cc9b3d2255a19941713b34eeff98d7b Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Wed, 1 May 2024 22:50:22 +0000
Subject: [PATCH 3/4] fixes to keep master copy in fp32 of weights optionally

---
 train_gpt2.cu | 40 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/train_gpt2.cu b/train_gpt2.cu
index ba31fa47b..79b59658b 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1176,20 +1176,15 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params, Tg* grads
     m /= beta1_correction;  // m_hat
     v /= beta2_correction;  // v_hat
     // update the parameters (weight/bias)
-    float old_param;
-    if(master_params) {
-        old_param = master_params[i];
-    } else {
-        old_param = (float)params_memory[i];
-    }
+    float old_param = master_params != NULL ? master_params[i] : (float)params_memory[i];
     float param = old_param - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * old_param));
     // if we have master parameters, directly update the two weight copies
-    if(master_params) {
-        params_memory[i] = (floatX)param;;
-        master_params[i] = param;
+    if (master_params != NULL) {
+        params_memory[i] = (floatX)param; // low-precision copy, for use in the forward pass
+        master_params[i] = param; // float copy, for use in the next parameter update
     } else {
-        // otherwise, one update with stochastic rounding
-        // todo - explain stochastic rounding here
+        // without a master copy of params in float, do a direct update in low precision
+        // and use stochastic rounding to mitigate loss of training stability
         unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed);
         stochastic_rounding(param, &params_memory[i], random);
     }
@@ -1290,6 +1285,11 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX*
     }
 }
 
+__global__ void copy_kernel(float* dst, const floatX* src, size_t n) {
+    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) { dst[i] = (float)src[i]; }
+}
+
 // ----------------------------------------------------------------------------
 // kernel launchers
 
@@ -1854,7 +1854,7 @@ typedef struct {
     float accumulated_mean_loss; // Mean loss after aggregating it on all GPUs
     floatX* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost
     unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc.
-    bool use_master_weights;
+    int use_master_weights;
 } GPT2;
 
 void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
@@ -1923,7 +1923,7 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
     model->seq_len = 0;
     model->mean_loss = -1.0f; // -1.0f will designate no loss
     model->rng_state = 13371337;
-    model->use_master_weights = false;
+    model->use_master_weights = 1; // keep master weights copy in float for optim update?
 }
 
 void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T) {
@@ -2246,9 +2246,10 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo
         cudaCheck(cudaMemset(model->v_memory, 0, model->num_parameters * sizeof(float)));
         printf0("allocated %zu MiB for AdamW optimizer state m\n", (model->num_parameters * sizeof(float)) >> 20);
         printf0("allocated %zu MiB for AdamW optimizer state v\n", (model->num_parameters * sizeof(float)) >> 20);
-        if(model->use_master_weights) {
-            // mixed precision, need to allocate one more buffer
+        if (model->use_master_weights == 1) {
+            // allocate one more buffer to keep the master copy of weights as float, and copy the weights over
             cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float)));
+            copy_kernel<<<CEIL_DIV(model->num_parameters, 512), 512>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters);
         }
     }
 
@@ -2431,7 +2432,7 @@ void error_usage() {
     fprintf(stderr, " -g genT, how many steps of inference we do (default = 64)\n");
     fprintf(stderr, " -a overfit a single batch? 0/1. useful for debugging\n");
     fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n");
-    fprintf(stderr, " -w keep f32 copy of weights for the optimizer\n");
+    fprintf(stderr, " -w keep f32 copy of weights for the optimizer? (default: 1)\n");
     exit(EXIT_FAILURE);
 }
 
@@ -2453,7 +2454,7 @@ int main(int argc, char *argv[]) {
     int overfit_single_batch = 0; // useful for debugging, 1 = only load a single data batch once
     int max_steps = -1;
     int override_enable_tf32 = 1;
-    bool master_weights = false;
+    int use_master_weights = 1;
     for (int i = 1; i < argc; i+=2) {
         if (i + 1 >= argc) { error_usage(); } // must have arg after flag
         if (argv[i][0] != '-') { error_usage(); } // must start with dash
@@ -2471,7 +2472,7 @@ int main(int argc, char *argv[]) {
         else if (argv[i][1] == 'g') { genT = atoi(argv[i+1]); }
         else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); }
         else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); }
-        else if (argv[i][1] == 'w') { master_weights = (bool)atoi(argv[i+1]); }
+        else if (argv[i][1] == 'w') { use_master_weights = atoi(argv[i+1]); }
         else { error_usage(); }
     }
     printf0("+-----------------------+----------------------------------------------------+\n");
@@ -2488,7 +2489,7 @@ int main(int argc, char *argv[]) {
     printf0("| sample_every          | %-50d |\n", sample_every);
     printf0("| genT                  | %-50d |\n", genT);
     printf0("| overfit_single_batch  | %-50d |\n", overfit_single_batch);
-    printf0("| master_weights        | %-50b |\n", master_weights);
+    printf0("| use_master_weights    | %-50s |\n", use_master_weights ? "enabled" : "disabled");
     printf0("+-----------------------+----------------------------------------------------+\n");
 
     // set up the device
@@ -2525,6 +2526,7 @@ int main(int argc, char *argv[]) {
     // build the GPT-2 model from a checkpoint
     GPT2 model;
     gpt2_build_from_checkpoint(&model, load_filename);
+    model.use_master_weights = use_master_weights;
     printf0("| load_filename         | %-50s |\n", load_filename);
     printf0("| max_sequence_length T | %-50d |\n", model.config.max_seq_len);
     printf0("| vocab_size V          | %-50d |\n", model.config.vocab_size);

From c177c2694cde28a7bfb22ca98f0db58cdf2c6b78 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Wed, 1 May 2024 23:06:23 +0000
Subject: [PATCH 4/4] make nice and print

---
 train_gpt2.cu | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/train_gpt2.cu b/train_gpt2.cu
index 79b59658b..8412658f1 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1285,7 +1285,8 @@ __global__ void fused_classifier_kernel3(floatX* logits, floatX* losses, floatX*
     }
 }
 
-__global__ void copy_kernel(float* dst, const floatX* src, size_t n) {
+__global__ void copy_and_cast_kernel(float* dst, const floatX* src, size_t n) {
+    // a small kernel to copy and cast, i.e. `dst <- (float) src`
     const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n) { dst[i] = (float)src[i]; }
 }
@@ -2249,7 +2250,9 @@ void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, flo
         if (model->use_master_weights == 1) {
             // allocate one more buffer to keep the master copy of weights as float, and copy the weights over
             cudaCheck(cudaMalloc((void**)&model->master_weights, model->num_parameters * sizeof(float)));
-            copy_kernel<<<CEIL_DIV(model->num_parameters, 512), 512>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters);
+            copy_and_cast_kernel<<<CEIL_DIV(model->num_parameters, 512), 512>>>(model->master_weights, (floatX*)model->params_memory, model->num_parameters);
+            cudaCheck(cudaGetLastError());
+            printf0("allocated %zu MiB for master copy of params\n", (model->num_parameters * sizeof(float)) >> 20);
         }
     }