diff --git a/README.md b/README.md
index 6bb1f57d8..cd3b37c63 100644
--- a/README.md
+++ b/README.md
@@ -126,17 +126,17 @@ sudo apt-get -y install libcudnn9-dev-cuda-12
 
 On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. Simply clone the repo to your disk. The Makefile currently looks for it in either your home directory or the current directory. If you have put it elsewhere, add `CUDNN_FRONTEND_PATH=/path/to/your/cudnn-frontend/include` to the `make` command-line.
 
-**multi-GPU training**. As of April 26, 2024 there is now also support for multi-GPU training using MPI and NCCL. Make sure you install MPI, e.g. on Linux:
+**multi-GPU training**. Support for multi-GPU training is available using NCCL. Make sure you download and install [NCCL](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html), e.g. on Linux:
 
 ```bash
-sudo apt install openmpi-bin openmpi-doc libopenmpi-dev
+sudo apt install libnccl2 libnccl-dev
 ```
 
 and then:
 
 ```bash
 make train_gpt2cu
-mpirun -np <number of GPUs> ./train_gpt2cu
+mpirun -np <number of GPUs> bash -c './train_gpt2cu -pn <number of GPUs> -pr $OMPI_COMM_WORLD_RANK'
 ```
 
 **multi-node training**. For SLURM enabled cluster, use the sample script in [scripts/run_gpt2_124M.sbatch](scripts/run_gpt2_124M.sbatch)
diff --git a/scripts/README.md b/scripts/README.md
index 876005955..0cd53d932 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -21,7 +21,7 @@ Long story short, try `-r 1` (recompute GeLU, trading off speed and memory) to c
 It might be that you only have one GPU and not a whole box of them. Every script is fairly easy to change for just a single GPU. For llm.c, simply change line 1 to line 2 and leave everything else the same:
 
 ```bash
-mpirun -np 8 ./train_gpt2cu \
+mpirun -np 8 bash -c './train_gpt2cu -pn 8 -pr $OMPI_COMM_WORLD_RANK'
 ./train_gpt2cu \
 ```
 
diff --git a/scripts/run_gpt2_124M.sbatch b/scripts/run_gpt2_124M.sbatch
index 7ed82817b..d5158600a 100644
--- a/scripts/run_gpt2_124M.sbatch
+++ b/scripts/run_gpt2_124M.sbatch
@@ -9,15 +9,15 @@ cd /dfs/llm.c/ # path to the repo in distributed file system
 mkdir -p log124M
 
-export DFS_PATH="/dfs/llm.c/log124M" # this path will be used to save nccl unique id and sync it between processes
 
 # export NCCL_SOCKET_IFNAME=ib0 # network interface Ethernet or InifiniBand which enables gpu direct rdma
 # export NCCL_IB_HCA=mlx5_0,mlx5_1 # list of all InfiniBand devices available if available
 
 # GPT-2 (124M) repro on FineWeb100B
-# Batch size is set to (1024 * 64) * 32
-srun ./train_gpt2cu \
-    -i "dev/data/fineweb100B/fineweb_train_*.bin" \
-    -j "dev/data/fineweb100B/fineweb_val_*.bin" \
+# Global batch size is set to (1024 * 64) * 32
+srun bash -c "
+    ./train_gpt2cu \
+    -i 'dev/data/fineweb100B/fineweb_train_*.bin' \
+    -j 'dev/data/fineweb100B/fineweb_val_*.bin' \
     -o "log124M" \
     -v 250 -s 20000 -g 144 \
     -h 1 \
@@ -31,4 +31,8 @@ srun ./train_gpt2cu \
     -u 700 \
     -n 10000 \
     -y 1 \
-    -e "d12"
+    -e d12 \
+    -pn 32 \
+    -pr \$SLURM_PROCID \
+    -pg 8 \
+    -pd '/dfs/llm.c/log124M'"
diff --git a/scripts/run_gpt2_124M.sh b/scripts/run_gpt2_124M.sh
index 9ca1f0822..d2b41fa4d 100755
--- a/scripts/run_gpt2_124M.sh
+++ b/scripts/run_gpt2_124M.sh
@@ -20,9 +20,10 @@ while true; do
 
     # run python dev/data/fineweb.py --version 10B to prepro data
     # run python dev/data/hellaswag.py to prepro hellaswag eval
-    mpirun -np 8 ./train_gpt2cu \
-        -i "dev/data/fineweb10B/fineweb_train_*.bin" \
-        -j "dev/data/fineweb10B/fineweb_val_*.bin" \
+    mpirun -np 8 bash -c "
+        ./train_gpt2cu \
+        -i 'dev/data/fineweb10B/fineweb_train_*.bin' \
+        -j 'dev/data/fineweb10B/fineweb_val_*.bin' \
         -o $out_dir \
         -v 250 -s 20000 -g 144 \
         -h 1 \
@@ -36,7 +37,9 @@ while true; do
         -u 700 \
         -n 5000 \
         -y 1 \
-        -e "d12"
+        -e "d12" \
+        -pn 8 \
+        -pr \$OMPI_COMM_WORLD_RANK"
 
     sleep 1
 done
diff --git a/scripts/run_gpt2_350M.sh b/scripts/run_gpt2_350M.sh
index 1f9defc12..d144ac1b1 100644
--- a/scripts/run_gpt2_350M.sh
+++ b/scripts/run_gpt2_350M.sh
@@ -20,9 +20,10 @@ while true; do
 
     # run python dev/data/fineweb.py --version 100B to prepro data
     # run python dev/data/hellaswag.py to prepro hellaswag eval
-    mpirun -np 8 ./train_gpt2cu \
-        -i "dev/data/fineweb100B/fineweb_train_*.bin" \
-        -j "dev/data/fineweb100B/fineweb_val_*.bin" \
+    mpirun -np 8 bash -c "
+        ./train_gpt2cu \
+        -i 'dev/data/fineweb100B/fineweb_train_*.bin' \
+        -j 'dev/data/fineweb100B/fineweb_val_*.bin' \
         -o $out_dir \
         -v 250 -s 100000 -g 144 \
         -h 1 \
@@ -37,7 +38,9 @@ while true; do
         -n 2000 \
         -x 60000 \
         -y 1 \
-        -e "d24"
+        -e "d24" \
+        -pn 8 \
+        -pr \$OMPI_COMM_WORLD_RANK"
 
     sleep 1
 done
diff --git a/scripts/run_gpt3_124M.sh b/scripts/run_gpt3_124M.sh
index bde1e6859..426d83701 100644
--- a/scripts/run_gpt3_124M.sh
+++ b/scripts/run_gpt3_124M.sh
@@ -20,9 +20,10 @@ while true; do
 
     # run python dev/data/fineweb.py --version 10B to prepro data
     # run python dev/data/hellaswag.py to prepro hellaswag eval
-    mpirun -np 8 ./train_gpt2cu \
-        -i "dev/data/fineweb100B/fineweb_train_*.bin" \
-        -j "dev/data/fineweb100B/fineweb_val_*.bin" \
+    mpirun -np 8 bash -c "
+        ./train_gpt2cu \
+        -i 'dev/data/fineweb100B/fineweb_train_*.bin' \
+        -j 'dev/data/fineweb100B/fineweb_val_*.bin' \
         -o $out_dir \
         -v 250 -s 20000 -g 144 \
         -h 1 \
@@ -37,7 +38,9 @@ while true; do
         -n 10000 \
         -y 1 \
         -x 565950 \
-        -e "d12"
+        -e "d12" \
+        -pn 8 \
+        -pr \$OMPI_COMM_WORLD_RANK"
 
     sleep 1
 done
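For context on how the new flags compose (this sketch is not part of the patch above, and the exact flag semantics are an assumption inferred from the flags this change introduces): `-pn` tells each process the total process count, `-pr` its own rank, and `-pd` a directory all processes can read and write to exchange the NCCL unique id, so `mpirun` is now only a convenience process launcher rather than a hard dependency. A minimal single-node launch without `mpirun` might look like:

```bash
# Hypothetical sketch (not from this patch): run 2 GPU processes on one node without mpirun.
# Assumes -pn / -pr / -pd behave as described above: total process count, this process's
# rank, and a shared directory used to exchange the NCCL unique id between the processes.
mkdir -p /tmp/llmc_sync
./train_gpt2cu -pn 2 -pr 0 -pd /tmp/llmc_sync &
./train_gpt2cu -pn 2 -pr 1 -pd /tmp/llmc_sync &
wait
```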