Commit

typofix
Chinthaka Gamanayakege committed Jun 2, 2024
1 parent 3d14fff commit d5e4477
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions scripts/run_gpt2_124M.sbatch
@@ -1,24 +1,24 @@
#!/bin/bash
#SBATCH --job-name=llmc-multinode
-#SBATCH --output=/dfs/llm.c/log1558M/%x_%j_%t.log
+#SBATCH --output=/dfs/llm.c/log124M/%x_%j_%t.log
#SBATCH --ntasks=32 # total number of processes to launch
#SBATCH --ntasks-per-node=8 # assuming each node has 8 gpus
#SBATCH --gres=gpu:8 # request 8 gpus from each node
#SBATCH --nodelist=node[000-003] # list of the nodes to dispatch processes (32/8=4)

cd /dfs/llm.c/ # path to the repo in distributed file system
-mkdir -p log1558M
+mkdir -p log124M

export DFS_PATH="/dfs/llm.c/log1558M" # this path will be used to save nccl unique id and sync it between processes
export DFS_PATH="/dfs/llm.c/log124M" # this path will be used to save nccl unique id and sync it between processes
# export NCCL_SOCKET_IFNAME=ib0 # network interface (Ethernet or InfiniBand) that enables GPUDirect RDMA
# export NCCL_IB_HCA=mlx5_0,mlx5_1 # list of available InfiniBand devices, if any

-# GPT-2 (1558M) repro on FineWeb100B
+# GPT-2 (124M) repro on FineWeb100B
# Batch size is set to (1024 * 64) * 32
srun ./train_gpt2cu \
-i "dev/data/fineweb100B/fineweb_train_*.bin" \
-j "dev/data/fineweb100B/fineweb_val_*.bin" \
-o "log1558M" \
-o "log124M" \
-v 250 -s 20000 -g 144 \
-h 1 \
-b 64 -t 1024 \
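As a usage sketch (assuming the repository is checked out at /dfs/llm.c as the script expects, and the nodes in --nodelist are reachable from the SLURM controller), the corrected script would be submitted roughly like this; the job id in the log filename below is illustrative:

    # submit the multi-node training job described by the batch script
    sbatch scripts/run_gpt2_124M.sbatch

    # SLURM writes logs using the --output pattern %x_%j_%t.log (job name, job id, task id),
    # e.g. for a hypothetical job id 12345, task 0:
    tail -f /dfs/llm.c/log124M/llmc-multinode_12345_0.log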
