From d5e4477af766c6bcecdc856b28d9ee791df31f83 Mon Sep 17 00:00:00 2001 From: Chinthaka Gamanayakege Date: Sun, 2 Jun 2024 16:17:08 +0000 Subject: [PATCH] typofix --- scripts/run_gpt2_124M.sbatch | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/run_gpt2_124M.sbatch b/scripts/run_gpt2_124M.sbatch index a163a99d2..7ed82817b 100644 --- a/scripts/run_gpt2_124M.sbatch +++ b/scripts/run_gpt2_124M.sbatch @@ -1,24 +1,24 @@ #!/bin/bash #SBATCH --job-name=llmc-multinode -#SBATCH --output=/dfs/llm.c/log1558M/%x_%j_%t.log +#SBATCH --output=/dfs/llm.c/log124M/%x_%j_%t.log #SBATCH --ntasks=32 # total number of processes to launch #SBATCH --ntasks-per-node=8 # assuming each node has 8 gpus #SBATCH --gres=gpu:8 # request 8 gpus from each node #SBATCH --nodelist=node[000-003] # list of the nodes to dispatch processes (32/8=4) cd /dfs/llm.c/ # path to the repo in distributed file system -mkdir -p log1558M +mkdir -p log124M -export DFS_PATH="/dfs/llm.c/log1558M" # this path will be used to save nccl unique id and sync it between processes +export DFS_PATH="/dfs/llm.c/log124M" # this path will be used to save nccl unique id and sync it between processes # export NCCL_SOCKET_IFNAME=ib0 # network interface Ethernet or InifiniBand which enables gpu direct rdma # export NCCL_IB_HCA=mlx5_0,mlx5_1 # list of all InfiniBand devices available if available -# GPT-2 (1558M) repro on FineWeb100B +# GPT-2 (124M) repro on FineWeb100B # Batch size is set to (1024 * 64) * 32 srun ./train_gpt2cu \ -i "dev/data/fineweb100B/fineweb_train_*.bin" \ -j "dev/data/fineweb100B/fineweb_val_*.bin" \ - -o "log1558M" \ + -o "log124M" \ -v 250 -s 20000 -g 144 \ -h 1 \ -b 64 -t 1024 \