Commit

typofix
Chinthaka Gamanayakege committed Jun 2, 2024
1 parent 3d14fff commit d5e4477
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions scripts/run_gpt2_124M.sbatch
@@ -1,24 +1,24 @@
#!/bin/bash
#SBATCH --job-name=llmc-multinode
-#SBATCH --output=/dfs/llm.c/log1558M/%x_%j_%t.log
+#SBATCH --output=/dfs/llm.c/log124M/%x_%j_%t.log
#SBATCH --ntasks=32 # total number of processes to launch
#SBATCH --ntasks-per-node=8 # assuming each node has 8 gpus
#SBATCH --gres=gpu:8 # request 8 gpus from each node
#SBATCH --nodelist=node[000-003] # list of the nodes to dispatch processes (32/8=4)

cd /dfs/llm.c/ # path to the repo in distributed file system
-mkdir -p log1558M
+mkdir -p log124M

export DFS_PATH="/dfs/llm.c/log1558M" # this path will be used to save nccl unique id and sync it between processes
export DFS_PATH="/dfs/llm.c/log124M" # this path will be used to save nccl unique id and sync it between processes
# export NCCL_SOCKET_IFNAME=ib0 # network interface (Ethernet or InfiniBand) that enables GPUDirect RDMA
# export NCCL_IB_HCA=mlx5_0,mlx5_1 # list of available InfiniBand devices, if any

-# GPT-2 (1558M) repro on FineWeb100B
+# GPT-2 (124M) repro on FineWeb100B
# Batch size is set to (1024 * 64) * 32
srun ./train_gpt2cu \
-i "dev/data/fineweb100B/fineweb_train_*.bin" \
-j "dev/data/fineweb100B/fineweb_val_*.bin" \
-o "log1558M" \
-o "log124M" \
-v 250 -s 20000 -g 144 \
-h 1 \
-b 64 -t 1024 \
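As a usage sketch (assuming the repository is checked out at /dfs/llm.c as the script expects, and the nodes in --nodelist are reachable from the SLURM controller), the corrected script would be submitted roughly like this; the job id in the log filename below is illustrative:

    # submit the multi-node training job described by the batch script
    sbatch scripts/run_gpt2_124M.sbatch

    # SLURM writes logs using the --output pattern %x_%j_%t.log (job name, job id, task id),
    # e.g. for a hypothetical job id 12345, task 0:
    tail -f /dfs/llm.c/log124M/llmc-multinode_12345_0.log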
