From 45d6f31090d40a7149ea51dc6dad610511a7f5a0 Mon Sep 17 00:00:00 2001 From: zhengzhonghui Date: Wed, 14 Aug 2024 10:39:08 +0800 Subject: [PATCH 1/3] set logging_step to 5 with baichuan && qwen benchmark --- .../auto_config_baichun2_13b/pretrain-baichun2_13b-config.json | 2 +- .../auto_config_baichun2_7b/pretrain-baichun2_7b-config.json | 2 +- ...qlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh | 1 + ...qlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh | 1 + .../hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh | 3 ++- 5 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json index e0e4238a71ce..4bea6ad76209 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json +++ b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json @@ -31,7 +31,7 @@ "weight_decay": 0.01, "warmup_ratio": 0.01, "max_grad_norm": 1.0, - "logging_steps": 1, + "logging_steps": 5, "dataloader_num_workers": 1, "eval_steps": 1000, "disable_tqdm": true, diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_7b/pretrain-baichun2_7b-config.json b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_7b/pretrain-baichun2_7b-config.json index 8c1b4a0c61ef..25e4ff3044c6 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_7b/pretrain-baichun2_7b-config.json +++ b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_7b/pretrain-baichun2_7b-config.json @@ -31,7 +31,7 @@ "weight_decay": 0.01, "warmup_ratio": 0.01, "max_grad_norm": 1.0, - "logging_steps": 1, + "logging_steps": 5, "dataloader_num_workers": 1, "eval_steps": 1000, "disable_tqdm": true, diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh index bf6cd5a603b1..7c7abfc2304b 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh @@ -29,6 +29,7 @@ param+="device_num=N4C32 " param+="global_batch_size=32 " param+="model_item=qwen-qwen-14b_seqlen4096_pretrain " param+="max_steps=100 " +param+="logging_steps=5" param+="gradient_accumulation_steps=2 " param+="pp_recompute_interval=1 " param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, " diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh index 4a9ed7f11e82..6821182b5ab2 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh @@ -29,6 +29,7 @@ param+="device_num=N4C32 " param+="global_batch_size=32 " param+="model_item=qwen-qwen-7b_seqlen4096_pretrain " param+="max_steps=100 " +param+="logging_steps=5" param+="gradient_accumulation_steps=1 " param+="pp_recompute_interval=1 " param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add " diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh index 1440bbf85867..391b325e57d2 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh @@ -32,6 +32,7 @@ function _set_params(){ global_batch_size=${global_batch_size:-16} model_item=${model_item:-"qwen-qwen-7b_seqlen2048_pretrain"} max_steps=${max_steps:-150} + logging_steps=${logging_steps:-1} gradient_accumulation_steps=${gradient_accumulation_steps:-8} pp_recompute_interval=${pp_recompute_interval:-1} tensor_parallel_config=${tensor_parallel_config:-"enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add"} @@ -162,7 +163,7 @@ function _train(){ --weight_decay 0.01 \ --warmup_ratio 0.01 \ --max_grad_norm 1.0 \ - --logging_steps 1 \ + --logging_steps ${logging_steps} \ --dataloader_num_workers 1 \ --eval_steps ${eval_steps} \ --sharding ${sharding} \ From e4deb0742cd62c41cea4cf31f26c061004e9b97e Mon Sep 17 00:00:00 2001 From: zhengzhonghui Date: Wed, 14 Aug 2024 11:06:12 +0800 Subject: [PATCH 2/3] polish --- ...eqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh | 2 +- ...eqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh index 7c7abfc2304b..04fcd36b4144 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh @@ -29,7 +29,7 @@ param+="device_num=N4C32 " param+="global_batch_size=32 " param+="model_item=qwen-qwen-14b_seqlen4096_pretrain " param+="max_steps=100 " -param+="logging_steps=5" +param+="logging_steps=5 " param+="gradient_accumulation_steps=2 " param+="pp_recompute_interval=1 " param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add, " diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh index 6821182b5ab2..acf58c4cf857 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh @@ -29,7 +29,7 @@ param+="device_num=N4C32 " param+="global_batch_size=32 " param+="model_item=qwen-qwen-7b_seqlen4096_pretrain " param+="max_steps=100 " -param+="logging_steps=5" +param+="logging_steps=5 " param+="gradient_accumulation_steps=1 " param+="pp_recompute_interval=1 " param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add " From a34ea60c0e50dea4e5db20f0b36f29347979e99f Mon Sep 17 00:00:00 2001 From: zhengzhonghui Date: Wed, 14 Aug 2024 11:42:15 +0800 Subject: [PATCH 3/3] polish --- .../auto_config_baichun2_13b/pretrain-baichun2_13b-config.json | 2 +- ...eqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh | 2 +- ...eqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json index 4bea6ad76209..307b9e9a6775 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json +++ b/tests/test_tipc/dygraph/hybrid_parallelism/baichun2/auto_config_baichun2_13b/pretrain-baichun2_13b-config.json @@ -26,7 +26,7 @@ "scale_loss": 1024, "learning_rate": 1e-05, "min_learning_rate": 5e-06, - "max_steps": 100, + "max_steps": 200, "save_steps": 5000, "weight_decay": 0.01, "warmup_ratio": 0.01, diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh index 04fcd36b4144..3de10a6e1431 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-14b_seqlen4096_pretrain_bs32_bf16_MP2-PP1-sharding16-mbs1-acc2.sh @@ -28,7 +28,7 @@ param+="run_mode=MP2-PP1-sharding16-mbs1-acc2 " param+="device_num=N4C32 " param+="global_batch_size=32 " param+="model_item=qwen-qwen-14b_seqlen4096_pretrain " -param+="max_steps=100 " +param+="max_steps=200 " param+="logging_steps=5 " param+="gradient_accumulation_steps=2 " param+="pp_recompute_interval=1 " diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh index acf58c4cf857..6a31ce26d1f8 100644 --- a/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh +++ b/tests/test_tipc/dygraph/hybrid_parallelism/qwen/N4C32/qwen-qwen-7b_seqlen4096_pretrain_bs32_bf16_MP1-PP1-sharding32-mbs1-acc1.sh @@ -28,7 +28,7 @@ param+="run_mode=MP1-PP1-sharding32-mbs1-acc1 " param+="device_num=N4C32 " param+="global_batch_size=32 " param+="model_item=qwen-qwen-7b_seqlen4096_pretrain " -param+="max_steps=100 " +param+="max_steps=200 " param+="logging_steps=5 " param+="gradient_accumulation_steps=1 " param+="pp_recompute_interval=1 "