【AutoParallel】Add llama2 UT for auto-parallel #8300

Merged 6 commits on Apr 24, 2024
94 changes: 94 additions & 0 deletions scripts/distribute/ci_case_auto.sh
@@ -60,6 +60,7 @@ function llama_case_list_auto() {
llama_dygraph_auto_bs8_fp32_DP2-MP2
llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2
llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2

llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1
llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1
@@ -1657,6 +1658,99 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
echo "=========== $FUNCNAME run end ==========="
}

function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
# Only A100 support this case.
if [ $IS_A100 -eq 0 ]; then
return
fi
echo "=========== $FUNCNAME run begin ==========="
export PYTHONPATH=$root_path/:$PYTHONPATH
export FLAGS_call_stack_level=3
export NVIDIA_TF32_OVERRIDE=0

task_name="llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2"
case_out_dir="output/$task_name"
case_log_dir="output/$task_name""_log"
rm -rf $case_out_dir
rm -rf $case_log_dir

python -u -m paddle.distributed.launch \
--gpus "0,1,2,3,4,5,6,7" \
--log_dir "output/$task_name""_log" \
./run_pretrain_auto.py \
--model_name_or_path "meta-llama/Llama-2-13b" \
--tokenizer_name_or_path "meta-llama/Llama-2-13b" \
--input_dir "./data" \
--output_dir "./output" \
--split 949,50,1 \
--weight_decay 0.01 \
--warmup_ratio 0.01 \
--max_grad_norm 1.0 \
--learning_rate 3e-05 \
--min_learning_rate 3e-06 \
--max_steps 30 \
--logging_steps 10 \
--eval_steps 1000 \
--save_steps 50000 \
--continue_training 0 \
--do_train true \
--do_eval false \
--do_predict false \
--disable_tqdm true \
--skip_profile_timer true \
--save_total_limit 2 \
--device gpu \
--dataloader_num_workers 1 \
--distributed_dataloader 0 \
--enable_auto_parallel 1 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 4 \
--per_device_eval_batch_size 1 \
--recompute false \
--recompute_use_reentrant true \
--recompute_granularity full \
--pp_recompute_interval 0 \
--bf16 true \
--fp16_opt_level "O2" \
--amp_master_grad true \
--fuse_attention_ffn false \
--fuse_attention_qkv true \
--fused_linear_param_grad_add 1 \
--fuse_sequence_parallel_allreduce false \
--use_flash_attention true \
--use_fused_rope true \
--use_fused_rms_norm true \
--max_seq_length 4096 \
--sep_parallel_degree 1 \
--sequence_parallel false \
--pipeline_parallel_degree 2 \
--sharding_parallel_degree 2 \
--tensor_parallel_degree 1 \
--virtual_pp_degree 3 \
--pipeline_schedule_mode "VPP" \
--sharding "stage2" \
--pipeline_parallel_config "enable_send_recv_overlap" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--sharding_parallel_config "enable_stage2_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--to_static 1 \
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
--amp_custom_white_list "lookup_table" "lookup_table_v2" \
--num_hidden_layers 12 \
--skip_memory_metrics 0 \
>>${log_path}/$FUNCNAME 2>&1
loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_samples_per_second: ' '{print $2}' | awk -F ',' '{print $1}'`
mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'current_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
echo "result: loss=$loss ips=$ips mem=$mem"
loss_base=7.52383575
ips_base=12.4135
mem_base=29.140248775482178
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
echo "=========== $FUNCNAME run begin ==========="
export PYTHONPATH=$root_path/:$PYTHONPATH
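Note: the following is a minimal sketch of how the metric-extraction pipeline at the end of the llama_dy2st_auto case above behaves. The sample log line is a hypothetical stand-in shaped after the field separators the grep/awk calls expect, with placeholder numbers; it is not real trainer output.

# Hypothetical workerlog.0 line with placeholder values, shaped after the separators used above.
sample='global_step: 30, loss: 7.5, interval_samples_per_second: 12.4, current_memory_reserved: 29.1,'
# Same extraction pipeline as in the case function, applied to the sample line.
loss=$(echo "$sample" | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}')
ips=$(echo "$sample" | grep 'global_step: 30' | awk -F 'interval_samples_per_second: ' '{print $2}' | awk -F ',' '{print $1}')
mem=$(echo "$sample" | grep 'global_step: 30' | awk -F 'current_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}')
echo "result: loss=$loss ips=$ips mem=$mem"   # prints: result: loss=7.5 ips=12.4 mem=29.1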
10 changes: 10 additions & 0 deletions scripts/distribute/run_ci.sh
@@ -58,6 +58,14 @@ install_paddlenlp(){
# cd -
python -c "import paddlenlp; print('paddlenlp commit:',paddlenlp.version.commit)";
}

install_external_ops(){
echo -e "\033[31m ---- Install external_ops \033[0m"
export PYTHONPATH=${nlp_dir}:$PYTHONPATH
cd ${nlp_dir}/model_zoo/gpt-3/external_ops
python setup.py install
python -c "import fused_ln;";
}
####################################
get_diff_TO_case(){
cd ${nlp_dir}
@@ -127,6 +135,8 @@ if [[ ${#case_list[*]} -ne 0 ]];then
install_paddle
# Install paddlenlp
install_paddlenlp
# Install external_ops
install_external_ops

case_num=1
export FLAGS_install_deps=0
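As a usage note, a minimal manual sanity check after install_external_ops, assuming the build succeeded. The module name fused_ln comes from the import check in the function above; printing the module path is an extra illustrative step, not part of the CI script.

# Hypothetical follow-up check: confirm the fused_ln extension built by install_external_ops
# is importable and report where it was installed.
python -c "import fused_ln; print('fused_ln loaded from:', fused_ln.__file__)"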