【AutoParallel】Add llama2 UT for auto-parallel #8300

Merged 6 commits on Apr 24, 2024
94 changes: 94 additions & 0 deletions scripts/distribute/ci_case_auto.sh
@@ -60,6 +60,7 @@ function llama_case_list_auto() {
llama_dygraph_auto_bs8_fp32_DP2-MP2
llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2
llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2

llama_static_auto_recompute_bs8_fp32_DP1-MP1-PP1
llama_static_auto_recompute_bs16_fp32_DP2-MP1-PP1
@@ -1657,6 +1658,99 @@ function llama_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
echo "=========== $FUNCNAME run end ==========="
}

function llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2() {
# Only A100 support this case.
if [ $IS_A100 -eq 0 ]; then
return
fi
echo "=========== $FUNCNAME run begin ==========="
export PYTHONPATH=$root_path/:$PYTHONPATH
export FLAGS_call_stack_level=3
export NVIDIA_TF32_OVERRIDE=0

task_name="llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2"
case_out_dir="output/$task_name"
case_log_dir="output/$task_name""_log"
rm -rf $case_out_dir
rm -rf $case_log_dir

python -u -m paddle.distributed.launch \
--gpus "0,1,2,3,4,5,6,7" \
--log_dir "output/$task_name""_log" \
./run_pretrain_auto.py \
--model_name_or_path "meta-llama/Llama-2-13b" \
--tokenizer_name_or_path "meta-llama/Llama-2-13b" \
--input_dir "./data" \
--output_dir "./output" \
--split 949,50,1 \
--weight_decay 0.01 \
--warmup_ratio 0.01 \
--max_grad_norm 1.0 \
--learning_rate 3e-05 \
--min_learning_rate 3e-06 \
--max_steps 30 \
--logging_steps 10 \
--eval_steps 1000 \
--save_steps 50000 \
--continue_training 0 \
--do_train true \
--do_eval false \
--do_predict false \
--disable_tqdm true \
--skip_profile_timer true \
--save_total_limit 2 \
--device gpu \
--dataloader_num_workers 1 \
--distributed_dataloader 0 \
--enable_auto_parallel 1 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 4 \
--per_device_eval_batch_size 1 \
--recompute false \
--recompute_use_reentrant true \
--recompute_granularity full \
--pp_recompute_interval 0 \
--bf16 true \
--fp16_opt_level "O2" \
--amp_master_grad true \
--fuse_attention_ffn false \
--fuse_attention_qkv true \
--fused_linear_param_grad_add 1 \
--fuse_sequence_parallel_allreduce false \
--use_flash_attention true \
--use_fused_rope true \
--use_fused_rms_norm true \
--max_seq_length 4096 \
--sep_parallel_degree 1 \
--sequence_parallel false \
--pipeline_parallel_degree 2 \
--sharding_parallel_degree 2 \
--tensor_parallel_degree 1 \
--virtual_pp_degree 3 \
--pipeline_schedule_mode "VPP" \
--sharding "stage2" \
--pipeline_parallel_config "enable_send_recv_overlap" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--sharding_parallel_config "enable_stage2_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--to_static 1 \
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
--amp_custom_white_list "lookup_table" "lookup_table_v2" \
--num_hidden_layers 12 \
--skip_memory_metrics 0 \
>>${log_path}/$FUNCNAME 2>&1
loss=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
ips=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'interval_samples_per_second: ' '{print $2}' | awk -F ',' '{print $1}'`
mem=`cat $case_log_dir/workerlog.0 | grep 'global_step: 30' | awk -F 'current_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
echo "result: loss=$loss ips=$ips mem=$mem"
loss_base=7.52383575
ips_base=12.4135
mem_base=29.140248775482178
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
}

function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
echo "=========== $FUNCNAME run begin ==========="
export PYTHONPATH=$root_path/:$PYTHONPATH
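Note: the following is a minimal sketch of how the metric-extraction pipeline at the end of the llama_dy2st_auto case above behaves. The sample log line is a hypothetical stand-in shaped after the field separators the grep/awk calls expect, with placeholder numbers; it is not real trainer output.

# Hypothetical workerlog.0 line with placeholder values, shaped after the separators used above.
sample='global_step: 30, loss: 7.5, interval_samples_per_second: 12.4, current_memory_reserved: 29.1,'
# Same extraction pipeline as in the case function, applied to the sample line.
loss=$(echo "$sample" | grep 'global_step: 30' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}')
ips=$(echo "$sample" | grep 'global_step: 30' | awk -F 'interval_samples_per_second: ' '{print $2}' | awk -F ',' '{print $1}')
mem=$(echo "$sample" | grep 'global_step: 30' | awk -F 'current_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}')
echo "result: loss=$loss ips=$ips mem=$mem"   # prints: result: loss=7.5 ips=12.4 mem=29.1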
10 changes: 10 additions & 0 deletions scripts/distribute/run_ci.sh
@@ -58,6 +58,14 @@ install_paddlenlp(){
# cd -
python -c "import paddlenlp; print('paddlenlp commit:',paddlenlp.version.commit)";
}

install_external_ops(){
echo -e "\033[31m ---- Install external_ops \033[0m"
export PYTHONPATH=${nlp_dir}:$PYTHONPATH
cd ${nlp_dir}/model_zoo/gpt-3/external_ops
python setup.py install
python -c "import fused_ln;";
}
####################################
get_diff_TO_case(){
cd ${nlp_dir}
@@ -127,6 +135,8 @@ if [[ ${#case_list[*]} -ne 0 ]];then
install_paddle
# Install paddlenlp
install_paddlenlp
# Install external_ops
install_external_ops

case_num=1
export FLAGS_install_deps=0
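As a usage note, a minimal manual sanity check after install_external_ops, assuming the build succeeded. The module name fused_ln comes from the import check in the function above; printing the module path is an extra illustrative step, not part of the CI script.

# Hypothetical follow-up check: confirm the fused_ln extension built by install_external_ops
# is importable and report where it was installed.
python -c "import fused_ln; print('fused_ln loaded from:', fused_ln.__file__)"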