Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CI testing for A100 and V100 device #9324

Merged
merged 12 commits into from
Nov 4, 2024
51 changes: 46 additions & 5 deletions scripts/distribute/ci_case_auto.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,39 @@ function is_a100() {

IS_A100=$(is_a100)

#######################################
# Print a pass/fail summary for the cases whose names start with a prefix.
# Globals:
#   log_path (read)           - directory holding per-case logs and result.log
#   total_count (written)     - number of entries in log_path matching the prefix
#   run_fail_count (written)  - number of matching entries ending in _FAIL
#   loss_fail_count (written) - number of matching "check failed! " records
# Arguments:
#   $1 - label shown in the summary lines
#   $2 - case-name prefix; an empty prefix selects every case
# Outputs:
#   Summary lines, followed by the names of the failed cases, on stdout.
# Returns:
#   1 when log_path or the original directory cannot be entered.
#######################################
function track_case_status() {
    local case_name="$1"
    local prefix="$2"
    local original_path
    local entry
    local -a fail_logs=()
    local -a loss_fail_cases=()

    original_path=$(pwd)
    # An empty log_path would turn a bare `cd` into a jump to $HOME, so it is
    # rejected the same way as a directory that cannot be entered.
    if [[ -z "${log_path:-}" ]] || ! cd "${log_path}"; then
        echo "Failed to enter log_path: ${log_path:-}"
        return 1
    fi

    # Count log entries with a glob instead of parsing `ls` output.
    total_count=0
    for entry in "$prefix"*; do
        [[ -e "$entry" ]] || continue   # an unmatched glob stays literal
        total_count=$((total_count + 1))
        if [[ "$entry" == *_FAIL ]]; then
            fail_logs+=("$entry")
        fi
    done
    run_fail_count=${#fail_logs[@]}

    # Field 2 of a "check failed! " record is compared against the prefix as a
    # literal string (not a regex), so only this case family is reported.
    if [[ -f result.log ]]; then
        while IFS= read -r entry; do
            loss_fail_cases+=("$entry")
        done < <(awk -v prefix="$prefix" \
            '/check failed! / && substr($2, 1, length(prefix)) == prefix { print $2 }' \
            result.log)
    fi
    loss_fail_count=${#loss_fail_cases[@]}

    echo -e "\033[31m ---- $case_name total tests : $total_count \033"
    if (( run_fail_count == 0 && loss_fail_count == 0 )); then
        echo -e "\033[32m ---- $case_name all cases Success \033"
    else
        if (( run_fail_count != 0 )); then
            echo -e "\033[31m ---- $case_name runtime failed test : $run_fail_count \033"
            printf '%s\n' "${fail_logs[@]}"
        fi
        if (( loss_fail_count != 0 )); then
            echo -e "\033[31m ---- $case_name loss verification failed test : $loss_fail_count \033"
            printf '%s\n' "${loss_fail_cases[@]}"
        fi
    fi

    # Return to the directory the caller was in.
    cd "$original_path" || { echo "Failed to return to original path: $original_path"; return 1; }
}

# NOTE: Whenever possible, add new tests after the existing ones.
function llama_case_list_auto() {
# The test name must have "llama_" as a prefix, which will
# be used for tracking the execution status of the case.
llama_dygraph_auto_bs8_fp32_DP2
llama_dygraph_auto_bs8_fp32_DP2-MP2
llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
Expand All @@ -55,20 +86,30 @@ function llama_case_list_auto() {
llama_align_dygraph_dy2st_pir_auto_grad_merge_bs2_fp32_DP1-MP1-PP1
llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1-MP1-PP4
llama_align_dygraph_dy2st_pir_auto_pp_bs2_bf16_DP1-MP1-PP4

track_case_status $FUNCNAME "llama_"
}

# Runs the llm_gpt auto-parallel cases in order, then prints their status summary.
# Every case name must start with "llm_gpt_dygraph_auto_": track_case_status
# uses that prefix to find the execution results of these cases.
function llm_gpt_case_list_auto() {
    local -a __case_fns=(
        llm_gpt_dygraph_auto_bs8_fp32_DP2
        llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2
        llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2
        llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2
    )
    local __case_fn

    for __case_fn in "${__case_fns[@]}"; do
        "${__case_fn}"
    done

    track_case_status "${FUNCNAME[0]}" "llm_gpt_dygraph_auto_"
}

# Runs the llm_qwen auto-parallel cases in order, then prints their status summary.
# Every case name must start with "llm_qwen_dygraph_auto_": track_case_status
# uses that prefix to find the execution results of these cases.
function llm_qwen_case_list_auto() {
    local -a __case_fns=(
        llm_qwen_dygraph_auto_bs1_fp32_DP2
        llm_qwen_dygraph_auto_bs1_fp32_DP2-MP2
        llm_qwen_dygraph_auto_bs1_fp32_DP2-MP2-PP2
        llm_qwen_dygraph_auto_bs1_bf16_DP2-MP2-PP2
    )
    local __case_fn

    for __case_fn in "${__case_fns[@]}"; do
        "${__case_fn}"
    done

    track_case_status "${FUNCNAME[0]}" "llm_qwen_dygraph_auto_"
}

############ case start ############
Expand Down Expand Up @@ -2083,20 +2124,20 @@ function check_result() {
echo -e "$1" >> ${log_path}/result.log
if [ $? -ne 0 ];then
echo -e "\033[31m $1 run failed! \033[0m" | tee -a ${log_path}/result.log
exit -1
return 0
fi

if [ $# -ne 7 ] && [ $# -ne 8 ]; then
echo -e "\033[31m $1 parameter transfer failed: $@ \033[0m" | tee -a ${log_path}/result.log
exit -1
return 0
fi

diff_loss=$(echo $2 $3|awk '{printf "%0.2f\n", ($2-$1)/$1*100}')
echo -e "loss_base: $2 loss_test: $3 loss_diff: $diff_loss%" | tee -a ${log_path}/result.log
if [ $2 != $3 ];then
if [ -z "$8" ] || [ $8 -ne 1 ] ;then
echo -e "\033[31m $1 loss diff check failed! \033[0m" | tee -a ${log_path}/result.log
exit -1
return 0
else
diff=$(echo "$2 $3" | awk '{print $1-$2}')
gt=$(echo "${diff#-} 1e-5" | awk '{print ($1>$2)?"1":"0"}')
Expand All @@ -2116,7 +2157,7 @@ function check_result() {
fi
if [[ $v2 == 0 ]];then
echo -e "\033[31m $1 IPS diff check failed! \033[0m" | tee -a $log_path/result.log
exit -1
return 0
fi

diff_mem=$(echo $6 $7|awk '{printf "%0.2f\n", ($2-$1)/$1*100}')
Expand All @@ -2125,7 +2166,7 @@ function check_result() {
w2=$(echo $diff_mem -5.0|awk '{print($1<=$2)?"0":"1"}')
if [[ $w1 == 0 ]];then
echo -e "\033[31m $1 MEM diff check failed! \033[0m" | tee -a $log_path/result.log
exit -1
return 0
fi
if [[ $w2 == 0 ]];then
echo -e "$1 MEM decreases greater than 5%, not exit " | tee -a $log_path/result.log
Expand Down
47 changes: 42 additions & 5 deletions scripts/distribute/ci_case_dy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,38 @@ export llm_gpt_data_path=/llm_gpt_data

unset CUDA_VISIBLE_DEVICES

#######################################
# Print a pass/fail summary for the cases whose names start with a prefix.
# Globals:
#   log_path (read)           - directory holding per-case logs and result.log
#   total_count (written)     - number of entries in log_path matching the prefix
#   run_fail_count (written)  - number of matching entries ending in _FAIL
#   loss_fail_count (written) - number of matching "check failed! " records
# Arguments:
#   $1 - label shown in the summary lines
#   $2 - case-name prefix; an empty prefix selects every case
# Outputs:
#   Summary lines, followed by the names of the failed cases, on stdout.
# Returns:
#   1 when log_path or the original directory cannot be entered.
#######################################
function track_case_status() {
    local case_name="$1"
    local prefix="$2"
    local original_path
    local entry
    local -a fail_logs=()
    local -a loss_fail_cases=()

    original_path=$(pwd)
    # An empty log_path would turn a bare `cd` into a jump to $HOME, so it is
    # rejected the same way as a directory that cannot be entered.
    if [[ -z "${log_path:-}" ]] || ! cd "${log_path}"; then
        echo "Failed to enter log_path: ${log_path:-}"
        return 1
    fi

    # Count log entries with a glob instead of parsing `ls` output.
    total_count=0
    for entry in "$prefix"*; do
        [[ -e "$entry" ]] || continue   # an unmatched glob stays literal
        total_count=$((total_count + 1))
        if [[ "$entry" == *_FAIL ]]; then
            fail_logs+=("$entry")
        fi
    done
    run_fail_count=${#fail_logs[@]}

    # Field 2 of a "check failed! " record is compared against the prefix as a
    # literal string (not a regex), so only this case family is reported.
    if [[ -f result.log ]]; then
        while IFS= read -r entry; do
            loss_fail_cases+=("$entry")
        done < <(awk -v prefix="$prefix" \
            '/check failed! / && substr($2, 1, length(prefix)) == prefix { print $2 }' \
            result.log)
    fi
    loss_fail_count=${#loss_fail_cases[@]}

    echo -e "\033[31m ---- $case_name total tests : $total_count \033"
    if (( run_fail_count == 0 && loss_fail_count == 0 )); then
        echo -e "\033[32m ---- $case_name all cases Success \033"
    else
        if (( run_fail_count != 0 )); then
            echo -e "\033[31m ---- $case_name runtime failed test : $run_fail_count \033"
            printf '%s\n' "${fail_logs[@]}"
        fi
        if (( loss_fail_count != 0 )); then
            echo -e "\033[31m ---- $case_name loss verification failed test : $loss_fail_count \033"
            printf '%s\n' "${loss_fail_cases[@]}"
        fi
    fi

    # Return to the directory the caller was in.
    cd "$original_path" || { echo "Failed to return to original path: $original_path"; return 1; }
}

function gpt_case_list_dygraph(){
# The test name must have "gpt_" as a prefix, which will
# be used for tracking the execution status of the case.
gpt_preprocess_data
gpt_345M_single
gpt_1.3B_dp
Expand All @@ -50,10 +81,16 @@ function gpt_case_list_dygraph(){
gpt_345M_single_finetune
gpt_eval_WikiText
gpt_eval_LAMBADA

track_case_status $FUNCNAME "gpt_"
}

# Runs the llm_gpt dygraph case, then prints its status summary.
# The case name must start with "llm_gpt_": track_case_status uses that
# prefix to find the execution results of the case.
function llm_gpt_case_list_dygraph() {
    local -r __status_prefix="llm_gpt_"

    llm_gpt_recompute_bs32_bf16_MP2-SD4-stage1

    track_case_status "${FUNCNAME[0]}" "${__status_prefix}"
}

############ case start ############
Expand Down Expand Up @@ -465,20 +502,20 @@ function check_result() {
echo -e "$1" >> ${log_path}/result.log
if [ $? -ne 0 ];then
echo -e "\033[31m $1 run failed! \033[0m" | tee -a ${log_path}/result.log
exit -1
return 0
fi

if [[ ! $1 =~ "llm" ]]; then
echo -e "\033 $1 run successfully! \033" | tee -a ${log_path}/result.log
elif [ $# -ne 7 ]; then
echo -e "\033[31m $1 parameter transfer failed: $@ \033[0m" | tee -a ${log_path}/result.log
exit -1
return 0
else
diff_loss=$(echo $2 $3|awk '{printf "%0.2f\n", ($2-$1)/$1*100}')
echo -e "loss_base: $2 loss_test: $3 loss_diff: $diff_loss%" | tee -a ${log_path}/result.log
if [ $2 != $3 ];then
echo -e "\033[31m $1 loss diff check failed! \033[0m" | tee -a ${log_path}/result.log
exit -1
return 0
fi

diff_ips=$(echo $4 $5|awk '{printf "%0.2f\n", ($2-$1)/$1*100}')
Expand All @@ -490,7 +527,7 @@ function check_result() {
fi
if [[ $v2 == 0 ]];then
echo -e "\033[31m $1 IPS diff check failed! \033[0m" | tee -a $log_path/result.log
exit -1
return 0
fi

diff_mem=$(echo $6 $7|awk '{printf "%0.2f\n", ($2-$1)/$1*100}')
Expand All @@ -499,7 +536,7 @@ function check_result() {
w2=$(echo $diff_mem -5.0|awk '{print($1<=$2)?"0":"1"}')
if [[ $w1 == 0 ]];then
echo -e "\033[31m $1 MEM diff check failed! \033[0m" | tee -a $log_path/result.log
exit -1
return 0
fi
if [[ $w2 == 0 ]];then
echo -e "$1 MEM decreases greater than 5%, not exit " | tee -a $log_path/result.log
Expand Down
130 changes: 93 additions & 37 deletions scripts/distribute/run_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,35 +71,71 @@ install_external_ops(){
python setup.py install
python -c "import fused_ln;";
}

# Prints 1 when the `nvidia-smi` output contains the text "A100", otherwise 0.
function is_a100() {
    if [[ -z "$(nvidia-smi | grep A100)" ]]; then
        echo 0
    else
        echo 1
    fi
}

IS_A100=$(is_a100)

####################################
get_diff_TO_case(){
cd ${nlp_dir}
for file_name in `git diff --numstat upstream/${AGILE_COMPILE_BRANCH} |awk '{print $NF}'`;do
arr_file_name=(${file_name//// })
dir1=${arr_file_name[0]}
dir2=${arr_file_name[1]}
dir3=${arr_file_name[2]}
dir4=${arr_file_name[3]}
file_item=$dir1/$dir2/$dir3/$dir4
echo "file_name:"${file_name}, "path:"${file_item}
if [ ! -f ${file_name} ];then # 针对pr删掉文件
continue
elif [[ ${file_name##*.} == "md" ]] || [[ ${file_name##*.} == "rst" ]] || [[ ${dir1} == "docs" ]];then
continue
else
for ((i=0; i<${#target_lists_for_gpt[@]}; i++)); do
if [[ ! ${dir3} =~ "benchmarks" ]] && [[ ${file_item} == *${target_lists_for_gpt[i]}* ]];then
case_list[${#case_list[*]}]=gpt-3_auto
case_list[${#case_list[*]}]=gpt-3_dygraph
fi
done
for ((i=0; i<${#target_lists_for_llama[@]}; i++)); do
if [[ ${file_item} == *${target_lists_for_llama[i]}* ]];then
case_list[${#case_list[*]}]=llama_auto
fi
done
fi
done
if [ $IS_A100 -ne 0 ];then
for file_name in `git diff --numstat upstream/${AGILE_COMPILE_BRANCH} |awk '{print $NF}'`;do
arr_file_name=(${file_name//// })
dir1=${arr_file_name[0]}
dir2=${arr_file_name[1]}
dir3=${arr_file_name[2]}
dir4=${arr_file_name[3]}
file_item=$dir1/$dir2/$dir3/$dir4
echo "file_name:"${file_name}, "path:"${file_item}
if [ ! -f ${file_name} ];then # 针对pr删掉文件
continue
elif [[ ${file_name##*.} == "md" ]] || [[ ${file_name##*.} == "rst" ]] || [[ ${dir1} == "docs" ]];then
continue
else
for ((i=0; i<${#target_lists_for_gpt[@]}; i++)); do
if [[ ! ${dir3} =~ "benchmarks" ]] && [[ ${file_item} == *${target_lists_for_gpt[i]}* ]];then
case_list[${#case_list[*]}]=gpt-3_auto
case_list[${#case_list[*]}]=gpt-3_dygraph
fi
done
for ((i=0; i<${#target_lists_for_llama[@]}; i++)); do
if [[ ${file_item} == *${target_lists_for_llama[i]}* ]];then
case_list[${#case_list[*]}]=llama_auto
fi
done
fi
done
else
case_list[${#case_list[*]}]=gpt-3_auto
case_list[${#case_list[*]}]=llama_auto
for file_name in `git diff --numstat upstream/${AGILE_COMPILE_BRANCH} |awk '{print $NF}'`;do
arr_file_name=(${file_name//// })
dir1=${arr_file_name[0]}
dir2=${arr_file_name[1]}
dir3=${arr_file_name[2]}
dir4=${arr_file_name[3]}
file_item=$dir1/$dir2/$dir3/$dir4
echo "file_name:"${file_name}, "path:"${file_item}
if [ ! -f ${file_name} ];then # 针对pr删掉文件
continue
elif [[ ${file_name##*.} == "md" ]] || [[ ${file_name##*.} == "rst" ]] || [[ ${dir1} == "docs" ]];then
continue
else
for ((i=0; i<${#target_lists_for_gpt[@]}; i++)); do
if [[ ! ${dir3} =~ "benchmarks" ]] && [[ ${file_item} == *${target_lists_for_gpt[i]}* ]];then
case_list[${#case_list[*]}]=gpt-3_dygraph
fi
done
fi
done
fi
}
####################################
print_info(){
Expand Down Expand Up @@ -129,6 +165,35 @@ function contain_case(){
return 0
}
####################################
#######################################
# Print a pass/fail summary for the cases whose names start with a prefix.
# Globals:
#   log_path (read)           - directory holding per-case logs and result.log
#   total_count (written)     - number of entries in log_path matching the prefix
#   run_fail_count (written)  - number of matching entries ending in _FAIL
#   loss_fail_count (written) - number of matching "check failed! " records
# Arguments:
#   $1 - label shown in the summary lines
#   $2 - case-name prefix; an empty prefix selects every case
# Outputs:
#   Summary lines, followed by the names of the failed cases, on stdout.
# Returns:
#   1 when log_path or the original directory cannot be entered.
#######################################
function track_case_status() {
    local case_name="$1"
    local prefix="$2"
    local original_path
    local entry
    local -a fail_logs=()
    local -a loss_fail_cases=()

    original_path=$(pwd)
    # An empty log_path would turn a bare `cd` into a jump to $HOME, so it is
    # rejected the same way as a directory that cannot be entered.
    if [[ -z "${log_path:-}" ]] || ! cd "${log_path}"; then
        echo "Failed to enter log_path: ${log_path:-}"
        return 1
    fi

    # Count log entries with a glob instead of parsing `ls` output.
    total_count=0
    for entry in "$prefix"*; do
        [[ -e "$entry" ]] || continue   # an unmatched glob stays literal
        total_count=$((total_count + 1))
        if [[ "$entry" == *_FAIL ]]; then
            fail_logs+=("$entry")
        fi
    done
    run_fail_count=${#fail_logs[@]}

    # Field 2 of a "check failed! " record is compared against the prefix as a
    # literal string (not a regex); an empty prefix therefore matches every record.
    if [[ -f result.log ]]; then
        while IFS= read -r entry; do
            loss_fail_cases+=("$entry")
        done < <(awk -v prefix="$prefix" \
            '/check failed! / && substr($2, 1, length(prefix)) == prefix { print $2 }' \
            result.log)
    fi
    loss_fail_count=${#loss_fail_cases[@]}

    echo -e "\033[31m ---- $case_name total tests : $total_count \033"
    if (( run_fail_count == 0 && loss_fail_count == 0 )); then
        echo -e "\033[32m ---- $case_name all cases Success \033"
    else
        if (( run_fail_count != 0 )); then
            echo -e "\033[31m ---- $case_name runtime failed test : $run_fail_count \033"
            printf '%s\n' "${fail_logs[@]}"
        fi
        if (( loss_fail_count != 0 )); then
            echo -e "\033[31m ---- $case_name loss verification failed test : $loss_fail_count \033"
            printf '%s\n' "${loss_fail_cases[@]}"
        fi
    fi

    # Return to the directory the caller was in.
    cd "$original_path" || { echo "Failed to return to original path: $original_path"; return 1; }
}
####################################
get_diff_TO_case # 获取待执行case列表
case_list=($(awk -v RS=' ' '!a[$1]++' <<< ${case_list[*]})) # 去重并将结果存储回原列表
if [[ ${#case_list[*]} -ne 0 ]];then
Expand Down Expand Up @@ -171,17 +236,8 @@ if [[ ${#case_list[*]} -ne 0 ]];then
let case_num++
fi
echo -e "\033[31m ---- end run case \033"
cd ${log_path}
if [ ! -f *FAIL* ];then
FF=0
EXCODE=0
echo -e "\033[32m ---- all case Success \033"
else
FF=`ls *FAIL*|wc -l`
EXCODE=2
echo -e "\033[31m ---- case Failed number: ${FF} \033"
ls *_FAIL*
fi

track_case_status $FUNCNAME ""
else
echo -e "\033[32m Changed Not CI case, Skips \033"
EXCODE=0
Expand Down
Loading