PaddlePaddle · wawltor · Nov 4, 2024 · Oct 28, 2024 · Oct 28, 2024 · Oct 29, 2024
diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh
@@ -38,8 +38,39 @@ function is_a100() {
 
 IS_A100=$(is_a100)
 
+# Print a pass/fail summary for the cases whose names start with a prefix.
+# $1: label shown in the summary; $2: case-name prefix ("" selects every case).
+# Reads the logs, *_FAIL* markers and result.log under ${log_path}, then goes
+# back to the caller's directory. Returns 1 only when one of the cd calls fails.
+function track_case_status() {
+    local case_name="$1" prefix="$2" original_path
+    original_path=$(pwd)
+    cd "${log_path}" || { echo "Failed to enter log_path: $log_path"; return 1; }
+    total_count=$(ls -1 "$prefix"* 2>/dev/null | wc -l)
+    # Glob _FAIL* (not _FAIL): markers may carry a suffix; run_ci.sh's old summary used *_FAIL*.
+    run_fail_count=$(ls -1 "$prefix"*_FAIL* 2>/dev/null | wc -l)
+    # Field 2 of a "check failed!" line is the case name; keep only names with this prefix.
+    loss_fail_count=$(grep 'check failed! ' result.log | awk -v prefix="$prefix" '{if ($2 ~ "^" prefix) print $2}'| wc -l)
+    echo -e "\033[31m ---- $case_name total tests :  $total_count \033[0m"
+    if [ $run_fail_count -eq 0 ] && [ $loss_fail_count  -eq 0 ]; then
+        echo -e "\033[32m ---- $case_name all cases Success  \033[0m"
+    else
+        if [[ $run_fail_count -ne 0 ]] ; then
+            echo -e "\033[31m ---- $case_name runtime failed test  :  $run_fail_count \033[0m"
+            ls -1 "$prefix"*_FAIL* 2>/dev/null
+        fi
+        if [[ $loss_fail_count -ne 0 ]] ; then
+            echo -e "\033[31m ---- $case_name loss verification failed test  :  $loss_fail_count \033[0m"
+            grep 'check failed! ' result.log | awk -v prefix="$prefix" '{if ($2 ~ "^" prefix) print $2}'
+        fi
+    fi
+    cd "$original_path" || { echo "Failed to return to original path: $original_path"; return 1; }
+}
+
 # NOTE: Please place the new tests as much as possible after the existing tests
 function llama_case_list_auto() {
+    # The test name must have "llama_" as a prefix, which will 
+    # be used for tracking the execution status of the case.
     llama_dygraph_auto_bs8_fp32_DP2
     llama_dygraph_auto_bs8_fp32_DP2-MP2
     llama_dygraph_auto_bs8_fp32_DP2-MP2-PP2
@@ -55,20 +86,30 @@ function llama_case_list_auto() {
     llama_align_dygraph_dy2st_pir_auto_grad_merge_bs2_fp32_DP1-MP1-PP1
     llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1-MP1-PP4
     llama_align_dygraph_dy2st_pir_auto_pp_bs2_bf16_DP1-MP1-PP4
+
+    track_case_status $FUNCNAME "llama_"
 }
 
 function llm_gpt_case_list_auto() {
+    # Every case listed here must be named with the "llm_gpt_dygraph_auto_" prefix:
+    # track_case_status uses that prefix to pick out this list's logs and failures.
     llm_gpt_dygraph_auto_bs8_fp32_DP2
     llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2
     llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2
     llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2
+
+    track_case_status $FUNCNAME "llm_gpt_dygraph_auto_"
 }
 
 function llm_qwen_case_list_auto() {
+    # Every case listed here must be named with the "llm_qwen_dygraph_auto_" prefix:
+    # track_case_status uses that prefix to pick out this list's logs and failures.
     llm_qwen_dygraph_auto_bs1_fp32_DP2
     llm_qwen_dygraph_auto_bs1_fp32_DP2-MP2
     llm_qwen_dygraph_auto_bs1_fp32_DP2-MP2-PP2
     llm_qwen_dygraph_auto_bs1_bf16_DP2-MP2-PP2
+
+    track_case_status $FUNCNAME "llm_qwen_dygraph_auto_"
 }
 
 ############ case start ############
@@ -2083,20 +2124,20 @@ function check_result() {
     echo -e "$1" >> ${log_path}/result.log
     if [ $? -ne 0 ];then
         echo -e "\033[31m $1 run failed! \033[0m" | tee -a ${log_path}/result.log
-        exit -1
+        return 0
     fi
 
     if [ $# -ne 7 ] && [ $# -ne 8 ]; then
         echo -e "\033[31m $1 parameter transfer failed: $@ \033[0m" | tee -a ${log_path}/result.log
-        exit -1
+        return 0
     fi
 
     diff_loss=$(echo $2 $3|awk '{printf "%0.2f\n", ($2-$1)/$1*100}')
     echo -e "loss_base: $2 loss_test: $3 loss_diff: $diff_loss%" | tee -a ${log_path}/result.log
     if [ $2 != $3 ];then
         if [ -z "$8" ] || [ $8 -ne 1 ] ;then
             echo -e "\033[31m $1 loss diff check failed! \033[0m" | tee -a ${log_path}/result.log
-            exit -1
+            return 0
         else
             diff=$(echo "$2 $3" | awk '{print $1-$2}')
             gt=$(echo "${diff#-} 1e-5" | awk '{print ($1>$2)?"1":"0"}')
@@ -2116,7 +2157,7 @@ function check_result() {
     fi
     if [[ $v2 == 0 ]];then
         echo -e "\033[31m $1 IPS diff check failed! \033[0m" | tee -a $log_path/result.log
-        exit -1
+        return 0
     fi
 
     diff_mem=$(echo $6 $7|awk '{printf "%0.2f\n", ($2-$1)/$1*100}')
@@ -2125,7 +2166,7 @@ function check_result() {
     w2=$(echo $diff_mem -5.0|awk '{print($1<=$2)?"0":"1"}')
     if [[ $w1 == 0 ]];then
         echo -e "\033[31m $1 MEM diff check failed! \033[0m" | tee -a $log_path/result.log
-        exit -1
+        return 0
     fi
     if [[ $w2 == 0 ]];then
         echo -e "$1 MEM decreases greater than 5%, not exit " | tee -a $log_path/result.log

diff --git a/scripts/distribute/ci_case_dy.sh b/scripts/distribute/ci_case_dy.sh
@@ -27,7 +27,38 @@ export llm_gpt_data_path=/llm_gpt_data
 
 unset CUDA_VISIBLE_DEVICES
 
+# Print a pass/fail summary for the cases whose names start with a prefix.
+# $1: label shown in the summary; $2: case-name prefix ("" selects every case).
+# Reads the logs, *_FAIL* markers and result.log under ${log_path}, then goes
+# back to the caller's directory. Returns 1 only when one of the cd calls fails.
+function track_case_status() {
+    local case_name="$1" prefix="$2" original_path
+    original_path=$(pwd)
+    cd "${log_path}" || { echo "Failed to enter log_path: $log_path"; return 1; }
+    total_count=$(ls -1 "$prefix"* 2>/dev/null | wc -l)
+    # Glob _FAIL* (not _FAIL): markers may carry a suffix; run_ci.sh's old summary used *_FAIL*.
+    run_fail_count=$(ls -1 "$prefix"*_FAIL* 2>/dev/null | wc -l)
+    # Field 2 of a "check failed!" line is the case name; keep only names with this prefix.
+    loss_fail_count=$(grep 'check failed! ' result.log | awk -v prefix="$prefix" '{if ($2 ~ "^" prefix) print $2}'| wc -l)
+    echo -e "\033[31m ---- $case_name total tests :  $total_count \033[0m"
+    if [ $run_fail_count -eq 0 ] && [ $loss_fail_count  -eq 0 ]; then
+        echo -e "\033[32m ---- $case_name all cases Success  \033[0m"
+    else
+        if [[ $run_fail_count -ne 0 ]] ; then
+            echo -e "\033[31m ---- $case_name runtime failed test  :  $run_fail_count \033[0m"
+            ls -1 "$prefix"*_FAIL* 2>/dev/null
+        fi
+        if [[ $loss_fail_count -ne 0 ]] ; then
+            echo -e "\033[31m ---- $case_name loss verification failed test  :  $loss_fail_count \033[0m"
+            grep 'check failed! ' result.log | awk -v prefix="$prefix" '{if ($2 ~ "^" prefix) print $2}'
+        fi
+    fi
+    cd "$original_path" || { echo "Failed to return to original path: $original_path"; return 1; }
+}
+
 function gpt_case_list_dygraph(){
+    # The test name must have "gpt_" as a prefix, which will 
+    # be used for tracking the execution status of the case.
     gpt_preprocess_data
     gpt_345M_single
     gpt_1.3B_dp
@@ -50,10 +81,16 @@ function gpt_case_list_dygraph(){
     gpt_345M_single_finetune
     gpt_eval_WikiText
     gpt_eval_LAMBADA
+
+    track_case_status $FUNCNAME "gpt_"
 }
 
 function llm_gpt_case_list_dygraph() {
+    # Every case listed here must be named with the "llm_gpt_" prefix:
+    # track_case_status uses that prefix to pick out this list's logs and failures.
     llm_gpt_recompute_bs32_bf16_MP2-SD4-stage1
+
+    track_case_status $FUNCNAME "llm_gpt_"
 }
 
 ############ case start ############
@@ -465,20 +502,20 @@ function check_result() {
     echo -e "$1" >> ${log_path}/result.log
     if [ $? -ne 0 ];then
         echo -e "\033[31m $1 run failed! \033[0m" | tee -a ${log_path}/result.log
-        exit -1
+        return 0
     fi
 
     if [[ ! $1 =~ "llm" ]]; then
         echo -e "\033 $1 run successfully! \033" | tee -a ${log_path}/result.log
     elif [ $# -ne 7 ]; then
         echo -e "\033[31m $1 parameter transfer failed: $@ \033[0m" | tee -a ${log_path}/result.log
-        exit -1
+        return 0
     else
         diff_loss=$(echo $2 $3|awk '{printf "%0.2f\n", ($2-$1)/$1*100}')
         echo -e "loss_base: $2 loss_test: $3 loss_diff: $diff_loss%" | tee -a ${log_path}/result.log
         if [ $2 != $3 ];then
             echo -e "\033[31m $1 loss diff check failed! \033[0m" | tee -a ${log_path}/result.log
-            exit -1
+            return 0
         fi
 
         diff_ips=$(echo $4 $5|awk '{printf "%0.2f\n", ($2-$1)/$1*100}')
@@ -490,7 +527,7 @@ function check_result() {
         fi
         if [[ $v2 == 0 ]];then
             echo -e "\033[31m $1 IPS diff check failed! \033[0m" | tee -a $log_path/result.log
-            exit -1
+            return 0
         fi
 
         diff_mem=$(echo $6 $7|awk '{printf "%0.2f\n", ($2-$1)/$1*100}')
@@ -499,7 +536,7 @@ function check_result() {
         w2=$(echo $diff_mem -5.0|awk '{print($1<=$2)?"0":"1"}')
         if [[ $w1 == 0 ]];then
             echo -e "\033[31m $1 MEM diff check failed! \033[0m" | tee -a $log_path/result.log
-            exit -1
+            return 0
         fi
         if [[ $w2 == 0 ]];then
             echo -e "$1 MEM decreases greater than 5%, not exit " | tee -a $log_path/result.log

diff --git a/scripts/distribute/run_ci.sh b/scripts/distribute/run_ci.sh
@@ -71,35 +71,71 @@ install_external_ops(){
     python setup.py install
     python -c "import fused_ln;";
 }
+
+function is_a100() {
+    # Print 1 when the nvidia-smi output mentions an A100 GPU, otherwise 0.
+    # The answer goes to stdout so callers can capture it, e.g. $(is_a100).
+    local a100_hit=1
+    [[ $(nvidia-smi | grep -c A100) -eq 0 ]] && a100_hit=0
+    echo "${a100_hit}"
+}
+
+IS_A100=$(is_a100) 
+
 ####################################
 get_diff_TO_case(){
 cd ${nlp_dir}
-for file_name in `git diff --numstat upstream/${AGILE_COMPILE_BRANCH} |awk '{print $NF}'`;do
-    arr_file_name=(${file_name//// })
-    dir1=${arr_file_name[0]}
-    dir2=${arr_file_name[1]}
-    dir3=${arr_file_name[2]}
-    dir4=${arr_file_name[3]}
-    file_item=$dir1/$dir2/$dir3/$dir4
-    echo "file_name:"${file_name}, "path:"${file_item}
-    if [ ! -f ${file_name} ];then # 针对pr删掉文件
-        continue
-    elif [[ ${file_name##*.} == "md" ]] || [[ ${file_name##*.} == "rst" ]] || [[ ${dir1} == "docs" ]];then
-        continue
-    else
-        for ((i=0; i<${#target_lists_for_gpt[@]}; i++)); do
-            if [[ ! ${dir3} =~ "benchmarks" ]] && [[ ${file_item} == *${target_lists_for_gpt[i]}* ]];then
-                case_list[${#case_list[*]}]=gpt-3_auto
-                case_list[${#case_list[*]}]=gpt-3_dygraph
-            fi
-        done
-        for ((i=0; i<${#target_lists_for_llama[@]}; i++)); do
-            if [[ ${file_item} == *${target_lists_for_llama[i]}* ]];then
-                case_list[${#case_list[*]}]=llama_auto
-            fi
-        done
-    fi
-done
+if [ $IS_A100 -ne 0 ];then  # A100 detected: choose every suite from the files this PR changes
+    for file_name in `git diff --numstat upstream/${AGILE_COMPILE_BRANCH} |awk '{print $NF}'`;do  # last numstat field = changed path
+        arr_file_name=(${file_name//// })  # split the path on "/" into its components
+        dir1=${arr_file_name[0]}
+        dir2=${arr_file_name[1]}
+        dir3=${arr_file_name[2]}
+        dir4=${arr_file_name[3]}
+        file_item=$dir1/$dir2/$dir3/$dir4  # first four path components, matched against the target lists
+        echo "file_name:"${file_name}, "path:"${file_item}
+        if [ ! -f ${file_name} ];then # skip files that this PR deleted
+            continue
+        elif [[ ${file_name##*.} == "md" ]] || [[ ${file_name##*.} == "rst" ]] || [[ ${dir1} == "docs" ]];then  # documentation-only change
+            continue
+        else
+            for ((i=0; i<${#target_lists_for_gpt[@]}; i++)); do
+                if [[ ! ${dir3} =~ "benchmarks" ]] && [[ ${file_item} == *${target_lists_for_gpt[i]}* ]];then
+                    case_list[${#case_list[*]}]=gpt-3_auto  # append at the current end of case_list
+                    case_list[${#case_list[*]}]=gpt-3_dygraph
+                fi
+            done
+            for ((i=0; i<${#target_lists_for_llama[@]}; i++)); do
+                if [[ ${file_item} == *${target_lists_for_llama[i]}* ]];then
+                    case_list[${#case_list[*]}]=llama_auto
+                fi
+            done
+        fi
+    done
+else  # no A100 detected: gpt-3_auto and llama_auto always run; only gpt-3_dygraph depends on the diff
+    case_list[${#case_list[*]}]=gpt-3_auto
+    case_list[${#case_list[*]}]=llama_auto
+    for file_name in `git diff --numstat upstream/${AGILE_COMPILE_BRANCH} |awk '{print $NF}'`;do  # last numstat field = changed path
+        arr_file_name=(${file_name//// })  # split the path on "/" into its components
+        dir1=${arr_file_name[0]}
+        dir2=${arr_file_name[1]}
+        dir3=${arr_file_name[2]}
+        dir4=${arr_file_name[3]}
+        file_item=$dir1/$dir2/$dir3/$dir4  # first four path components, matched against the target lists
+        echo "file_name:"${file_name}, "path:"${file_item}
+        if [ ! -f ${file_name} ];then # skip files that this PR deleted
+            continue
+        elif [[ ${file_name##*.} == "md" ]] || [[ ${file_name##*.} == "rst" ]] || [[ ${dir1} == "docs" ]];then  # documentation-only change
+            continue
+        else
+            for ((i=0; i<${#target_lists_for_gpt[@]}; i++)); do
+                if [[ ! ${dir3} =~ "benchmarks" ]] && [[ ${file_item} == *${target_lists_for_gpt[i]}* ]];then
+                    case_list[${#case_list[*]}]=gpt-3_dygraph
+                fi
+            done
+        fi
+    done
+fi
 }
 ####################################
 print_info(){
@@ -129,6 +165,35 @@ function contain_case(){
     return 0
 }
 ####################################
+# Print a pass/fail summary for the cases whose names start with a prefix.
+# $1: label shown in the summary; $2: case-name prefix ("" selects every case).
+# Reads the logs, *_FAIL* markers and result.log under ${log_path}. On any failure
+# it sets the global EXCODE=2, which the inline summary this replaced used to do.
+function track_case_status() {
+    local case_name="$1" prefix="$2" original_path
+    original_path=$(pwd)
+    cd "${log_path}" || { echo "Failed to enter log_path: $log_path"; return 1; }
+    total_count=$(ls -1 "$prefix"* 2>/dev/null | wc -l)
+    run_fail_count=$(ls -1 "$prefix"*_FAIL* 2>/dev/null | wc -l)  # _FAIL*, like the old *_FAIL* glob, also matches suffixed markers
+    # Field 2 of a "check failed!" line is the case name; keep only names with this prefix.
+    loss_fail_count=$(grep 'check failed! ' result.log | awk -v prefix="$prefix" '{if ($2 ~ "^" prefix) print $2}'| wc -l)
+    echo -e "\033[31m ---- $case_name total tests :  $total_count \033[0m"
+    if [ $run_fail_count -eq 0 ] && [ $loss_fail_count  -eq 0 ]; then
+        echo -e "\033[32m ---- $case_name all cases Success  \033[0m"
+    else
+        EXCODE=2  # NOTE(review): assumes the script still finishes by exiting with EXCODE - confirm
+        if [[ $run_fail_count -ne 0 ]] ; then
+            echo -e "\033[31m ---- $case_name runtime failed test  :  $run_fail_count \033[0m"
+            ls -1 "$prefix"*_FAIL* 2>/dev/null
+        fi
+        if [[ $loss_fail_count -ne 0 ]] ; then
+            echo -e "\033[31m ---- $case_name loss verification failed test  :  $loss_fail_count \033[0m"
+            grep 'check failed! ' result.log | awk -v prefix="$prefix" '{if ($2 ~ "^" prefix) print $2}'
+        fi
+    fi
+    cd "$original_path" || { echo "Failed to return to original path: $original_path"; return 1; }
+}
+####################################
 get_diff_TO_case # 获取待执行case列表
 case_list=($(awk -v RS=' ' '!a[$1]++' <<< ${case_list[*]}))  # 去重并将结果存储回原列表
 if [[ ${#case_list[*]} -ne 0 ]];then
@@ -171,17 +236,8 @@ if [[ ${#case_list[*]} -ne 0 ]];then
         let case_num++
     fi
     echo -e "\033[31m ---- end run case  \033"
-    cd ${log_path}
-    if [ ! -f *FAIL* ];then
-        FF=0
-        EXCODE=0
-        echo -e "\033[32m ---- all case Success \033"
-    else
-        FF=`ls *FAIL*|wc -l`
-        EXCODE=2
-        echo -e "\033[31m ---- case Failed number: ${FF} \033"
-        ls *_FAIL*
-    fi
+
+    track_case_status  $FUNCNAME ""
 else
     echo -e "\033[32m Changed Not CI case, Skips \033"
     EXCODE=0