From a188a306a7e7701e7cff173e6d4766ace3e82b2f Mon Sep 17 00:00:00 2001
From: brightnesss <brightness_j@hotmail.com>
Date: Wed, 21 Aug 2024 14:18:21 +0800
Subject: [PATCH 1/3] upgrade for bscvrq to support non-quant

---
 .../fused/fused_seqpool_cvm_kernel.kps        | 112 +++++++++++++++++-
 1 file changed, 111 insertions(+), 1 deletion(-)
diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps
index 9681e852a8870..e8391de955b57 100644
--- a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps
+++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps
@@ -795,6 +795,26 @@ struct sum_pooling_concate {
     }
 };
 
+// add for bscvrq
+// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=false
+template <typename T>
+struct sum_pooling_concate<T, false, false, true> {
+    static __device__ inline bool filter(T* local_x, 
+                                    int in_dim_size,
+                                    float show_coeff,
+                                    float clk_coeff,
+                                    float threshold,
+                                    int cvm_offset,
+                                    float embed_threshold) {
+        auto &show = local_x[0];
+        auto &click = local_x[1];
+        if ((show - click) * show_coeff + click * clk_coeff < threshold) { 
+            return true;
+        }
+        return false;
+    }
+};
+
 // embedx_concate_filter:true && quant_ratio_valid=true && need_filter=true && embed_threshold_filter=false
 template <typename T>
 struct sum_pooling_concate<T, true, true, false> {
@@ -844,6 +864,96 @@ struct sum_pooling_concate<T, true, true, true> {
     }
 };
 
+// 1)FusedSeqpoolKernelEmbedFilterEmbedxConcate
+// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=true
+// 2)FusedSeqpoolKernelFilterEmbedxConcate
+// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=false
+// 3)FusedSeqpoolKernelEmbedxConcate
+// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=false
+template <typename T, bool use_cvm, bool clk_filter, bool need_filter,
+            bool embed_threshold_filter, typename T2>
+struct do_sum_pooling_and_cvm<T, use_cvm, clk_filter, need_filter, false, embed_threshold_filter, true, T2> {
+    static __device__ inline void kernel(T* local_x,
+                                T* local_result, int local_result_len,
+                                float padding_value,
+                                T2* sum_show_clk,
+                                int batch_start, int batch_end,
+                                int in_dim_size, int out_dim_size,
+                                int dim_start_offset,
+                                int seqid,
+                                int max_seq_len,
+                                int quant_ratio,
+                                float quant_ratio_reciprocal,
+                                float32x16_t &v_scale,
+                                float show_coeff,
+                                float clk_coeff,
+                                float threshold,
+                                int cvm_offset,
+                                float embed_threshold,
+                                int embed_thres_size,
+                                int embedx_concate_size,
+                                bool fix_ctr_to_click,
+                                __global_ptr__ T* cur_x,
+                                __global_ptr__ T* cur_y) {
+        int concate_index = 0;
+        for (int i = batch_start; i < batch_end; i += max_seq_len) {
+            // int len = min<int64_t>(batch_end - i, max_seq_len);
+            int len = min(batch_end - i, max_seq_len);
+            if (len <= 0)
+                continue;
+            mfence();
+
+            for (int j = 0; j < len; j++) {
+                mfence();
+                GM2LM(cur_x + (i + j) * in_dim_size, local_x, in_dim_size * sizeof(T));
+                
+                bool is_filter = sum_pooling_concate<T, false, need_filter, embed_threshold_filter>::filter(
+                    local_x, in_dim_size, show_coeff, clk_coeff, threshold, cvm_offset, embed_threshold);
+                if (is_filter) {
+                    continue;
+                }
+                
+                if (concate_index < embedx_concate_size) {
+                    // first: sum pool
+                    // copy
+
+                    float32x16_t v_src1 = vload_lm_float32x16(local_x);
+                    float32x16_t v_src2 = vload_lm_float32x16(local_x + 16);
+
+                    vstore_lm_float32x16(local_result, v_src1);
+                    vstore_lm_float32x16(local_result + 16, v_src2);
+
+                    mfence_lm();
+                    // cvm_offset = [0, 2]
+                    for (int cvm_i = 0; cvm_i < cvm_offset; cvm_i++) {
+                        local_result[cvm_i] = local_x[cvm_i];
+                    }
+
+                    // second: cvm
+                    int cur_y_index = seqid * embedx_concate_size * out_dim_size + concate_index * out_dim_size;
+                    cvm_engine<T, true, use_cvm, clk_filter, T2>::concat_cvm(local_result,
+                                                        out_dim_size, dim_start_offset,
+                                                        cur_y_index,
+                                                        cur_y);
+                    mfence();
+                    concate_index += 1;
+                }
+            }
+        }
+
+        mfence();
+
+        // second: cvm
+        for (int i = concate_index; i < embedx_concate_size; i++) {
+            memset_value_float(local_result, local_result_len, padding_value);
+            int cur_y_index = seqid * embedx_concate_size * out_dim_size + i * out_dim_size;
+            LM2GM_ASYNC(local_result, cur_y + cur_y_index, out_dim_size * sizeof(T));
+            mfence();
+        }
+    }
+};
+
+
 // 1)FusedSeqpoolKernelEmbedQuantFilterEmbedxConcate
 // embedx_concate_filter:true && quant_ratio_valid=true && need_filter=true && embed_threshold_filter=true
 // 2)FusedSeqpoolKernelQuantFilterEmbedxConcate
@@ -3374,4 +3484,4 @@ template int sequence_sum_pool_cvm_with_conv_grad<float, int>(xpu::Context* ctx,
                                                uint32_t slot_num,
                                                int embedx_concate_size);
 }  // end namespace framework
-}  // end namespace paddle
+}  // end namespace paddle
\ No newline at end of file

From bb24297d56fd861510859b096a53e0ad1b18b58e Mon Sep 17 00:00:00 2001
From: brightnesss <brightness_j@hotmail.com>
Date: Wed, 21 Aug 2024 14:18:21 +0800
Subject: [PATCH 2/3] upgrade for bscvrq to support non-quant

---
 .../fused/fused_seqpool_cvm_kernel.kps        | 148 +++++++++++++++++-
 1 file changed, 146 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps
index e8391de955b57..8b9233c300a33 100644
--- a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps
+++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps
@@ -795,10 +795,50 @@ struct sum_pooling_concate {
     }
 };
 
+// add for bscvrq
+// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=false && embed_threshold_filter=false
+template <typename T>
+struct sum_pooling_concate<T, false, true, false, false> {
+    static __device__ inline bool filter(T* local_x, 
+                                    int in_dim_size,
+                                    float show_coeff,
+                                    float clk_coeff,
+                                    float threshold,
+                                    int cvm_offset,
+                                    float embed_threshold) {
+        auto &show = local_x[0];
+        auto &click = local_x[1];
+        if ((show - click) * show_coeff + click * clk_coeff < threshold) { 
+            return true;
+        }
+        return false;
+    }
+};
+
 // add for bscvrq
 // embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=false
 template <typename T>
-struct sum_pooling_concate<T, false, false, true> {
+struct sum_pooling_concate<T, false, true, true, false> {
+    static __device__ inline bool filter(T* local_x, 
+                                    int in_dim_size,
+                                    float show_coeff,
+                                    float clk_coeff,
+                                    float threshold,
+                                    int cvm_offset,
+                                    float embed_threshold) {
+        auto &show = local_x[0];
+        auto &click = local_x[1];
+        if ((show - click) * show_coeff + click * clk_coeff < threshold) { 
+            return true;
+        }
+        return false;
+    }
+};
+
+// add for bscvrq
+// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=true
+template <typename T>
+struct sum_pooling_concate<T, false, true, true, true> {
     static __device__ inline bool filter(T* local_x, 
                                     int in_dim_size,
                                     float show_coeff,
@@ -811,6 +851,17 @@ struct sum_pooling_concate<T, false, false, true> {
         if ((show - click) * show_coeff + click * clk_coeff < threshold) { 
             return true;
         }
+            
+        auto &embedw = local_x[cvm_offset]; 
+        auto embedx_weight_score = 0.0; 
+        for (int cvm_i = cvm_offset + 1; cvm_i < in_dim_size; cvm_i++) { 
+            embedx_weight_score += local_x[cvm_i] * local_x[cvm_i];
+                // pow(fabs(local_x[cvm_i]), 2); 
+        } 
+        embedx_weight_score = sqrt(embedx_weight_score) + fabs(embedw); 
+        if (embedx_weight_score < embed_threshold) { 
+            return true;
+        } 
         return false;
     }
 };
@@ -907,7 +958,7 @@ struct do_sum_pooling_and_cvm<T, use_cvm, clk_filter, need_filter, false, embed_
                 mfence();
                 GM2LM(cur_x + (i + j) * in_dim_size, local_x, in_dim_size * sizeof(T));
                 
-                bool is_filter = sum_pooling_concate<T, false, need_filter, embed_threshold_filter>::filter(
+                bool is_filter = sum_pooling_concate<T, false, true, need_filter, embed_threshold_filter>::filter(
                     local_x, in_dim_size, show_coeff, clk_coeff, threshold, cvm_offset, embed_threshold);
                 if (is_filter) {
                     continue;
@@ -1058,6 +1109,99 @@ struct do_sum_pooling_and_cvm<T, use_cvm, clk_filter, need_filter, quant_ratio_v
     }
 };
 
+// 1)FusedSeqpoolKernelEmbedFilterEmbedxConcate
+// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=true
+// 2)FusedSeqpoolKernelFilterEmbedxConcate
+// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=false
+// 3)FusedSeqpoolKernelEmbedxConcate
+// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=false
+template <typename T, bool use_cvm, bool clk_filter, bool need_filter, 
+            bool embed_threshold_filter, typename T2>
+struct do_sum_pooling_and_cvm_with_large_dim<T, use_cvm, clk_filter, need_filter, false, embed_threshold_filter, true, T2> {
+    static __device__ inline void kernel(T* local_x,
+                                T* local_result, int local_result_len,
+                                float padding_value,
+                                T2* sum_show_clk,
+                                int batch_start, int batch_end,
+                                int in_dim_size, int out_dim_size,
+                                int dim_start_offset,
+                                int seqid,
+                                int max_seq_len,
+                                int quant_ratio,
+                                float quant_ratio_reciprocal,
+                                float32x16_t &v_scale,
+                                float show_coeff,
+                                float clk_coeff,
+                                float threshold,
+                                int cvm_offset,
+                                float embed_threshold,
+                                int embed_thres_size,
+                                int embedx_concate_size,
+                                bool fix_ctr_to_click,
+                                __global_ptr__ T* cur_x,
+                                __global_ptr__ T* cur_y) {
+        int concate_index = 0;
+        for (int i = batch_start; i < batch_end; i += max_seq_len) {
+            // int len = min<int64_t>(batch_end - i, max_seq_len);
+            int len = min(batch_end - i, max_seq_len);
+            if (len <= 0)
+                continue;
+            mfence();
+
+            for (int j = 0; j < len; j++) {
+                mfence();
+                GM2LM(cur_x + (i + j) * in_dim_size, local_x, in_dim_size * sizeof(T));
+                
+                bool is_filter = sum_pooling_concate<T, false, true, need_filter, embed_threshold_filter>::filter(
+                    local_x, in_dim_size, show_coeff, clk_coeff, threshold, cvm_offset, embed_threshold);
+                if (is_filter) { 
+                    continue; 
+                } 
+    
+               if (concate_index < embedx_concate_size) {
+                    // first: sum pool
+                    // copy
+                    for(int k = 0; k < in_dim_size ; k += 32) {
+                        float32x16_t v_src1 = vload_lm_float32x16(local_x + k);
+                        float32x16_t v_src2 = vload_lm_float32x16(local_x + k + 16);
+
+                        vstore_lm_float32x16(local_result + k, v_src1);
+                        vstore_lm_float32x16(local_result + k + 16, v_src2);
+
+                        mfence_lm();
+                    }
+
+                    mfence_lm();
+                    // cvm_offset = [0, 2]
+                    for (int cvm_i = 0; cvm_i < cvm_offset; cvm_i++) {
+                        local_result[cvm_i] = local_x[cvm_i];
+                        sum_show_clk[cvm_i] = local_x[cvm_i];
+                    }
+
+                    // second: cvm
+                    int cur_y_index = seqid * embedx_concate_size * out_dim_size + concate_index * out_dim_size;
+                    cvm_engine<T, true, use_cvm, clk_filter, T2>::concat_cvm(local_result,
+                                                        out_dim_size, dim_start_offset,
+                                                        cur_y_index,
+                                                        cur_y);
+                    mfence();
+                    concate_index += 1;
+                }
+            }
+        }
+
+        mfence();
+
+        // second: cvm
+        for (int i = concate_index; i < embedx_concate_size; i++) {
+            memset_value_float(local_result, local_result_len, padding_value);
+            int cur_y_index = seqid * embedx_concate_size * out_dim_size + i * out_dim_size;
+            LM2GM_ASYNC(local_result, cur_y + cur_y_index, out_dim_size * sizeof(T));
+            mfence();
+        }
+    }
+};
+
 // 1)FusedSeqpoolKernelEmbedQuantFilterEmbedxConcate
 // embedx_concate_filter:true && quant_ratio_valid=true && need_filter=true && embed_threshold_filter=true
 // 2)FusedSeqpoolKernelQuantFilterEmbedxConcate

From 7ee910511e7e37caeaf99de4102d310c5b34ef12 Mon Sep 17 00:00:00 2001
From: brightnesss <brightness_j@hotmail.com>
Date: Wed, 21 Aug 2024 14:18:21 +0800
Subject: [PATCH 3/3] upgrade for bscvrq to support non-quant

---
 .../fluid/operators/fused/fused_seqpool_cvm_kernel.kps   | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps
index 8b9233c300a33..673650ed3690b 100644
--- a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps
+++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps
@@ -798,7 +798,7 @@ struct sum_pooling_concate {
 // add for bscvrq
 // embedx_concate_filter:true && quant_ratio_valid=false && need_filter=false && embed_threshold_filter=false
 template <typename T>
-struct sum_pooling_concate<T, false, true, false, false> {
+struct sum_pooling_concate<T, false, false, false> {
     static __device__ inline bool filter(T* local_x, 
                                     int in_dim_size,
                                     float show_coeff,
@@ -818,7 +818,7 @@ struct sum_pooling_concate<T, false, true, false, false> {
 // add for bscvrq
 // embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=false
 template <typename T>
-struct sum_pooling_concate<T, false, true, true, false> {
+struct sum_pooling_concate<T, false, true, false> {
     static __device__ inline bool filter(T* local_x, 
                                     int in_dim_size,
                                     float show_coeff,
@@ -838,7 +838,7 @@ struct sum_pooling_concate<T, false, true, true, false> {
 // add for bscvrq
 // embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=true
 template <typename T>
-struct sum_pooling_concate<T, false, true, true, true> {
+struct sum_pooling_concate<T, false, true, true> {
     static __device__ inline bool filter(T* local_x, 
                                     int in_dim_size,
                                     float show_coeff,
@@ -862,6 +862,7 @@ struct sum_pooling_concate<T, false, true, true, true> {
         if (embedx_weight_score < embed_threshold) { 
             return true;
         } 
+        } 
         return false;
     }
 };
@@ -1152,7 +1153,7 @@ struct do_sum_pooling_and_cvm_with_large_dim<T, use_cvm, clk_filter, need_filter
                 mfence();
                 GM2LM(cur_x + (i + j) * in_dim_size, local_x, in_dim_size * sizeof(T));
                 
-                bool is_filter = sum_pooling_concate<T, false, true, need_filter, embed_threshold_filter>::filter(
+                bool is_filter = sum_pooling_concate<T, false, need_filter, embed_threshold_filter>::filter(
                     local_x, in_dim_size, show_coeff, clk_coeff, threshold, cvm_offset, embed_threshold);
                 if (is_filter) { 
                     continue;