From a188a306a7e7701e7cff173e6d4766ace3e82b2f Mon Sep 17 00:00:00 2001 From: brightnesss Date: Wed, 21 Aug 2024 14:18:21 +0800 Subject: [PATCH 1/3] upgrade for bscvrq to support non-quant --- .../fused/fused_seqpool_cvm_kernel.kps | 112 +++++++++++++++++- 1 file changed, 111 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps index 9681e852a8870..e8391de955b57 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps @@ -795,6 +795,26 @@ struct sum_pooling_concate { } }; +// add for bscvrq +// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=false +template +struct sum_pooling_concate { + static __device__ inline bool filter(T* local_x, + int in_dim_size, + float show_coeff, + float clk_coeff, + float threshold, + int cvm_offset, + float embed_threshold) { + auto &show = local_x[0]; + auto &click = local_x[1]; + if ((show - click) * show_coeff + click * clk_coeff < threshold) { + return true; + } + return false; + } +}; + // embedx_concate_filter:true && quant_ratio_valid=true && need_filter=true && embed_threshold_filter=false template struct sum_pooling_concate { @@ -844,6 +864,96 @@ struct sum_pooling_concate { } }; +// 1)FusedSeqpoolKernelEmbedFilterEmbedxConcate +// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=true +// 2)FusedSeqpoolKernelFilterEmbedxConcate +// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=false +// 3)FusedSeqpoolKernelEmbedxConcate +// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=false +template +struct do_sum_pooling_and_cvm { + static __device__ inline void kernel(T* local_x, + T* local_result, int local_result_len, + float padding_value, + T2* sum_show_clk, + int batch_start, int batch_end, + int in_dim_size, int out_dim_size, + int dim_start_offset, + int seqid, + int max_seq_len, + int quant_ratio, + float quant_ratio_reciprocal, + float32x16_t &v_scale, + float show_coeff, + float clk_coeff, + float threshold, + int cvm_offset, + float embed_threshold, + int embed_thres_size, + int embedx_concate_size, + bool fix_ctr_to_click, + __global_ptr__ T* cur_x, + __global_ptr__ T* cur_y) { + int concate_index = 0; + for (int i = batch_start; i < batch_end; i += max_seq_len) { + // int len = min(batch_end - i, max_seq_len); + int len = min(batch_end - i, max_seq_len); + if (len <= 0) + continue; + mfence(); + + for (int j = 0; j < len; j++) { + mfence(); + GM2LM(cur_x + (i + j) * in_dim_size, local_x, in_dim_size * sizeof(T)); + + bool is_filter = sum_pooling_concate::filter( + local_x, in_dim_size, show_coeff, clk_coeff, threshold, cvm_offset, embed_threshold); + if (is_filter) { + continue; + } + + if (concate_index < embedx_concate_size) { + // first: sum pool + // copy + + float32x16_t v_src1 = vload_lm_float32x16(local_x); + float32x16_t v_src2 = vload_lm_float32x16(local_x + 16); + + vstore_lm_float32x16(local_result, v_src1); + vstore_lm_float32x16(local_result + 16, v_src2); + + mfence_lm(); + // cvm_offset = [0, 2] + for (int cvm_i = 0; cvm_i < cvm_offset; cvm_i++) { + local_result[cvm_i] = local_x[cvm_i]; + } + + // second: cvm + int cur_y_index = seqid * embedx_concate_size * out_dim_size + concate_index * out_dim_size; + cvm_engine::concat_cvm(local_result, + out_dim_size, dim_start_offset, + cur_y_index, + cur_y); + mfence(); + concate_index += 1; + } + } + } + + mfence(); + + // second: cvm + for (int i = concate_index; i < embedx_concate_size; i++) { + memset_value_float(local_result, local_result_len, padding_value); + int cur_y_index = seqid * embedx_concate_size * out_dim_size + i * out_dim_size; + LM2GM_ASYNC(local_result, cur_y + cur_y_index, out_dim_size * sizeof(T)); + mfence(); + } + } +}; + + // 1)FusedSeqpoolKernelEmbedQuantFilterEmbedxConcate // embedx_concate_filter:true && quant_ratio_valid=true && need_filter=true && embed_threshold_filter=true // 2)FusedSeqpoolKernelQuantFilterEmbedxConcate @@ -3374,4 +3484,4 @@ template int sequence_sum_pool_cvm_with_conv_grad(xpu::Context* ctx, uint32_t slot_num, int embedx_concate_size); } // end namespace framework -} // end namespace paddle +} // end namespace paddle \ No newline at end of file From bb24297d56fd861510859b096a53e0ad1b18b58e Mon Sep 17 00:00:00 2001 From: brightnesss Date: Wed, 21 Aug 2024 14:18:21 +0800 Subject: [PATCH 2/3] upgrade for bscvrq to support non-quant --- .../fused/fused_seqpool_cvm_kernel.kps | 148 +++++++++++++++++- 1 file changed, 146 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps index e8391de955b57..8b9233c300a33 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps @@ -795,10 +795,50 @@ struct sum_pooling_concate { } }; +// add for bscvrq +// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=false && embed_threshold_filter=false +template +struct sum_pooling_concate { + static __device__ inline bool filter(T* local_x, + int in_dim_size, + float show_coeff, + float clk_coeff, + float threshold, + int cvm_offset, + float embed_threshold) { + auto &show = local_x[0]; + auto &click = local_x[1]; + if ((show - click) * show_coeff + click * clk_coeff < threshold) { + return true; + } + return false; + } +}; + // add for bscvrq // embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=false template -struct sum_pooling_concate { +struct sum_pooling_concate { + static __device__ inline bool filter(T* local_x, + int in_dim_size, + float show_coeff, + float clk_coeff, + float threshold, + int cvm_offset, + float embed_threshold) { + auto &show = local_x[0]; + auto &click = local_x[1]; + if ((show - click) * show_coeff + click * clk_coeff < threshold) { + return true; + } + return false; + } +}; + +// add for bscvrq +// embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=true +template +struct sum_pooling_concate { static __device__ inline bool filter(T* local_x, int in_dim_size, float show_coeff, @@ -811,6 +851,17 @@ struct sum_pooling_concate { if ((show - click) * show_coeff + click * clk_coeff < threshold) { return true; } + + auto &embedw = local_x[cvm_offset]; + auto embedx_weight_score = 0.0; + for (int cvm_i = cvm_offset + 1; cvm_i < in_dim_size; cvm_i++) { + embedx_weight_score += local_x[cvm_i] * local_x[cvm_i]; + // pow(fabs(local_x[cvm_i]), 2); + } + embedx_weight_score = sqrt(embedx_weight_score) + fabs(embedw); + if (embedx_weight_score < embed_threshold) { + return true; + } return false; } }; @@ -907,7 +958,7 @@ struct do_sum_pooling_and_cvm::filter( + bool is_filter = sum_pooling_concate::filter( local_x, in_dim_size, show_coeff, clk_coeff, threshold, cvm_offset, embed_threshold); if (is_filter) { continue; @@ -1058,6 +1109,99 @@ struct do_sum_pooling_and_cvm +struct do_sum_pooling_and_cvm_with_large_dim { + static __device__ inline void kernel(T* local_x, + T* local_result, int local_result_len, + float padding_value, + T2* sum_show_clk, + int batch_start, int batch_end, + int in_dim_size, int out_dim_size, + int dim_start_offset, + int seqid, + int max_seq_len, + int quant_ratio, + float quant_ratio_reciprocal, + float32x16_t &v_scale, + float show_coeff, + float clk_coeff, + float threshold, + int cvm_offset, + float embed_threshold, + int embed_thres_size, + int embedx_concate_size, + bool fix_ctr_to_click, + __global_ptr__ T* cur_x, + __global_ptr__ T* cur_y) { + int concate_index = 0; + for (int i = batch_start; i < batch_end; i += max_seq_len) { + // int len = min(batch_end - i, max_seq_len); + int len = min(batch_end - i, max_seq_len); + if (len <= 0) + continue; + mfence(); + + for (int j = 0; j < len; j++) { + mfence(); + GM2LM(cur_x + (i + j) * in_dim_size, local_x, in_dim_size * sizeof(T)); + + bool is_filter = sum_pooling_concate::filter( + local_x, in_dim_size, show_coeff, clk_coeff, threshold, cvm_offset, embed_threshold); + if (is_filter) { + continue; + } + + if (concate_index < embedx_concate_size) { + // first: sum pool + // copy + for(int k = 0; k < in_dim_size ; k += 32) { + float32x16_t v_src1 = vload_lm_float32x16(local_x + k); + float32x16_t v_src2 = vload_lm_float32x16(local_x + k + 16); + + vstore_lm_float32x16(local_result + k, v_src1); + vstore_lm_float32x16(local_result + k + 16, v_src2); + + mfence_lm(); + } + + mfence_lm(); + // cvm_offset = [0, 2] + for (int cvm_i = 0; cvm_i < cvm_offset; cvm_i++) { + local_result[cvm_i] = local_x[cvm_i]; + sum_show_clk[cvm_i] = local_x[cvm_i]; + } + + // second: cvm + int cur_y_index = seqid * embedx_concate_size * out_dim_size + concate_index * out_dim_size; + cvm_engine::concat_cvm(local_result, + out_dim_size, dim_start_offset, + cur_y_index, + cur_y); + mfence(); + concate_index += 1; + } + } + } + + mfence(); + + // second: cvm + for (int i = concate_index; i < embedx_concate_size; i++) { + memset_value_float(local_result, local_result_len, padding_value); + int cur_y_index = seqid * embedx_concate_size * out_dim_size + i * out_dim_size; + LM2GM_ASYNC(local_result, cur_y + cur_y_index, out_dim_size * sizeof(T)); + mfence(); + } + } +}; + // 1)FusedSeqpoolKernelEmbedQuantFilterEmbedxConcate // embedx_concate_filter:true && quant_ratio_valid=true && need_filter=true && embed_threshold_filter=true // 2)FusedSeqpoolKernelQuantFilterEmbedxConcate From 7ee910511e7e37caeaf99de4102d310c5b34ef12 Mon Sep 17 00:00:00 2001 From: brightnesss Date: Wed, 21 Aug 2024 14:18:21 +0800 Subject: [PATCH 3/3] upgrade for bscvrq to support non-quant --- .../fluid/operators/fused/fused_seqpool_cvm_kernel.kps | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps index 8b9233c300a33..673650ed3690b 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps @@ -798,7 +798,7 @@ struct sum_pooling_concate { // add for bscvrq // embedx_concate_filter:true && quant_ratio_valid=false && need_filter=false && embed_threshold_filter=false template -struct sum_pooling_concate { +struct sum_pooling_concate { static __device__ inline bool filter(T* local_x, int in_dim_size, float show_coeff, @@ -818,7 +818,7 @@ struct sum_pooling_concate { // add for bscvrq // embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=false template -struct sum_pooling_concate { +struct sum_pooling_concate { static __device__ inline bool filter(T* local_x, int in_dim_size, float show_coeff, @@ -838,7 +838,7 @@ struct sum_pooling_concate { // add for bscvrq // embedx_concate_filter:true && quant_ratio_valid=false && need_filter=true && embed_threshold_filter=true template -struct sum_pooling_concate { +struct sum_pooling_concate { static __device__ inline bool filter(T* local_x, int in_dim_size, float show_coeff, @@ -862,6 +862,7 @@ struct sum_pooling_concate { if (embedx_weight_score < embed_threshold) { return true; } + } return false; } }; @@ -1152,7 +1153,7 @@ struct do_sum_pooling_and_cvm_with_large_dim::filter( + bool is_filter = sum_pooling_concate::filter( local_x, in_dim_size, show_coeff, clk_coeff, threshold, cvm_offset, embed_threshold); if (is_filter) { continue;