[hybrid] fused_feedforward op support tensor model parallel #40160

Merged
2 changes: 2 additions & 0 deletions paddle/fluid/operators/fused/fused_feedforward_op.cc
@@ -195,6 +195,8 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(false);
AddAttr<int>("dropout1_seed", "Dropout1 random seed.").SetDefault(0);
AddAttr<int>("dropout2_seed", "Dropout2 random seed.").SetDefault(0);
AddAttr<int>("ring_id", "ring id for tensor model parallel.")
.SetDefault(-1);
AddComment(R"DOC(
the function of fused_feedforward operator is the same as the following pseudo code:
residual = src;
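A minimal sketch of why the new ring_id attribute exists (illustration only, not part of this diff; it assumes Megatron-style sharding, with linear1's weight split by columns and linear2's weight split by the matching rows, and it omits activation, bias, and dropout). Under that sharding each rank's second matmul yields only a partial sum, and the ring all-reduce identified by ring_id (sum, like the ncclAllReduce/ncclSum call added in the CUDA kernel below) completes it; the default of -1 means single-card execution and skips the reduce entirely. All names and sizes here are illustrative.

// Sketch (not Paddle code): column-parallel linear1 + row-parallel linear2.
// Summing the per-rank partial outputs reproduces the single-card result.
#include <cassert>
#include <cmath>
#include <vector>

// y[m x n] = x[m x k] * w[k x n]
static std::vector<float> MatMulRef(const std::vector<float>& x, const std::vector<float>& w,
                                    int m, int k, int n) {
  std::vector<float> y(m * n, 0.f);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p) y[i * n + j] += x[i * k + p] * w[p * n + j];
  return y;
}

int main() {
  const int bsz_seq = 2, d_model = 4, dim_ffn = 6, nranks = 2, shard = dim_ffn / nranks;
  std::vector<float> x(bsz_seq * d_model), w1(d_model * dim_ffn), w2(dim_ffn * d_model);
  for (size_t i = 0; i < x.size(); ++i) x[i] = 0.1f * (i + 1);
  for (size_t i = 0; i < w1.size(); ++i) w1[i] = 0.01f * (i + 1);
  for (size_t i = 0; i < w2.size(); ++i) w2[i] = 0.02f * (i + 1);

  // Single-card reference: out = (x * w1) * w2.
  auto ref = MatMulRef(MatMulRef(x, w1, bsz_seq, d_model, dim_ffn), w2, bsz_seq, dim_ffn, d_model);

  // Tensor parallel: rank r owns columns [r*shard, (r+1)*shard) of w1 and the
  // matching rows of w2; its linear2 output is therefore a partial sum.
  std::vector<float> reduced(bsz_seq * d_model, 0.f);
  for (int r = 0; r < nranks; ++r) {
    std::vector<float> w1_shard(d_model * shard), w2_shard(shard * d_model);
    for (int i = 0; i < d_model; ++i)
      for (int j = 0; j < shard; ++j) w1_shard[i * shard + j] = w1[i * dim_ffn + r * shard + j];
    for (int i = 0; i < shard; ++i)
      for (int j = 0; j < d_model; ++j) w2_shard[i * d_model + j] = w2[(r * shard + i) * d_model + j];
    auto partial = MatMulRef(MatMulRef(x, w1_shard, bsz_seq, d_model, shard), w2_shard,
                             bsz_seq, shard, d_model);
    // This accumulation is what the ring all-reduce (sum) performs across devices.
    for (size_t i = 0; i < reduced.size(); ++i) reduced[i] += partial[i];
  }
  for (size_t i = 0; i < ref.size(); ++i) assert(std::fabs(ref[i] - reduced[i]) < 1e-4f);
  return 0;
}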
48 changes: 43 additions & 5 deletions paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -21,11 +21,39 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fused_dropout_helper.h"
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template <typename T>
static void AllReduce(framework::Tensor& tensor, // NOLINT
const int ring_id,
const platform::CUDADeviceContext& ctx) {
if (ring_id == -1) return;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto dtype =
platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype()));
int64_t numel = tensor.numel();
const void* sendbuff = tensor.data<T>();
auto place = ctx.GetPlace();
void* recvbuff = tensor.mutable_data<T>(place);
auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
auto stream = ctx.stream();
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::Unimplemented(
"PaddlePaddle should compile with NCCL or RCCL when used tensor model "
"parallel op."));
#endif
}

template <typename DeviceContext, typename T>
class FusedFeedForwardKernel : public framework::OpKernel<T> {
public:
@@ -56,7 +84,7 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
framework::Tensor* dropout1_out, framework::Tensor* dropout2_out,
const int bsz_seq, const int d_model, const int dim_feedforward,
const std::string& act_method, const bool pre_layer_norm,
const float epsilon1, const float epsilon2,
const float epsilon1, const float epsilon2, const int ring_id,
const DropoutParam& dropout_param1,
const DropoutParam& dropout_param2,
const platform::CUDADeviceContext& ctx) const {
@@ -95,6 +123,10 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
framework::Tensor linear2_out;
linear2_out.mutable_data<T>({bsz_seq, d_model}, place);
MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out);

// tensor model parallel: with sharded FFN weights, linear2_out is this
// rank's partial sum; all-reduce over the ring recovers the full output.
AllReduce<T>(linear2_out, ring_id, ctx);

if (!pre_layer_norm) {
fused_dropout_layernorm_helper.LayernormResidualDropoutBias(
ctx, linear2_out.data<T>(), x.data<T>(), linear2_bias_ptr,
@@ -150,6 +182,7 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {

const float epsilon1 = context.Attr<float>("ln1_epsilon");
const float epsilon2 = context.Attr<float>("ln2_epsilon");
const int ring_id = context.Attr<int>("ring_id");

DropoutParam dropout_param1(context, 1);
DropoutParam dropout_param2(context, 2);
@@ -186,7 +219,7 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance,
linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model,
dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2,
dropout_param1, dropout_param2, context.cuda_device_context());
ring_id, dropout_param1, dropout_param2, context.cuda_device_context());
}
};

@@ -231,7 +264,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
const int dim_feedforward, const DropoutParam& dropout_param1,
const DropoutParam& dropout_param2, const std::string& act_method,
const bool pre_layer_norm, const float epsilon1, const float epsilon2,
const platform::CUDADeviceContext& ctx) const {
const int ring_id, const platform::CUDADeviceContext& ctx) const {
FusedDropoutLayerNormHelper<T, uint8_t> pre_layernorm_helper(
bsz_seq, d_model, epsilon1);
FusedDropoutHelper<T, uint8_t> fused_act_dropout_helper(
@@ -295,13 +328,16 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
d_ln1_out.mutable_data<T>({bsz_seq, d_model}, place);
MatMulGrad(ctx, d_linear1_out, *ln1_out, linear1_weight, &d_ln1_out,
d_linear1_weight);

// tensor model parallel: d_ln1_out is a partial input gradient on each rank;
// sum it over the ring before the layer_norm backward consumes it.
AllReduce<T>(d_ln1_out, ring_id, ctx);
pre_layernorm_helper.LayerNormGrad(
ctx, d_ln1_out.data<T>(), x.data<T>(), ln1_gamma_ptr,
ln1_mean->data<U>(), ln1_variance->data<U>(), d_x->data<T>(),
d_ln1_gamma_ptr, d_ln1_beta_ptr);
} else {
MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight);
// tensor model parallel: likewise, d_x is a partial input gradient.
AllReduce<T>(*d_x, ring_id, ctx);
}
std::vector<const Tensor*> ins(2);
std::vector<Tensor*> outs(1);
@@ -376,6 +412,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {

const float epsilon1 = context.Attr<float>("ln1_epsilon");
const float epsilon2 = context.Attr<float>("ln2_epsilon");
const int ring_id = context.Attr<int>("ring_id");
const std::string act_method = context.Attr<std::string>("act_method");
DropoutParam dropout_param1(context, 1);
DropoutParam dropout_param2(context, 2);
@@ -419,7 +456,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale,
d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model,
dim_feedforward, dropout_param1, dropout_param2, act_method,
pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context());
pre_layer_norm, epsilon1, epsilon2, ring_id,
context.cuda_device_context());
}
};
} // namespace operators
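The mirror-image reason for the all-reduce calls in the grad kernel, again as an illustrative sketch rather than PR code and under the same assumed column-parallel split of linear1: each rank back-propagates through only its weight shard, so its gradient with respect to the layer input (d_ln1_out in the pre_layer_norm branch, d_x otherwise) is a partial sum that must be summed over the ring before it is used downstream.

// Sketch (not Paddle code): backward of a column-parallel linear layer.
// d_x = d_out * w1^T; each rank sees only its column shard of w1 and the
// matching slice of d_out, so its d_x contribution is summed by the all-reduce.
#include <cassert>
#include <cmath>
#include <vector>

int main() {
  const int m = 2, d_model = 4, dim_ffn = 6, nranks = 2, shard = dim_ffn / nranks;
  std::vector<float> w1(d_model * dim_ffn), d_out(m * dim_ffn);
  for (size_t i = 0; i < w1.size(); ++i) w1[i] = 0.01f * (i + 1);
  for (size_t i = 0; i < d_out.size(); ++i) d_out[i] = 0.1f * (i + 1);

  // Reference: d_x[i][p] = sum_j d_out[i][j] * w1[p][j]
  std::vector<float> ref(m * d_model, 0.f);
  for (int i = 0; i < m; ++i)
    for (int p = 0; p < d_model; ++p)
      for (int j = 0; j < dim_ffn; ++j)
        ref[i * d_model + p] += d_out[i * dim_ffn + j] * w1[p * dim_ffn + j];

  // Each rank only covers columns j in [r*shard, (r+1)*shard); the outer
  // accumulation over r is what ncclAllReduce(sum) does across devices.
  std::vector<float> reduced(m * d_model, 0.f);
  for (int r = 0; r < nranks; ++r)
    for (int i = 0; i < m; ++i)
      for (int p = 0; p < d_model; ++p)
        for (int j = r * shard; j < (r + 1) * shard; ++j)
          reduced[i * d_model + p] += d_out[i * dim_ffn + j] * w1[p * dim_ffn + j];

  for (size_t i = 0; i < ref.size(); ++i) assert(std::fabs(ref[i] - reduced[i]) < 1e-5f);
  return 0;
}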
2 changes: 2 additions & 0 deletions python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -23,6 +23,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
list(APPEND DIST_TEST_OPS test_pipeline)
list(APPEND DIST_TEST_OPS test_ir_pass_pipeline)
list(APPEND DIST_TEST_OPS test_static_model_parallel)
list(APPEND DIST_TEST_OPS test_static_model_parallel_fused_feedforward)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
@@ -1139,6 +1140,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32)
set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120)
set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT 120)
set_tests_properties(test_static_model_parallel PROPERTIES TIMEOUT 240)
set_tests_properties(test_static_model_parallel_fused_feedforward PROPERTIES TIMEOUT 120)
set_tests_properties(test_collective_split_embedding
test_collective_split_embedding_none_divisible
test_collective_split_row_linear