From 358af3daf891933425c419bbb986280e9a4f0c1b Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 8 Mar 2022 11:35:31 +0000 Subject: [PATCH 01/16] [Phi] Migrate Adam and Adamw into Phi --- .../operators/math/selected_rows_functor.cc | 132 +++- .../operators/math/selected_rows_functor.cu | 89 ++- paddle/fluid/operators/optimizers/adam_op.cc | 158 ++-- paddle/fluid/operators/optimizers/adam_op.cu | 420 ----------- paddle/fluid/operators/optimizers/adam_op.h | 695 ------------------ .../fluid/operators/optimizers/adam_op_npu.cc | 2 +- .../fluid/operators/optimizers/adam_op_xpu.cc | 2 +- paddle/fluid/operators/optimizers/adamw_op.cc | 20 - paddle/fluid/operators/optimizers/adamw_op.cu | 443 ----------- paddle/fluid/operators/optimizers/adamw_op.h | 213 ------ .../operators/optimizers/adamw_op_xpu.cc | 2 +- .../operators/optimizers/merged_adam_op.h | 4 +- paddle/phi/infermeta/multiary.cc | 131 ++++ paddle/phi/infermeta/multiary.h | 49 ++ paddle/phi/kernels/CMakeLists.txt | 4 +- paddle/phi/kernels/adam_kernel.h | 74 ++ paddle/phi/kernels/adamw_kernel.h | 80 ++ paddle/phi/kernels/cpu/adam_kernel.cc | 398 ++++++++++ paddle/phi/kernels/cpu/adamw_kernel.cc | 248 +++++++ paddle/phi/kernels/funcs/adam_functors.h | 548 ++++++++++++++ paddle/phi/kernels/gpu/adam_kernel.cu | 532 ++++++++++++++ paddle/phi/kernels/gpu/adamw_kernel.cu | 581 +++++++++++++++ paddle/phi/ops/compat/adam_sig.cc | 67 ++ paddle/phi/ops/compat/adamw_sig.cc | 70 ++ 24 files changed, 2993 insertions(+), 1969 deletions(-) delete mode 100644 paddle/fluid/operators/optimizers/adam_op.cu delete mode 100644 paddle/fluid/operators/optimizers/adam_op.h delete mode 100644 paddle/fluid/operators/optimizers/adamw_op.cc delete mode 100644 paddle/fluid/operators/optimizers/adamw_op.cu delete mode 100644 paddle/fluid/operators/optimizers/adamw_op.h create mode 100644 paddle/phi/kernels/adam_kernel.h create mode 100644 paddle/phi/kernels/adamw_kernel.h create mode 100644 paddle/phi/kernels/cpu/adam_kernel.cc create mode 100644 paddle/phi/kernels/cpu/adamw_kernel.cc create mode 100644 paddle/phi/kernels/funcs/adam_functors.h create mode 100644 paddle/phi/kernels/gpu/adam_kernel.cu create mode 100644 paddle/phi/kernels/gpu/adamw_kernel.cu create mode 100644 paddle/phi/ops/compat/adam_sig.cc create mode 100644 paddle/phi/ops/compat/adamw_sig.cc diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 5ac39953462b5..977b8dd21d7da 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -294,30 +294,30 @@ template struct SelectedRowsAddToTensor +template typename std::enable_if::value>::type elementwise_add_to( - phi::funcs::BlasT* blas, size_t data_len, - const T* in, T* out) { + phi::funcs::BlasT* blas, size_t data_len, const T* in, + T* out) { blas->AXPY(data_len, T(1.f), in, out); } -template +template typename std::enable_if::value>::type elementwise_add_to( - phi::funcs::BlasT* blas, size_t data_len, - const T* in, T* out) { + phi::funcs::BlasT* blas, size_t data_len, const T* in, + T* out) { for (size_t i = 0; i < data_len; i++) { out[i] += in[i]; } } -template +template typename std::enable_if::value>::type add_sparse_inputs(const std::vector& inputs, const std::unordered_map& rows_to_id, - int64_t input_width, - const platform::CPUDeviceContext& context, T* out_data) { + int64_t input_width, const DeviceContext& context, + T* out_data) { #ifndef PADDLE_WITH_MKLDNN - auto blas = 
phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); #endif for (auto* input : inputs) { if (input->rows().size() == 0) { @@ -336,22 +336,22 @@ add_sparse_inputs(const std::vector& inputs, #else for (size_t i = 0; i < input_rows.size(); i++) { size_t out_i = rows_to_id.at(input_rows[i]); - elementwise_add_to(&blas, static_cast(input_width), - &input_data[i * input_width], - &out_data[out_i * input_width]); + elementwise_add_to( + &blas, static_cast(input_width), &input_data[i * input_width], + &out_data[out_i * input_width]); } #endif } } -template +template typename std::enable_if::value>::type add_sparse_inputs(const std::vector& inputs, const std::unordered_map& rows_to_id, - int64_t input_width, - const platform::CPUDeviceContext& context, T* out_data) { + int64_t input_width, const DeviceContext& context, + T* out_data) { VLOG(4) << "[CPU] add_sparse_inputs <" << typeid(T).name(); - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); for (auto* input : inputs) { if (input->rows().size() == 0) { continue; @@ -361,16 +361,16 @@ add_sparse_inputs(const std::vector& inputs, for (size_t i = 0; i < input_rows.size(); i++) { size_t out_i = rows_to_id.at(input_rows[i]); - elementwise_add_to(&blas, static_cast(input_width), - &input_data[i * input_width], - &out_data[out_i * input_width]); + elementwise_add_to( + &blas, static_cast(input_width), &input_data[i * input_width], + &out_data[out_i * input_width]); } } } -template -struct MergeAdd { - phi::SelectedRows operator()(const platform::CPUDeviceContext& context, +template +struct MergeAddImpl { + phi::SelectedRows operator()(const DeviceContext& context, const phi::SelectedRows& input, const bool sorted_result = false) { phi::SelectedRows out; @@ -378,15 +378,14 @@ struct MergeAdd { return out; } - void operator()(const platform::CPUDeviceContext& context, - const phi::SelectedRows& input, phi::SelectedRows* output, - const bool sorted_result = false) { + void operator()(const DeviceContext& context, const phi::SelectedRows& input, + phi::SelectedRows* output, const bool sorted_result = false) { std::vector inputs; inputs.push_back(&input); (*this)(context, inputs, output, sorted_result); } - void operator()(const platform::CPUDeviceContext& context, + void operator()(const DeviceContext& context, const std::vector& inputs, phi::SelectedRows* output, const bool sorted_result = false) { if (inputs.size() == 0) { @@ -461,7 +460,7 @@ struct MergeAdd { out.set_rows(merge_rows); - phi::funcs::SetConstant constant_functor; + phi::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), static_cast(0.f)); std::unordered_map rows_to_id; @@ -469,11 +468,75 @@ struct MergeAdd { rows_to_id[merge_rows[i]] = i; } - add_sparse_inputs(inputs, rows_to_id, input_width, context, out_data); + add_sparse_inputs(inputs, rows_to_id, input_width, + context, out_data); } } }; +template +struct MergeAdd { + // unary functor, merge by adding duplicated rows in + // the input SelectedRows object. 
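+  //
+  // Note that this specialization only forwards to MergeAddImpl, which holds
+  // the actual merge logic templated on DeviceContext, so the fluid
+  // platform::CPUDeviceContext path here and the phi::CPUContext path below
+  // share a single implementation during the migration.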
+ phi::SelectedRows operator()(const platform::CPUDeviceContext& context, + const phi::SelectedRows& input, + const bool sorted_result) { + return MergeAddImpl()(context, input, + sorted_result); + } + + void operator()(const platform::CPUDeviceContext& context, + const phi::SelectedRows& input, phi::SelectedRows* output, + const bool sorted_result) { + MergeAddImpl()(context, input, output, + sorted_result); + } + + void operator()(const platform::CPUDeviceContext& context, + const std::vector& inputs, + phi::SelectedRows* output, const bool sorted_result) { + MergeAddImpl()(context, inputs, output, + sorted_result); + } +}; + +template +struct MergeAdd { + // unary functor, merge by adding duplicated rows in + // the input SelectedRows object. + phi::SelectedRows operator()(const phi::CPUContext& context, + const phi::SelectedRows& input, + const bool sorted_result) { + return MergeAddImpl()(context, input, sorted_result); + } + + void operator()(const phi::CPUContext& context, + const phi::SelectedRows& input, phi::SelectedRows* output, + const bool sorted_result) { + MergeAddImpl()(context, input, output, sorted_result); + } + + void operator()(const phi::CPUContext& context, + const std::vector& inputs, + phi::SelectedRows* output, const bool sorted_result) { + MergeAddImpl()(context, inputs, output, sorted_result); + } +}; + +#define TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(dtype) \ + template struct MergeAddImpl; \ + template struct MergeAddImpl; \ + template struct MergeAdd; \ + template struct MergeAdd; + +TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(float) +TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(double) +TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(int) +TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(int64_t) +TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(platform::bfloat16) +TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(platform::complex) +TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(platform::complex) + #ifdef PADDLE_WITH_XPU template struct MergeAdd { @@ -714,17 +777,6 @@ struct MergeAverage { } }; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd>; -template struct MergeAdd>; -template struct MergeAdd; - #ifdef PADDLE_WITH_XPU template struct MergeAdd; #endif diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index a4678550cf7bd..16ef013f689c4 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -319,9 +319,9 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows, } } -template -struct MergeAdd { - phi::SelectedRows operator()(const platform::CUDADeviceContext& context, +template +struct MergeAddImpl { + phi::SelectedRows operator()(const DeviceContext& context, const phi::SelectedRows& input, const bool sorted_result = false) { phi::SelectedRows out; @@ -329,9 +329,8 @@ struct MergeAdd { return out; } - void operator()(const platform::CUDADeviceContext& context, - const phi::SelectedRows& input, phi::SelectedRows* output, - const bool sorted_result = false) { + void operator()(const DeviceContext& context, const phi::SelectedRows& input, + phi::SelectedRows* output, const bool sorted_result = false) { framework::Vector input_rows(input.rows()); if (input_rows.size() == 0) { return; @@ -350,7 +349,7 @@ struct MergeAdd { phi::make_ddim({static_cast(merge_rows.size()), input_width}), context.GetPlace()); - phi::funcs::SetConstant constant_functor; + phi::funcs::SetConstant 
constant_functor; constant_functor(context, out.mutable_value(), static_cast(0)); auto* out_data = out.mutable_value()->data(); @@ -369,7 +368,7 @@ struct MergeAdd { mix_vector_out.CopyToCPU(); } - void operator()(const platform::CUDADeviceContext& context, + void operator()(const DeviceContext& context, const std::vector& inputs, phi::SelectedRows* output, const bool sorted_result = false) { if (inputs.size() == 0) { @@ -414,7 +413,7 @@ struct MergeAdd { phi::make_ddim({static_cast(merge_rows.size()), input_width}), context.GetPlace()); - phi::funcs::SetConstant constant_functor; + phi::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), static_cast(0)); auto* out_data = out.mutable_value()->data(); @@ -441,15 +440,69 @@ struct MergeAdd { } }; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd>; -template struct MergeAdd>; +template +struct MergeAdd { + // unary functor, merge by adding duplicated rows in + // the input SelectedRows object. + phi::SelectedRows operator()(const platform::CUDADeviceContext& context, + const phi::SelectedRows& input, + const bool sorted_result) { + return MergeAddImpl()(context, input, + sorted_result); + } + + void operator()(const platform::CUDADeviceContext& context, + const phi::SelectedRows& input, phi::SelectedRows* output, + const bool sorted_result) { + MergeAddImpl()(context, input, output, + sorted_result); + } + + void operator()(const platform::CUDADeviceContext& context, + const std::vector& inputs, + phi::SelectedRows* output, const bool sorted_result) { + MergeAddImpl()(context, inputs, output, + sorted_result); + } +}; + +template +struct MergeAdd { + // unary functor, merge by adding duplicated rows in + // the input SelectedRows object. + phi::SelectedRows operator()(const phi::GPUContext& context, + const phi::SelectedRows& input, + const bool sorted_result) { + return MergeAddImpl()(context, input, sorted_result); + } + + void operator()(const phi::GPUContext& context, + const phi::SelectedRows& input, phi::SelectedRows* output, + const bool sorted_result) { + MergeAddImpl()(context, input, output, sorted_result); + } + + void operator()(const phi::GPUContext& context, + const std::vector& inputs, + phi::SelectedRows* output, const bool sorted_result) { + MergeAddImpl()(context, inputs, output, sorted_result); + } +}; + +#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \ + template struct MergeAddImpl; \ + template struct MergeAddImpl; \ + template struct MergeAdd; \ + template struct MergeAdd; + +TEMPLATE_SPECIALIZED_FOR_MERGEADD(float) +TEMPLATE_SPECIALIZED_FOR_MERGEADD(double) +TEMPLATE_SPECIALIZED_FOR_MERGEADD(int) +TEMPLATE_SPECIALIZED_FOR_MERGEADD(int64_t) +TEMPLATE_SPECIALIZED_FOR_MERGEADD(platform::float16) +TEMPLATE_SPECIALIZED_FOR_MERGEADD(platform::bfloat16) +TEMPLATE_SPECIALIZED_FOR_MERGEADD(platform::complex) +TEMPLATE_SPECIALIZED_FOR_MERGEADD(platform::complex) template __global__ void UpdateToTensorKernel(const T* selected_rows, diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index bcb508cd37d68..8225dc8e07d6a 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -12,125 +12,41 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/optimizers/adam_op.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/optimizers/adamw_op.h" + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -void AdamOp::InferShape(framework::InferShapeContext *ctx) const { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Param"), true, - platform::errors::NotFound("Input(Param) of AdamOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Grad"), true, - platform::errors::NotFound("Input(Grad) of AdamOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Moment1"), true, - platform::errors::NotFound( - "Input(Moment1) of AdamOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Moment2"), true, - platform::errors::NotFound( - "Input(Moment2) of AdamOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true, - platform::errors::NotFound( - "Input(LearningRate) of AdamOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Beta1Pow"), true, - platform::errors::NotFound( - "Input(Beta1Pow) of AdamOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Beta2Pow"), true, - platform::errors::NotFound( - "Input(Beta2Pow) of AdamOp should not be null.")); - - PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true, - platform::errors::NotFound( - "Output(ParamOut) of AdamOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment1Out"), true, - platform::errors::NotFound( - "Output(Moment1Out) of AdamOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment2Out"), true, - platform::errors::NotFound( - "Output(Moment2Out) of AdamOp should not be null.")); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE( - phi::product(lr_dims), 0, - platform::errors::InvalidArgument( - "The number of LearningRate shall not be 0, but received %d. Maybe " - "the Input variable LearningRate has not " - "been initialized. 
You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - phi::product(lr_dims))); - PADDLE_ENFORCE_EQ( - phi::product(lr_dims), 1, - platform::errors::InvalidArgument( - "Learning rate should have 1 dimension, but received %d", - phi::product(lr_dims))); - auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - VLOG(3) << "dims of Beta1Pow : [" << beta1_pow_dims << "]"; - PADDLE_ENFORCE_GE(phi::product(beta1_pow_dims), 1, - platform::errors::InvalidArgument( - "The size of Beta1 power accumulator should be greater " - "than 0, but received %d.", - phi::product(beta1_pow_dims))); - auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); - VLOG(3) << "dims of Beta2Pow : [" << beta2_pow_dims << "]"; - PADDLE_ENFORCE_GE(phi::product(beta2_pow_dims), 1, - platform::errors::InvalidArgument( - "The size of Beta2 power accumulator should be greater " - "than 0, but received %d.", - phi::product(beta2_pow_dims))); +class AdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - auto param_dims = ctx->GetInputDim("Param"); - if (ctx->GetInputsVarType("Grad")[0] == - framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and Grad input of AdamOp should have same dimension. But " - "received Param dims: [%s], Grad dims: [%s].", - param_dims, ctx->GetInputDim("Grad"))); + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = + OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment1"), - platform::errors::InvalidArgument( - "Param and Moment1 input of AdamOp should have same dimension. But " - "received Param dims: [%s], Moment1 dims: [%s].", - param_dims, ctx->GetInputDim("Moment1"))); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment2"), - platform::errors::InvalidArgument( - "Param and Moment2 input of AdamOp should have same dimension. 
But " - "received Param dims: [%s], Moment2 dims: [%s].", - param_dims, ctx->GetInputDim("Moment2"))); - - ctx->SetOutputDim("ParamOut", param_dims); - ctx->SetOutputDim("Moment1Out", param_dims); - ctx->SetOutputDim("Moment2Out", param_dims); - ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims); - ctx->SetOutputDim("Beta2PowOut", beta2_pow_dims); -} -framework::OpKernelType AdamOp::GetExpectedKernelType( - const framework::ExecutionContext &ctx) const { - auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Param"); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); -} - -framework::OpKernelType AdamOp::GetKernelTypeForVar( - const std::string &var_name, const framework::Tensor &tensor, - const framework::OpKernelType &expected_kernel_type) const { - if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || - var_name == "SkipUpdate") { - return expected_kernel_type; - } else { - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || + var_name == "SkipUpdate") { + return expected_kernel_type; + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } } -} +}; class AdamOpMaker : public framework::OpProtoAndCheckerMaker { public: @@ -232,6 +148,10 @@ param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsil } }; +class AdamWOp : public AdamOp { + using AdamOp::AdamOp; +}; + class AdamWOpMaker : public AdamOpMaker { public: void Make() { @@ -255,13 +175,23 @@ class AdamWOpMaker : public AdamOpMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); - -REGISTER_OP_WITHOUT_GRADIENT(adamw, ops::AdamWOp, ops::AdamWOpMaker); -REGISTER_OP_CPU_KERNEL( - adam, ops::AdamOpKernel, - ops::AdamOpKernel); +DECLARE_INFER_SHAPE_FUNCTOR(adam, AdamInferMetaFunctor, + PD_INFER_META(phi::AdamInferMeta)); + +REGISTER_OPERATOR( + adam, ops::AdamOp, ops::AdamOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdamInferMetaFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(adamw, AdamwInferMetaFunctor, + PD_INFER_META(phi::AdamwInferMeta)); +REGISTER_OPERATOR( + adamw, ops::AdamWOp, ops::AdamWOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdamwInferMetaFunctor); REGISTER_OP_VERSION(adam) .AddCheckpoint( diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu deleted file mode 100644 index c1aa392d8a528..0000000000000 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ /dev/null @@ -1,420 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/optimizers/adam_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -__global__ void AdamKernelREG(MT beta1, MT beta2, MT epsilon, MT beta1_pow_, - MT beta2_pow_, const MT* moment1, MT* moment1_out, - const MT* moment2, MT* moment2_out, const MT* lr_, - const T* grad, const T* param, T* param_out, - const MT* master_param, MT* master_param_out, - int ndim) { - MT lr = *lr_; - MT beta1_pow = beta1_pow_; - MT beta2_pow = beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} - -template -__global__ void AdamKernelMEM(MT beta1, MT beta2, MT epsilon, - const MT* beta1_pow_, const MT* beta2_pow_, - const MT* moment1, MT* moment1_out, - const MT* moment2, MT* moment2_out, const MT* lr_, - const T* grad, const T* param, T* param_out, - const MT* master_param, MT* master_param_out, - int ndim) { - MT lr = *lr_; - MT beta1_pow = *beta1_pow_; - MT beta2_pow = *beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} -template -__global__ void UpdateBetaPow(T beta1, T beta2, const T* beta1_pow_, - const T* beta2_pow_, T* beta1_pow_out, - T* beta2_pow_out) { - *beta1_pow_out = beta1 * beta1_pow_[0]; - *beta2_pow_out = beta2 * beta2_pow_[0]; -} - -template -__global__ void SparseAdamCUDAKernelREG( - MT beta1, MT beta2, MT epsilon, const MT beta1_pow, const MT beta2_pow, - const MT* mom1_, MT* mom1_out_, const MT* mom2_, MT* mom2_out_, - const MT* lr_, const T* grad_, const T* param_, T* param_out_, - const MT* master_param, MT* master_param_out, const int64_t* rows_, - int64_t row_numel, int64_t row_count, bool lazy_mode, int ndim) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - MT lr = *lr_; - - for (; id < ndim; id += blockDim.x * gridDim.x) { - auto row_idx = - phi::funcs::BinarySearch(rows_, row_count, id / row_numel); - if (lazy_mode && row_idx < 0) { - return; - } else { - MT mom1 = mom1_[id]; - MT mom2 = mom2_[id]; - MT p = master_param ? master_param[id] : static_cast(param_[id]); - MT g = row_idx >= 0 - ? 
static_cast(grad_[row_idx * row_numel + id % row_numel]) - : static_cast(0); - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = - (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - // Write back to global memory - mom1_out_[id] = mom1; - mom2_out_[id] = mom2; - param_out_[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } - } -} - -template -class AdamOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - - using paddle::framework::LoDTensor; - using MPDType = typename details::MPTypeTrait::Type; - - int64_t min_row_size_to_use_multithread = - ctx.Attr("min_row_size_to_use_multithread"); - bool lazy_mode = ctx.Attr("lazy_mode"); - bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - auto* param = ctx.Input("Param"); - auto* grad_var = ctx.InputVar("Grad"); - auto* mom1 = ctx.Input("Moment1"); - auto* mom2 = ctx.Input("Moment2"); - auto* lr = ctx.Input("LearningRate"); - - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); - - auto* param_out = ctx.Output("ParamOut"); - auto* mom1_out = ctx.Output("Moment1Out"); - auto* mom2_out = ctx.Output("Moment2Out"); - auto* beta1_pow_out = ctx.Output("Beta1PowOut"); - auto* beta2_pow_out = ctx.Output("Beta2PowOut"); - - bool skip_update = false; - if (ctx.HasInput("SkipUpdate")) { - auto* skip_update_tensor = ctx.Input("SkipUpdate"); - PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, - platform::errors::InvalidArgument( - "Input(SkipUpdate) size must be 1, but get %d", - skip_update_tensor->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update_tensor, - ctx.device_context(), &skip_update_vec); - skip_update = skip_update_vec[0]; - } - // skip_update=true, just copy input to output, and TensorCopy will call - // mutable_data - if (skip_update) { - VLOG(4) << "Adam skip update"; - framework::TensorCopy( - *param, ctx.GetPlace(), - ctx.template device_context(), param_out); - framework::TensorCopy( - *mom1, ctx.GetPlace(), - ctx.template device_context(), mom1_out); - framework::TensorCopy( - *mom2, ctx.GetPlace(), - ctx.template device_context(), mom2_out); - framework::TensorCopy( - *beta1_pow, beta1_pow->place(), - ctx.template device_context(), - beta1_pow_out); - framework::TensorCopy( - *beta2_pow, beta2_pow->place(), - ctx.template device_context(), - beta2_pow_out); - return; - } - - MPDType beta1 = static_cast(ctx.Attr("beta1")); - if (ctx.HasInput("Beta1Tensor")) { - auto* beta1_tensor = ctx.Input("Beta1Tensor"); - PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, - platform::errors::InvalidArgument( - "Input(Beta1Tensor) size must be 1, but get %d", - beta1_tensor->numel())); - beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); - } - MPDType beta2 = static_cast(ctx.Attr("beta2")); - if (ctx.HasInput("Beta2Tensor")) { - auto* beta2_tensor = ctx.Input("Beta2Tensor"); - PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, - platform::errors::InvalidArgument( - 
"Input(Beta2Tensor) size must be 1, but get %d", - beta2_tensor->numel())); - beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); - } - MPDType epsilon = static_cast(ctx.Attr("epsilon")); - if (ctx.HasInput("EpsilonTensor")) { - auto* epsilon_tensor = ctx.Input("EpsilonTensor"); - PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, - platform::errors::InvalidArgument( - "Input(EpsilonTensor) size must be 1, but get %d", - epsilon_tensor->numel())); - epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); - } - VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() - << "beta2_pow.numel() : " << beta2_pow->numel(); - VLOG(3) << "param.numel(): " << param->numel(); - PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, - platform::errors::InvalidArgument( - "beta1 pow output size should be 1, but received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, - platform::errors::InvalidArgument( - "beta2 pow output size should be 1, but received " - "value is:%d.", - beta2_pow_out->numel())); - - const bool multi_precision = ctx.Attr("multi_precision"); - const LoDTensor* master_param = nullptr; - LoDTensor* master_param_out = nullptr; - if (multi_precision) { - bool has_master = - ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); - PADDLE_ENFORCE_EQ(has_master, true, - platform::errors::InvalidArgument( - "The Input(MasterParam) and Output(MasterParamOut) " - "should not be null when " - "the attr `multi_precision` is true")); - master_param = ctx.Input("MasterParam"); - master_param_out = ctx.Output("MasterParamOut"); - } - const MPDType* master_in_data = - multi_precision ? master_param->data() : nullptr; - MPDType* master_out_data = - multi_precision - ? master_param_out->mutable_data(ctx.GetPlace()) - : nullptr; - - auto& dev_ctx = ctx.template device_context(); - - if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); - - // update param and moment - int threads = 512; - int blocks = (param->numel() + threads - 1) / threads; - - if (beta1_pow->place() == platform::CPUPlace() && - beta2_pow->place() == platform::CPUPlace()) { - // Compute with betapow in REG - AdamKernelREG<<>>( - beta1, beta2, epsilon, *beta1_pow->data(), - *beta2_pow->data(), mom1->data(), - mom1_out->mutable_data(ctx.GetPlace()), - mom2->data(), - mom2_out->mutable_data(ctx.GetPlace()), - lr->data(), grad->data(), param->data(), - param_out->mutable_data(ctx.GetPlace()), master_in_data, - master_out_data, param->numel()); - if (!use_global_beta_pow) { - // Cpu update - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow->data()[0]; - } - } else { - AdamKernelMEM<<>>( - beta1, beta2, epsilon, beta1_pow->data(), - beta2_pow->data(), mom1->data(), - mom1_out->mutable_data(ctx.GetPlace()), - mom2->data(), - mom2_out->mutable_data(ctx.GetPlace()), - lr->data(), grad->data(), param->data(), - param_out->mutable_data(ctx.GetPlace()), master_in_data, - master_out_data, param->numel()); - if (!use_global_beta_pow) { - // Update with gpu - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, beta2, beta1_pow->data(), - beta2_pow->data(), - beta1_pow_out->mutable_data(ctx.GetPlace()), - beta2_pow_out->mutable_data(ctx.GetPlace())); - } - } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); - if (grad->rows().size() == 0) { - VLOG(3) << "grad row size is 0!!"; - return; - } - - std::vector cpu_rows(grad->rows().begin(), grad->rows().end()); - 
bool is_strict_sorted = true; - for (size_t i = 1; i < cpu_rows.size(); ++i) { - if (cpu_rows[i - 1] >= cpu_rows[i]) { - is_strict_sorted = false; - break; - } - } - - phi::SelectedRows tmp_grad_merge; - const phi::SelectedRows* grad_merge_ptr; - if (is_strict_sorted) { - grad_merge_ptr = grad; - } else { - // merge duplicated rows if any. - // The rows of grad_merge have been sorted inside MergeAdd functor - scatter::MergeAdd merge_func; - merge_func(ctx.template device_context(), - *grad, &tmp_grad_merge, true); - grad_merge_ptr = &tmp_grad_merge; - } - auto& grad_merge = *grad_merge_ptr; - auto& grad_tensor = grad_merge.value(); - const T* grad_data = grad_tensor.template data(); - auto* grad_merge_rows = &grad_merge.rows(); - paddle::framework::MixVector mixv_grad_merge_rows( - grad_merge_rows); - const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); - auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); - - if (beta1_pow->place() == platform::CPUPlace() && - beta2_pow->place() == platform::CPUPlace()) { - int threads = 512; - int ndim = param->numel(); - int blocks = (ndim + threads - 1) / threads; - - SparseAdamCUDAKernelREG< - T, MPDType><<>>( - beta1, beta2, epsilon, *beta1_pow->data(), - *beta2_pow->data(), mom1->data(), - mom1_out->mutable_data(ctx.GetPlace()), - mom2->data(), - mom2_out->mutable_data(ctx.GetPlace()), - lr->data(), grad_data, param->data(), - param_out->mutable_data(ctx.GetPlace()), master_in_data, - master_out_data, rows, row_numel, grad_merge.rows().size(), - lazy_mode, ndim); - if (!use_global_beta_pow) { - // Update with cpu - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow->data()[0]; - } - } else { - SparseAdamFunctor functor( - beta1, beta2, epsilon, beta1_pow->data(), - beta2_pow->data(), mom1->data(), - mom1_out->mutable_data(ctx.GetPlace()), - mom2->data(), - mom2_out->mutable_data(ctx.GetPlace()), - lr->data(), grad_data, param->data(), - param_out->mutable_data(ctx.GetPlace()), master_in_data, - master_out_data, rows, row_numel, grad_merge.rows().size(), - lazy_mode); - - // FIXME(minqiyang): remove BinarySearch in GPU later - platform::ForRange for_range( - static_cast( - ctx.device_context()), - param->numel()); - for_range(functor); - if (!use_global_beta_pow) { - // update beta1 and beta2 - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, beta2, beta1_pow->data(), - beta2_pow->data(), - beta1_pow_out->mutable_data(ctx.GetPlace()), - beta2_pow_out->mutable_data(ctx.GetPlace())); - } - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable type not supported by adam_op")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(adam, ops::AdamOpCUDAKernel, - ops::AdamOpCUDAKernel, - ops::AdamOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h deleted file mode 100644 index decab04f1ca26..0000000000000 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ /dev/null @@ -1,695 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include // for sqrt in CPU and CUDA -#include -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/phi/kernels/funcs/algorithm.h" - -namespace paddle { -namespace operators { - -namespace scatter = paddle::operators::math::scatter; - -static inline float GetAttrFromTensor(const framework::Tensor* tensor) { - const float* tensor_data = tensor->data(); - framework::Tensor cpu_tensor; - if (platform::is_gpu_place(tensor->place())) { - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), - &cpu_tensor); - tensor_data = cpu_tensor.data(); - } - if (platform::is_xpu_place(tensor->place())) { - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), - &cpu_tensor); - tensor_data = cpu_tensor.data(); - } - return tensor_data[0]; -} - -class AdamOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override; - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override; - framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const override; -}; - -struct GPUAdam; -struct CPUAdam; - -template -class AdamFunctor; - -template -class AdamFunctor { - private: - T beta1_; - T beta2_; - T epsilon_; - - const T* beta1_pow_; - const T* beta2_pow_; - const T* moment1_; - T* moment1_out_; - const T* moment2_; - T* moment2_out_; - const T* lr_; - const T* grad_; - const T* param_; - T* param_out_; - - public: - AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, - const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, - T* mom2_out, const T* lr, const T* grad, const T* param, - T* param_out) - : beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - lr_(lr), - grad_(grad), - param_(param), - param_out_(param_out) {} - - inline HOSTDEVICE void operator()(size_t i) const { - // Merge all memory access together. 
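-    // The "Calculation" below folds both bias corrections into lr and
-    // epsilon:
-    //   lr_t = lr * sqrt(1 - beta2_pow) / (1 - beta1_pow)
-    //   p   -= lr_t * mom1 / (sqrt(mom2) + epsilon * sqrt(1 - beta2_pow))
-    // which is algebraically the textbook update
-    //   p -= lr * m_hat / (sqrt(v_hat) + epsilon), with
-    //   m_hat = mom1 / (1 - beta1_pow), v_hat = mom2 / (1 - beta2_pow),
-    // but needs no extra per-element divisions for the corrections.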
- T g = grad_[i]; - T mom1 = moment1_[i]; - T mom2 = moment2_[i]; - T lr = *lr_; - T beta1_pow = *beta1_pow_; - T beta2_pow = *beta2_pow_; - T p = param_[i]; - - // Calculation - lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - - mom1 = beta1_ * mom1 + (1 - beta1_) * g; - mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow))); - - // Write back to global memory - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - param_out_[i] = p; - } -}; - -template -class AdamFunctor { - private: - T beta1_; - T beta2_; - T epsilon_; - - const T* beta1_pow_; - const T* beta2_pow_; - const T* moment1_; - T* moment1_out_; - const T* moment2_; - T* moment2_out_; - const T* lr_; - const T* grad_; - const T* param_; - T* param_out_; - - public: - AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, - const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, - T* mom2_out, const T* lr, const T* grad, const T* param, - T* param_out) - : beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - lr_(lr), - grad_(grad), - param_(param), - param_out_(param_out) {} - - void operator()(size_t numel) const { - Eigen::Map> g{ - grad_, static_cast(numel)}; - Eigen::Map> mom1{ - moment1_, static_cast(numel)}; - Eigen::Map> mom2{ - moment2_, static_cast(numel)}; - Eigen::Map> param{ - param_, static_cast(numel)}; - - Eigen::Map> param_out{ - param_out_, static_cast(numel)}; - Eigen::Map> moment1_out{ - moment1_out_, static_cast(numel)}; - Eigen::Map> moment2_out{ - moment2_out_, static_cast(numel)}; - - T lr = *lr_; - T beta1_pow = *beta1_pow_; - T beta2_pow = *beta2_pow_; - - // Calculation - lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - - moment1_out = beta1_ * mom1 + (1 - beta1_) * g; - moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g; - param_out = param - - lr * (moment1_out / - (moment2_out.sqrt() + epsilon_ * sqrt(1 - beta2_pow))); - } -}; - -template -class SparseAdamFunctor; - -template -class SparseAdamFunctor { - private: - MT beta1_; - MT beta2_; - MT epsilon_; - - const MT* beta1_pow_; - const MT* beta2_pow_; - const MT* moment1_; - MT* moment1_out_; - const MT* moment2_; - MT* moment2_out_; - const MT* lr_; - const T* grad_; - const T* param_; - T* param_out_; - const MT* master_param_; - MT* master_param_out_; - - const int64_t* rows_; - int64_t row_numel_; - int64_t row_count_; - bool lazy_mode_; - - public: - SparseAdamFunctor(MT beta1, MT beta2, MT epsilon, const MT* beta1_pow, - const MT* beta2_pow, const MT* mom1, MT* mom1_out, - const MT* mom2, MT* mom2_out, const MT* lr, const T* grad, - const T* param, T* param_out, const MT* master_param, - MT* master_param_out, const int64_t* rows, - int64_t row_numel, int64_t row_count, bool lazy_mode) - : beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - lr_(lr), - grad_(grad), - param_(param), - param_out_(param_out), - master_param_(master_param), - master_param_out_(master_param_out), - rows_(rows), - row_numel_(row_numel), - row_count_(row_count), - lazy_mode_(lazy_mode) {} - - inline HOSTDEVICE void adam_update(size_t i, MT g) const { - // The following code is the same as dense - MT mom1 = moment1_[i]; - MT mom2 = moment2_[i]; - MT lr = *lr_; - MT beta1_pow = *beta1_pow_; - MT beta2_pow = *beta2_pow_; - MT p = 
master_param_ ? master_param_[i] : static_cast(param_[i]); - - // Calculation - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); - - mom1 = beta1_ * mom1 + (static_cast(1.0) - beta1_) * g; - mom2 = beta2_ * mom2 + (static_cast(1.0) - beta2_) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + - epsilon_ * sqrt(static_cast(1.0) - beta2_pow))); - - // Write back to global memory - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - param_out_[i] = static_cast(p); - if (master_param_out_) { - master_param_out_[i] = p; - } - } - - inline HOSTDEVICE void operator()(size_t i) const { - auto row_idx = - phi::funcs::BinarySearch(rows_, row_count_, i / row_numel_); - if (lazy_mode_ && row_idx < 0) { - return; - } else { - MT g = row_idx >= 0 - ? static_cast(grad_[row_idx * row_numel_ + i % row_numel_]) - : static_cast(0); - adam_update(i, g); - } - } -}; - -template -class SparseAdamFunctor { - private: - T beta1_; - T beta2_; - T epsilon_; - - const T* beta1_pow_; - const T* beta2_pow_; - const T* moment1_; - T* moment1_out_; - const T* moment2_; - T* moment2_out_; - const T* lr_; - const T* grad_; - const T* param_; - T* param_out_; - - const int64_t* rows_; - int64_t row_numel_; - int64_t row_count_; - - public: - SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, - const T* beta2_pow, const T* mom1, T* mom1_out, - const T* mom2, T* mom2_out, const T* lr, const T* grad, - const T* param, T* param_out, const int64_t* rows, - int64_t row_numel, int64_t row_count, bool lazy_mode) - : beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - lr_(lr), - grad_(grad), - param_(param), - param_out_(param_out), - rows_(rows), - row_numel_(row_numel), - row_count_(row_count) {} - - inline HOSTDEVICE void adam_update(size_t i, T g) const { - // The following code is the same as dense - T mom1 = moment1_[i]; - T mom2 = moment2_[i]; - T lr = *lr_; - T beta1_pow = *beta1_pow_; - T beta2_pow = *beta2_pow_; - T p = param_[i]; - - // Calculation - lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - - mom1 = beta1_ * mom1 + (1 - beta1_) * g; - mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow))); - - // Write back to global memory - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - param_out_[i] = p; - } - - inline void operator()(size_t numel) const { - // lr could be reuse - T lr = *lr_; - T beta1_pow = *beta1_pow_; - T beta2_pow = *beta2_pow_; - lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - int64_t row_count = static_cast(numel / row_numel_); - - for (int64_t i = 0, j = 0; i != row_count; ++i) { - if (i == *(rows_ + j)) { - for (int64_t k = 0; k != row_numel_; ++k) { - T g = grad_[j * row_numel_ + k]; - adam_update(i * row_numel_ + k, g); - } - ++j; - } else { - for (int64_t k = 0; k != row_numel_; ++k) { - T mom1 = moment1_[i * row_numel_ + k]; - T mom2 = moment2_[i * row_numel_ + k]; - T p = param_[i * row_numel_ + k]; - - mom1 = beta1_ * mom1; - mom2 = beta2_ * mom2; - - p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); - // Write back to global memory - moment1_out_[i * row_numel_ + k] = mom1; - moment2_out_[i * row_numel_ + k] = mom2; - param_out_[i * row_numel_ + k] = p; - } - } - } - } -}; - -template -class AdamOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - 
PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - - using paddle::framework::LoDTensor; - - int64_t min_row_size_to_use_multithread = - ctx.Attr("min_row_size_to_use_multithread"); - bool lazy_mode = ctx.Attr("lazy_mode"); - bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - auto* param = ctx.Input("Param"); - auto* grad_var = ctx.InputVar("Grad"); - auto* mom1 = ctx.Input("Moment1"); - auto* mom2 = ctx.Input("Moment2"); - auto* lr = ctx.Input("LearningRate"); - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); - - auto* param_out = ctx.Output("ParamOut"); - auto* mom1_out = ctx.Output("Moment1Out"); - auto* mom2_out = ctx.Output("Moment2Out"); - auto* beta1_pow_out = ctx.Output("Beta1PowOut"); - auto* beta2_pow_out = ctx.Output("Beta2PowOut"); - - bool skip_update = false; - if (ctx.HasInput("SkipUpdate")) { - auto* skip_update_tensor = ctx.Input("SkipUpdate"); - PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, - platform::errors::InvalidArgument( - "Input(SkipUpdate) size must be 1, but get %d", - skip_update_tensor->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update_tensor, - ctx.device_context(), &skip_update_vec); - skip_update = skip_update_vec[0]; - } - // skip_update=true, just copy input to output, and TensorCopy will call - // mutable_data - if (skip_update) { - VLOG(4) << "Adam skip update"; - framework::TensorCopy( - *param, ctx.GetPlace(), - ctx.template device_context(), param_out); - framework::TensorCopy( - *mom1, ctx.GetPlace(), - ctx.template device_context(), mom1_out); - framework::TensorCopy( - *mom2, ctx.GetPlace(), - ctx.template device_context(), mom2_out); - framework::TensorCopy( - *beta1_pow, ctx.GetPlace(), - ctx.template device_context(), - beta1_pow_out); - framework::TensorCopy( - *beta2_pow, ctx.GetPlace(), - ctx.template device_context(), - beta2_pow_out); - return; - } - - T beta1 = static_cast(ctx.Attr("beta1")); - if (ctx.HasInput("Beta1Tensor")) { - auto* beta1_tensor = ctx.Input("Beta1Tensor"); - PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, - platform::errors::InvalidArgument( - "Input(Beta1Tensor) size must be 1, but get %d", - beta1_tensor->numel())); - beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); - } - T beta2 = static_cast(ctx.Attr("beta2")); - if (ctx.HasInput("Beta2Tensor")) { - auto* beta2_tensor = ctx.Input("Beta2Tensor"); - PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, - platform::errors::InvalidArgument( - "Input(Beta2Tensor) size must be 1, but get %d", - beta2_tensor->numel())); - beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); - } - T epsilon = static_cast(ctx.Attr("epsilon")); - if (ctx.HasInput("EpsilonTensor")) { - auto* epsilon_tensor = ctx.Input("EpsilonTensor"); - PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, - platform::errors::InvalidArgument( - "Input(EpsilonTensor) size must be 1, but get %d", - epsilon_tensor->numel())); - epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); - } - - VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() - << "beta2_pow.numel() : " << beta2_pow->numel(); - VLOG(3) << "param.numel(): " << param->numel(); - - PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, - platform::errors::InvalidArgument( - "beta1 pow output size should be 1, but received " - "value 
is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, - platform::errors::InvalidArgument( - "beta2 pow output size should be 1, but received " - "value is:%d.", - beta2_pow_out->numel())); - - if (grad_var->IsType()) { - T beta1_p = beta1_pow->data()[0]; - T beta2_p = beta2_pow->data()[0]; - - if (!use_global_beta_pow) { - beta1_pow_out->mutable_data(ctx.GetPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(ctx.GetPlace())[0] = - beta2 * beta2_pow->data()[0]; - } - - auto* grad = ctx.Input("Grad"); - - T* param_out_ptr = param_out->mutable_data(ctx.GetPlace()); - T* mom1_out_ptr = mom1_out->mutable_data(ctx.GetPlace()); - T* mom2_out_ptr = mom2_out->mutable_data(ctx.GetPlace()); - - T learning_rate = lr->data()[0] * (sqrt(1 - beta2_p) / (1 - beta1_p)); - T eps = epsilon * sqrt(1 - beta2_p); - - jit::adam_attr_t attr(beta1, beta2); - int64_t numel = param->numel(); - - const T* param_ptr = param->data(); - const T* mom1_ptr = mom1->data(); - const T* mom2_ptr = mom2->data(); - const T* grad_ptr = grad->data(); - - auto adam = - jit::KernelFuncs, platform::CPUPlace>::Cache().At( - attr); - - static constexpr int64_t chunk_size = 512; - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int64_t i = 0; i < numel / chunk_size; ++i) { - const int64_t offset = i * chunk_size; - adam(beta1, beta2, -learning_rate, eps, chunk_size, grad_ptr + offset, - mom1_ptr + offset, mom2_ptr + offset, param_ptr + offset, - mom1_out_ptr + offset, mom2_out_ptr + offset, - param_out_ptr + offset); - } - - if (numel % chunk_size != 0) { - const int64_t offset = (numel / chunk_size) * chunk_size; - const int64_t tail_numel = numel % chunk_size; - adam(beta1, beta2, -learning_rate, eps, tail_numel, grad_ptr + offset, - mom1_ptr + offset, mom2_ptr + offset, param_ptr + offset, - mom1_out_ptr + offset, mom2_out_ptr + offset, - param_out_ptr + offset); - } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); - if (grad->rows().size() == 0) { - VLOG(3) << "grad row size is 0!!"; - return; - } - - std::vector cpu_rows(grad->rows().begin(), grad->rows().end()); - bool is_strict_sorted = true; - for (size_t i = 1; i < cpu_rows.size(); ++i) { - if (cpu_rows[i - 1] >= cpu_rows[i]) { - is_strict_sorted = false; - break; - } - } - - phi::SelectedRows tmp_grad_merge; - const phi::SelectedRows* grad_merge_ptr; - if (is_strict_sorted) { - grad_merge_ptr = grad; - } else { - // merge duplicated rows if any. 
- // The rows of grad_merge have been sorted inside MergeAdd functor - scatter::MergeAdd merge_func; - merge_func(ctx.template device_context(), *grad, - &tmp_grad_merge, true); - grad_merge_ptr = &tmp_grad_merge; - } - - auto& grad_merge = *grad_merge_ptr; - auto& grad_tensor = grad_merge.value(); - const T* grad_data = grad_tensor.template data(); - auto* grad_merge_rows = &grad_merge.rows(); - paddle::framework::MixVector mixv_grad_merge_rows( - grad_merge_rows); - const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); - auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); - - SparseAdamFunctor functor( - beta1, beta2, epsilon, beta1_pow->data(), beta2_pow->data(), - mom1->data(), mom1_out->mutable_data(ctx.GetPlace()), - mom2->data(), mom2_out->mutable_data(ctx.GetPlace()), - lr->data(), grad_data, param->data(), - param_out->mutable_data(ctx.GetPlace()), rows, row_numel, - grad_merge.rows().size(), lazy_mode); - // update beta1 and beta2 - if (!use_global_beta_pow) { - beta1_pow_out->mutable_data(ctx.GetPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(ctx.GetPlace())[0] = - beta2 * beta2_pow->data()[0]; - } - if (lazy_mode) { - VLOG(3) << "run cpu lazy mode"; - size_t row_count = grad_merge.rows().size(); - std::vector cpu_rows(grad_merge.rows()); - for (size_t row_index = 0; row_index < row_count; ++row_index) { - for (size_t offset = 0; offset < row_numel; ++offset) { - size_t i = cpu_rows[row_index] * row_numel + offset; - functor.adam_update(i, grad_data[row_index * row_numel + offset]); - } - } - } -#ifndef _WIN32 - else if (FLAGS_inner_op_parallelism > 1 && // NOLINT - min_row_size_to_use_multithread > 0 && - param->dims()[0] > min_row_size_to_use_multithread) { - VLOG(3) << "use multi thread, inner_op_parallelism=" - << FLAGS_inner_op_parallelism - << " min_row_size_to_use_multithread=" - << min_row_size_to_use_multithread; - if (FLAGS_inner_op_parallelism > 10) { - VLOG(1) << "FLAGS_inner_op_parallelism " << FLAGS_inner_op_parallelism - << " is two large!"; - } - auto& grad_rows = grad_merge.rows(); - std::unordered_map row_id_to_grad_row_offset; - size_t param_row_count = param->numel() / row_numel; - if (param_row_count < 1000) { - VLOG(1) << "param_row_count should be larger then 1000 to use " - "multi thread, currently " - << param_row_count; - } - for (size_t i = 0; i < grad_rows.size(); ++i) { - row_id_to_grad_row_offset[grad_rows[i]] = i; - } - std::vector> fs; - int64_t line_in_each_thread = - param_row_count / FLAGS_inner_op_parallelism + 1; - for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) { - int64_t start = i * line_in_each_thread; - int64_t end = (i + 1) * line_in_each_thread; - if (start >= static_cast(param_row_count)) { - break; - } - if (end > static_cast(param_row_count)) { - end = static_cast(param_row_count); - } - fs.push_back(framework::Async([&functor, &row_id_to_grad_row_offset, - &grad_data, row_numel, start, end]() { - for (int64_t row_id = start; row_id < end; ++row_id) { - auto iter = row_id_to_grad_row_offset.find(row_id); - if (iter != row_id_to_grad_row_offset.end()) { - for (size_t row_offset = 0U; row_offset < row_numel; - ++row_offset) { - functor.adam_update( - row_id * row_numel + row_offset, - grad_data[iter->second * row_numel + row_offset]); - } - } else { - for (size_t row_offset = 0U; row_offset < row_numel; - ++row_offset) { - functor.adam_update(row_id * row_numel + row_offset, 0); - } - } - } - })); - } - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } -#endif // !_WIN32 
- else { // NOLINT - functor(param->numel()); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable type not supported by adam_op")); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 56c5d48b9f497..1ea91f6ebfa3e 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/optimizers/adam_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 6c47b3906e062..00613f6825ffa 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/optimizers/adam_op.h" #include "gflags/gflags.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/adamw_op.cc b/paddle/fluid/operators/optimizers/adamw_op.cc deleted file mode 100644 index c2111d53f3a45..0000000000000 --- a/paddle/fluid/operators/optimizers/adamw_op.cc +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -namespace ops = paddle::operators; -REGISTER_OP_CPU_KERNEL( - adamw, ops::AdamWOpKernel, - ops::AdamWOpKernel); diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu deleted file mode 100644 index 1d61bdec26d58..0000000000000 --- a/paddle/fluid/operators/optimizers/adamw_op.cu +++ /dev/null @@ -1,443 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/optimizers/adamw_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -__global__ void AdamWKernelREG(MT beta1, MT beta2, MT epsilon, MT coeff, - MT lr_ratio, MT beta1_pow_, MT beta2_pow_, - const MT* moment1, MT* moment1_out, - const MT* moment2, MT* moment2_out, - const MT* lr_, const T* grad, const T* param, - T* param_out, const MT* master_param, - MT* master_param_out, int ndim) { - MT lr = *lr_ * lr_ratio; - MT beta1_pow = beta1_pow_; - MT beta2_pow = beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - - p *= (static_cast(1.0) - lr * coeff); - - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} - -template -__global__ void AdamWKernelMEM( - MT beta1, MT beta2, MT epsilon, MT coeff, MT lr_ratio, const MT* beta1_pow_, - const MT* beta2_pow_, const MT* moment1, MT* moment1_out, const MT* moment2, - MT* moment2_out, const MT* lr_, const T* grad, const T* param, T* param_out, - const MT* master_param, MT* master_param_out, int ndim) { - MT lr = *lr_ * lr_ratio; - MT beta1_pow = *beta1_pow_; - MT beta2_pow = *beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - - p *= (static_cast(1.0) - lr * coeff); - - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} -template -__global__ void UpdateAdamWBetaPow(T beta1, T beta2, const T* beta1_pow_, - const T* beta2_pow_, T* beta1_pow_out, - T* beta2_pow_out) { - *beta1_pow_out = beta1 * beta1_pow_[0]; - *beta2_pow_out = beta2 * beta2_pow_[0]; -} - -template -__global__ void SparseAdamWCUDAKernelREG( - MT beta1, MT beta2, MT epsilon, MT coeff, MT lr_ratio, const MT beta1_pow, - const MT beta2_pow, const MT* mom1_, MT* mom1_out_, const MT* mom2_, - MT* mom2_out_, const MT* lr_, const T* grad_, const T* param_, - T* param_out_, const MT* master_param, MT* master_param_out, - const int64_t* rows_, int64_t row_numel, int64_t row_count, bool lazy_mode, - int ndim) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - MT lr = *lr_ * lr_ratio; - - for (; id < ndim; id += blockDim.x * gridDim.x) { - auto row_idx = - phi::funcs::BinarySearch(rows_, row_count, id / row_numel); - if (lazy_mode && row_idx < 0) { - return; - } else { - MT mom1 = static_cast(mom1_[id]); - MT mom2 = static_cast(mom2_[id]); - - MT p = master_param ? 
master_param[id] : static_cast(param_[id]); - MT g = row_idx >= 0 - ? static_cast(grad_[row_idx * row_numel + id % row_numel]) - : static_cast(0); - - p *= (static_cast(1.0) - lr * coeff); - - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = - (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - // Write back to global memory - mom1_out_[id] = mom1; - mom2_out_[id] = mom2; - param_out_[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } - } -} - -template -class AdamWOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - - using paddle::framework::LoDTensor; - using MPDType = typename details::MPTypeTrait::Type; - - int64_t min_row_size_to_use_multithread = - ctx.Attr("min_row_size_to_use_multithread"); - bool lazy_mode = ctx.Attr("lazy_mode"); - bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - MPDType coeff = static_cast(ctx.Attr("coeff")); - MPDType lr_ratio = static_cast(ctx.Attr("lr_ratio")); - - auto* param = ctx.Input("Param"); - auto* grad_var = ctx.InputVar("Grad"); - auto* mom1 = ctx.Input("Moment1"); - auto* mom2 = ctx.Input("Moment2"); - auto* lr = ctx.Input("LearningRate"); - - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); - - auto* param_out = ctx.Output("ParamOut"); - auto* mom1_out = ctx.Output("Moment1Out"); - auto* mom2_out = ctx.Output("Moment2Out"); - auto* beta1_pow_out = ctx.Output("Beta1PowOut"); - auto* beta2_pow_out = ctx.Output("Beta2PowOut"); - - bool skip_update = false; - if (ctx.HasInput("SkipUpdate")) { - auto* skip_update_tensor = ctx.Input("SkipUpdate"); - PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, - platform::errors::InvalidArgument( - "Input(SkipUpdate) size must be 1, but get %d", - skip_update_tensor->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update_tensor, - ctx.device_context(), &skip_update_vec); - skip_update = skip_update_vec[0]; - } - - // skip_update=true, just copy input to output, and TensorCopy will call - // mutable_data - if (skip_update) { - VLOG(4) << "Adamw skip update"; - framework::TensorCopy( - *param, ctx.GetPlace(), - ctx.template device_context(), param_out); - framework::TensorCopy( - *mom1, ctx.GetPlace(), - ctx.template device_context(), mom1_out); - framework::TensorCopy( - *mom2, ctx.GetPlace(), - ctx.template device_context(), mom2_out); - framework::TensorCopy( - *beta1_pow, ctx.GetPlace(), - ctx.template device_context(), - beta1_pow_out); - framework::TensorCopy( - *beta2_pow, ctx.GetPlace(), - ctx.template device_context(), - beta2_pow_out); - return; - } - - // if with_decay = false, coeff = 0 - bool with_decay = ctx.Attr("with_decay"); - if (!with_decay) { - coeff = static_cast(0.0); - } - - MPDType beta1 = static_cast(ctx.Attr("beta1")); - if (ctx.HasInput("Beta1Tensor")) { - auto* beta1_tensor = ctx.Input("Beta1Tensor"); - PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, - platform::errors::InvalidArgument( - 
"Input(Beta1Tensor) size must be 1, but get %d", - beta1_tensor->numel())); - beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); - } - MPDType beta2 = static_cast(ctx.Attr("beta2")); - if (ctx.HasInput("Beta2Tensor")) { - auto* beta2_tensor = ctx.Input("Beta2Tensor"); - PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, - platform::errors::InvalidArgument( - "Input(Beta2Tensor) size must be 1, but get %d", - beta2_tensor->numel())); - beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); - } - MPDType epsilon = static_cast(ctx.Attr("epsilon")); - if (ctx.HasInput("EpsilonTensor")) { - auto* epsilon_tensor = ctx.Input("EpsilonTensor"); - PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, - platform::errors::InvalidArgument( - "Input(EpsilonTensor) size must be 1, but get %d", - epsilon_tensor->numel())); - epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); - } - VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() - << "beta2_pow.numel() : " << beta2_pow->numel(); - VLOG(3) << "param.numel(): " << param->numel(); - PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, - platform::errors::InvalidArgument( - "beta1 pow output size should be 1, but received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, - platform::errors::InvalidArgument( - "beta2 pow output size should be 1, but received " - "value is:%d.", - beta2_pow_out->numel())); - - const bool multi_precision = ctx.Attr("multi_precision"); - const LoDTensor* master_param = nullptr; - LoDTensor* master_param_out = nullptr; - if (multi_precision) { - bool has_master = - ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); - PADDLE_ENFORCE_EQ(has_master, true, - platform::errors::InvalidArgument( - "The Input(MasterParam) and Output(MasterParamOut) " - "should not be null when " - "the attr `multi_precision` is true")); - master_param = ctx.Input("MasterParam"); - master_param_out = ctx.Output("MasterParamOut"); - } - const MPDType* master_in_data = - multi_precision ? master_param->data() : nullptr; - MPDType* master_out_data = - multi_precision - ? 
master_param_out->mutable_data(ctx.GetPlace()) - : nullptr; - - auto& dev_ctx = ctx.template device_context(); - - if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); - - // update param and moment - int threads = 512; - int blocks = (param->numel() + threads - 1) / threads; - - if (beta1_pow->place() == platform::CPUPlace() && - beta2_pow->place() == platform::CPUPlace()) { - // Compute with betapow in REG - AdamWKernelREG<<>>( - beta1, beta2, epsilon, coeff, lr_ratio, *beta1_pow->data(), - *beta2_pow->data(), mom1->data(), - mom1_out->mutable_data(ctx.GetPlace()), - mom2->data(), - mom2_out->mutable_data(ctx.GetPlace()), - lr->data(), grad->data(), param->data(), - param_out->mutable_data(ctx.GetPlace()), master_in_data, - master_out_data, param->numel()); - if (!use_global_beta_pow) { - // Cpu update - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow->data()[0]; - } - } else { - AdamWKernelMEM<<>>( - beta1, beta2, epsilon, coeff, lr_ratio, beta1_pow->data(), - beta2_pow->data(), mom1->data(), - mom1_out->mutable_data(ctx.GetPlace()), - mom2->data(), - mom2_out->mutable_data(ctx.GetPlace()), - lr->data(), grad->data(), param->data(), - param_out->mutable_data(ctx.GetPlace()), master_in_data, - master_out_data, param->numel()); - if (!use_global_beta_pow) { - // Update with gpu - UpdateAdamWBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, beta2, beta1_pow->data(), - beta2_pow->data(), - beta1_pow_out->mutable_data(ctx.GetPlace()), - beta2_pow_out->mutable_data(ctx.GetPlace())); - } - } - } else if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); - if (grad->rows().size() == 0) { - VLOG(3) << "grad row size is 0!!"; - return; - } - - std::vector cpu_rows(grad->rows().begin(), grad->rows().end()); - bool is_strict_sorted = true; - for (size_t i = 1; i < cpu_rows.size(); ++i) { - if (cpu_rows[i - 1] >= cpu_rows[i]) { - is_strict_sorted = false; - break; - } - } - - phi::SelectedRows tmp_grad_merge; - const phi::SelectedRows* grad_merge_ptr; - if (is_strict_sorted) { - grad_merge_ptr = grad; - } else { - // merge duplicated rows if any. 
- // The rows of grad_merge have been sorted inside MergeAdd functor - scatter::MergeAdd merge_func; - merge_func(ctx.template device_context(), - *grad, &tmp_grad_merge, true); - grad_merge_ptr = &tmp_grad_merge; - } - auto& grad_merge = *grad_merge_ptr; - auto& grad_tensor = grad_merge.value(); - const T* grad_data = grad_tensor.template data(); - auto* grad_merge_rows = &grad_merge.rows(); - paddle::framework::MixVector mixv_grad_merge_rows( - grad_merge_rows); - const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); - auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); - - if (beta1_pow->place() == platform::CPUPlace() && - beta2_pow->place() == platform::CPUPlace()) { - int threads = 512; - int ndim = param->numel(); - int blocks = (ndim + threads - 1) / threads; - - SparseAdamWCUDAKernelREG< - T, MPDType><<>>( - beta1, beta2, epsilon, coeff, lr_ratio, *beta1_pow->data(), - *beta2_pow->data(), mom1->data(), - mom1_out->mutable_data(ctx.GetPlace()), - mom2->data(), - mom2_out->mutable_data(ctx.GetPlace()), - lr->data(), grad_data, param->data(), - param_out->mutable_data(ctx.GetPlace()), master_in_data, - master_out_data, rows, row_numel, grad_merge.rows().size(), - lazy_mode, ndim); - if (!use_global_beta_pow) { - // Update with cpu - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow->data()[0]; - } - } else { - SparseAdamWFunctor functor( - beta1, beta2, epsilon, coeff, lr_ratio, beta1_pow->data(), - beta2_pow->data(), mom1->data(), - mom1_out->mutable_data(ctx.GetPlace()), - mom2->data(), - mom2_out->mutable_data(ctx.GetPlace()), - lr->data(), grad_data, param->data(), - param_out->mutable_data(ctx.GetPlace()), master_in_data, - master_out_data, rows, row_numel, grad_merge.rows().size(), - lazy_mode); - - // FIXME(minqiyang): remove BinarySearch in GPU later - platform::ForRange for_range( - static_cast( - ctx.device_context()), - param->numel()); - for_range(functor); - if (!use_global_beta_pow) { - // update beta1 and beta2 - UpdateAdamWBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, beta2, beta1_pow->data(), - beta2_pow->data(), - beta1_pow_out->mutable_data(ctx.GetPlace()), - beta2_pow_out->mutable_data(ctx.GetPlace())); - } - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable type not supported by adamw_op")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(adamw, ops::AdamWOpCUDAKernel, - ops::AdamWOpCUDAKernel, - ops::AdamWOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/adamw_op.h b/paddle/fluid/operators/optimizers/adamw_op.h deleted file mode 100644 index 91ab58b0577bd..0000000000000 --- a/paddle/fluid/operators/optimizers/adamw_op.h +++ /dev/null @@ -1,213 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include - -namespace paddle { -namespace operators { - -class AdamWOp : public AdamOp { - using AdamOp::AdamOp; -}; - -struct GPUAdamW; -struct CPUAdamW; - -template -class AdamWFunctor; - -template -class AdamWFunctor { - private: - const T coeff_; - const T lr_ratio_; - const T* lr_; - T* param_; - - public: - AdamWFunctor(const T coeff, const T lr_ratio, const T* lr, T* param) - : coeff_(coeff), lr_ratio_(lr_ratio), lr_(lr), param_(param) {} - - inline HOSTDEVICE void operator()(size_t numel) const { - Eigen::Map> param{ - param_, static_cast(numel)}; - - T lr = *lr_; - - // Calculation - param -= lr * lr_ratio_ * coeff_ * param; - } -}; - -template -class SparseAdamWFunctor; - -template -class SparseAdamWFunctor { - private: - MT beta1_; - MT beta2_; - MT epsilon_; - MT coeff_; - MT lr_ratio_; - - const MT* beta1_pow_; - const MT* beta2_pow_; - const MT* moment1_; - MT* moment1_out_; - const MT* moment2_; - MT* moment2_out_; - const MT* lr_; - const T* grad_; - const T* param_; - T* param_out_; - const MT* master_param_; - MT* master_param_out_; - - const int64_t* rows_; - int64_t row_numel_; - int64_t row_count_; - bool lazy_mode_; - - public: - SparseAdamWFunctor(MT beta1, MT beta2, MT epsilon, MT coeff, MT lr_ratio, - const MT* beta1_pow, const MT* beta2_pow, const MT* mom1, - MT* mom1_out, const MT* mom2, MT* mom2_out, const MT* lr, - const T* grad, const T* param, T* param_out, - const MT* master_param, MT* master_param_out, - const int64_t* rows, int64_t row_numel, int64_t row_count, - bool lazy_mode) - : beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - coeff_(coeff), - lr_ratio_(lr_ratio), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - lr_(lr), - grad_(grad), - param_(param), - param_out_(param_out), - master_param_(master_param), - master_param_out_(master_param_out), - rows_(rows), - row_numel_(row_numel), - row_count_(row_count), - lazy_mode_(lazy_mode) {} - - inline HOSTDEVICE void adamw_update(size_t i, MT g) const { - // The following code is the same as dense - MT mom1 = moment1_[i]; - MT mom2 = moment2_[i]; - MT lr = *lr_ * lr_ratio_; - MT lr_orig = lr; - MT beta1_pow = *beta1_pow_; - MT beta2_pow = *beta2_pow_; - MT p = master_param_ ? master_param_[i] : static_cast(param_[i]); - - // Calculation - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); - - mom1 = beta1_ * mom1 + (static_cast(1.0) - beta1_) * g; - mom2 = beta2_ * mom2 + (static_cast(1.0) - beta2_) * g * g; - p -= lr_orig * coeff_ * p; - p -= lr * (mom1 / (sqrt(mom2) + - epsilon_ * sqrt(static_cast(1.0) - beta2_pow))); - - // Write back to global memory - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - param_out_[i] = static_cast(p); - if (master_param_out_) { - master_param_out_[i] = p; - } - } - - inline HOSTDEVICE void operator()(size_t i) const { - auto row_idx = - phi::funcs::BinarySearch(rows_, row_count_, i / row_numel_); - if (lazy_mode_ && row_idx < 0) { - return; - } else { - MT g = row_idx >= 0 - ? 
static_cast(grad_[row_idx * row_numel_ + i % row_numel_]) - : static_cast(0); - adamw_update(i, g); - } - } -}; - -template -class AdamWOpKernel : public AdamOpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - - using paddle::framework::LoDTensor; - bool skip_update = false; - // TODO(liupeng): - if (ctx.HasInput("SkipUpdate")) { - VLOG(3) << "Has SkipUpdate"; - auto* skip_update_tensor = ctx.Input("SkipUpdate"); - PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, - platform::errors::InvalidArgument( - "Input(SkipUpdate) size must be 1, but get %d", - skip_update_tensor->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update_tensor, - ctx.device_context(), &skip_update_vec); - skip_update = skip_update_vec[0]; - } - VLOG(3) << "Skip update" << skip_update; - bool with_decay = ctx.Attr("with_decay"); - - if (skip_update || !with_decay) { - AdamOpKernel::Compute(ctx); - return; - } - - T coeff = static_cast(ctx.Attr("coeff")); - T lr_ratio = static_cast(ctx.Attr("lr_ratio")); - auto* lr = ctx.Input("LearningRate"); - - LoDTensor* param; - - if (ctx.HasInput("MasterParam")) { - // TODO(liupeng): master - param = const_cast(ctx.Input("MasterParam")); - } else { - param = const_cast(ctx.Input("Param")); - } - - AdamWFunctor functor(coeff, lr_ratio, lr->data(), - param->data()); - functor(param->numel()); - - AdamOpKernel::Compute(ctx); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc index 56fa11d2b0857..5f2bcac31fa25 100644 --- a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gflags/gflags.h" -#include "paddle/fluid/operators/optimizers/adam_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.h b/paddle/fluid/operators/optimizers/merged_adam_op.h index c9417158fe772..b7aad68e44940 100644 --- a/paddle/fluid/operators/optimizers/merged_adam_op.h +++ b/paddle/fluid/operators/optimizers/merged_adam_op.h @@ -11,7 +11,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/optimizers/adam_op.h" +#include "paddle/phi/kernels/funcs/adam_functors.h" namespace paddle { namespace operators { @@ -82,7 +82,7 @@ class MergedAdamOpKernel : public framework::OpKernel { size_t param_num = param.size(); for (size_t idx = 0; idx < param_num; idx++) { - AdamFunctor functor( + phi::funcs::AdamFunctor functor( beta1, beta2, epsilon, beta1_pow[idx]->data(), beta2_pow[idx]->data(), mom1[idx]->data(), mom1_out[idx]->mutable_data(ctx.GetPlace()), mom2[idx]->data(), diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index a21f077c09f09..8ceda722794a0 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -120,6 +120,137 @@ void AdadeltaInferMeta(const MetaTensor& param, avg_squared_update_out->set_dtype(avg_squared_update.dtype()); } +void AdamInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment1, + const MetaTensor& moment2, + const MetaTensor& beta1_pow, + const MetaTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + MetaTensor* param_out, + MetaTensor* moment1_out, + MetaTensor* moment2_out, + MetaTensor* beta1_pow_out, + MetaTensor* beta2_pow_out, + MetaTensor* master_param_outs) { + auto lr_dims = learning_rate.dims(); + PADDLE_ENFORCE_EQ( + phi::product(lr_dims), + 1, + errors::InvalidArgument( + "The number of LearningRate shall be 1, but received %d. Maybe " + "the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.", + phi::product(lr_dims))); + auto beta1_pow_dims = beta1_pow.dims(); + VLOG(3) << "dims of Beta1Pow : [" << beta1_pow_dims << "]"; + PADDLE_ENFORCE_GE(phi::product(beta1_pow_dims), + 1, + errors::InvalidArgument( + "The size of Beta1 power accumulator should be greater " + "than 0, but received %d.", + phi::product(beta1_pow_dims))); + auto beta2_pow_dims = beta2_pow.dims(); + VLOG(3) << "dims of Beta2Pow : [" << beta2_pow_dims << "]"; + PADDLE_ENFORCE_GE(phi::product(beta2_pow_dims), + 1, + errors::InvalidArgument( + "The size of Beta2 power accumulator should be greater " + "than 0, but received %d.", + phi::product(beta2_pow_dims))); + + auto param_dims = param.dims(); + PADDLE_ENFORCE_EQ( + param_dims, + moment1.dims(), + errors::InvalidArgument( + "Param and Moment1 input of AdamOp should have same dimension. But " + "received Param dims: [%s], Moment1 dims: [%s].", + param_dims, + moment1.dims())); + PADDLE_ENFORCE_EQ( + param_dims, + moment2.dims(), + errors::InvalidArgument( + "Param and Moment2 input of AdamOp should have same dimension. 
But " + "received Param dims: [%s], Moment2 dims: [%s].", + param_dims, + moment2.dims())); + + param_out->set_dims(param_dims); + param_out->set_dtype(param.dtype()); + + moment1_out->set_dims(param_dims); + moment1_out->set_dtype(moment1.dtype()); + moment2_out->set_dims(param_dims); + moment2_out->set_dtype(moment2.dtype()); + + beta1_pow_out->set_dims(beta1_pow_dims); + beta1_pow_out->set_dtype(beta1_pow.dtype()); + beta2_pow_out->set_dims(beta2_pow_dims); + beta2_pow_out->set_dtype(beta2_pow.dtype()); +} + +void AdamwInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment1, + const MetaTensor& moment2, + const MetaTensor& beta1_pow, + const MetaTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + MetaTensor* param_out, + MetaTensor* moment1_out, + MetaTensor* moment2_out, + MetaTensor* beta1_pow_out, + MetaTensor* beta2_pow_out, + MetaTensor* master_param_outs) { + AdamInferMeta(param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); +} + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 8cb6f70481de3..7db387a5bf855 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -63,4 +63,53 @@ void AdadeltaInferMeta(const MetaTensor& param, MetaTensor* avg_squared_grad_out, MetaTensor* avg_squared_update_out); +void AdamInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment1, + const MetaTensor& moment2, + const MetaTensor& beta1_pow, + const MetaTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + MetaTensor* param_out, + MetaTensor* moment1_out, + MetaTensor* moment2_out, + MetaTensor* beta1_pow_out, + MetaTensor* beta2_pow_out, + MetaTensor* master_param_outs); + +void AdamwInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment1, + const MetaTensor& moment2, + const MetaTensor& beta1_pow, + const MetaTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + MetaTensor* param_out, + MetaTensor* moment1_out, + MetaTensor* moment2_out, + MetaTensor* beta1_pow_out, + MetaTensor* beta2_pow_out, + MetaTensor* master_param_outs); + } // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 16fae8d879cc3..14b99337151de 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ 
b/paddle/phi/kernels/CMakeLists.txt @@ -27,11 +27,13 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel) +set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel adam_kernel adamw_kernel) kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) +kernel_library(adam_kernel DEPS ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper gflags glog) +kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) # 4. auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/adam_kernel.h b/paddle/phi/kernels/adam_kernel.h new file mode 100644 index 0000000000000..1d1f2d81e0c4b --- /dev/null +++ b/paddle/phi/kernels/adam_kernel.h @@ -0,0 +1,74 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
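+
+// adam_kernel.h declares the Phi Adam kernels: AdamDenseKernel updates
+// parameters from dense gradients, while AdamDenseParamSparseGradKernel
+// consumes SelectedRows (sparse) gradients; both carry the full fluid
+// adam attribute set as explicit arguments.
+//
+// A minimal call sketch, for orientation only (tensor construction and
+// the optional master_param/skip_update plumbing are assumed here, not
+// prescribed by this patch):
+//
+//   phi::AdamDenseKernel<float>(
+//       dev_ctx, param, grad, learning_rate, moment1, moment2,
+//       beta1_pow, beta2_pow, master_param, skip_update,
+//       beta1, beta2, epsilon, /*lazy_mode=*/false,
+//       /*min_row_size_to_use_multithread=*/1000,
+//       /*multi_precision=*/false, /*use_global_beta_pow=*/false,
+//       &param_out, &moment1_out, &moment2_out,
+//       &beta1_pow_out, &beta2_pow_out, &master_param_outs);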
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { + +template +void AdamDenseKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs); + +template +void AdamDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs); + +} // namespace phi diff --git a/paddle/phi/kernels/adamw_kernel.h b/paddle/phi/kernels/adamw_kernel.h new file mode 100644 index 0000000000000..088128158965d --- /dev/null +++ b/paddle/phi/kernels/adamw_kernel.h @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
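+
+// adamw_kernel.h declares the Phi AdamW kernels. They mirror the Adam
+// kernels above and add the decoupled weight decay controls lr_ratio,
+// coeff, and with_decay; with with_decay == false the update falls back
+// to plain Adam (see cpu/adamw_kernel.cc).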
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { + +template +void AdamwDenseKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs); + +template +void AdamwDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/adam_kernel.cc b/paddle/phi/kernels/cpu/adam_kernel.cc new file mode 100644 index 0000000000000..2fb2ea743c8e8 --- /dev/null +++ b/paddle/phi/kernels/cpu/adam_kernel.cc @@ -0,0 +1,398 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
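+
+// CPU Adam kernels. The dense path folds the bias corrections into the
+// learning rate and epsilon and then applies the JIT adam kernel over
+// 512-element chunks (OpenMP-parallel when built with MKLML), with a
+// scalar tail for the remainder. The sparse path merges duplicated or
+// unsorted rows via MergeAdd, then runs SparseAdamFunctor either lazily,
+// multi-threaded (FLAGS_inner_op_parallelism), or serially.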
+ +#include +#include +#include + +#include "paddle/phi/kernels/adam_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/adam_functors.h" + +DECLARE_int32(inner_op_parallelism); + +namespace phi { + +template +void AdamDenseKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + bool skip_update_ = false; + if (skip_update.is_initialized()) { + PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update_) { + VLOG(4) << "Adam skip update"; + paddle::framework::TensorCopy( + param, dev_ctx.GetPlace(), dev_ctx, param_out); + paddle::framework::TensorCopy( + moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); + paddle::framework::TensorCopy( + moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); + paddle::framework::TensorCopy( + beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); + paddle::framework::TensorCopy( + beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + return; + } + + T beta1_ = beta1.to(); + T beta2_ = beta2.to(); + T epsilon_ = epsilon.to(); + + VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel(); + VLOG(3) << "beta2_pow.numel() : " << beta2_pow.numel(); + VLOG(3) << "param.numel(): " << param.numel(); + + PADDLE_ENFORCE_EQ( + beta1_pow_out->numel(), + 1, + errors::InvalidArgument("beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ( + beta2_pow_out->numel(), + 1, + errors::InvalidArgument("beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + T beta1_p = beta1_pow.data()[0]; + T beta2_p = beta2_pow.data()[0]; + + if (!use_global_beta_pow) { + dev_ctx.template Alloc(beta1_pow_out)[0] = beta1_ * beta1_p; + dev_ctx.template Alloc(beta2_pow_out)[0] = beta2_ * beta2_p; + } + + T* param_out_ptr = dev_ctx.template Alloc(param_out); + T* mom1_out_ptr = dev_ctx.template Alloc(moment1_out); + T* mom2_out_ptr = dev_ctx.template Alloc(moment2_out); + + T learning_rate_ = + learning_rate.data()[0] * (sqrt(1 - beta2_p) / (1 - beta1_p)); + T eps = epsilon_ * sqrt(1 - beta2_p); + + paddle::operators::jit::adam_attr_t attr(beta1_, beta2_); + int64_t numel = param.numel(); + + const T* param_ptr = param.data(); + const T* mom1_ptr = moment1.data(); + const T* 
mom2_ptr = moment2.data(); + const T* grad_ptr = grad.data(); + + auto adam = + paddle::operators::jit::KernelFuncs, + phi::CPUPlace>::Cache() + .At(attr); + + static constexpr int64_t chunk_size = 512; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < numel / chunk_size; ++i) { + const int64_t offset = i * chunk_size; + adam(beta1_, + beta2_, + -learning_rate_, + eps, + chunk_size, + grad_ptr + offset, + mom1_ptr + offset, + mom2_ptr + offset, + param_ptr + offset, + mom1_out_ptr + offset, + mom2_out_ptr + offset, + param_out_ptr + offset); + } + + if (numel % chunk_size != 0) { + const int64_t offset = (numel / chunk_size) * chunk_size; + const int64_t tail_numel = numel % chunk_size; + adam(beta1_, + beta2_, + -learning_rate_, + eps, + tail_numel, + grad_ptr + offset, + mom1_ptr + offset, + mom2_ptr + offset, + param_ptr + offset, + mom1_out_ptr + offset, + mom2_out_ptr + offset, + param_out_ptr + offset); + } +} + +template +void AdamDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + bool skip_update_ = false; + if (skip_update.is_initialized()) { + PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update_) { + VLOG(4) << "Adam skip update"; + paddle::framework::TensorCopy( + param, dev_ctx.GetPlace(), dev_ctx, param_out); + paddle::framework::TensorCopy( + moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); + paddle::framework::TensorCopy( + moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); + paddle::framework::TensorCopy( + beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); + paddle::framework::TensorCopy( + beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + return; + } + + T beta1_ = beta1.to(); + T beta2_ = beta2.to(); + T epsilon_ = epsilon.to(); + + VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel(); + VLOG(3) << "beta2_pow.numel() : " << beta2_pow.numel(); + VLOG(3) << "param.numel(): " << param.numel(); + + PADDLE_ENFORCE_EQ( + beta1_pow_out->numel(), + 1, + errors::InvalidArgument("beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ( + beta2_pow_out->numel(), + 1, + errors::InvalidArgument("beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + if (grad.rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } + + std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); + bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + 
if (cpu_rows[i - 1] >= cpu_rows[i]) {
+      is_strict_sorted = false;
+      break;
+    }
+  }
+
+  phi::SelectedRows tmp_grad_merge;
+  const phi::SelectedRows* grad_merge_ptr;
+  if (is_strict_sorted) {
+    grad_merge_ptr = &grad;
+  } else {
+    // merge duplicated rows if any.
+    // The rows of grad_merge have been sorted inside MergeAdd functor
+    paddle::operators::math::scatter::MergeAdd merge_func;
+    merge_func(dev_ctx, grad, &tmp_grad_merge, true);
+    grad_merge_ptr = &tmp_grad_merge;
+  }
+
+  auto& grad_merge = *grad_merge_ptr;
+  auto& grad_tensor = grad_merge.value();
+  const T* grad_data = grad_tensor.template data();
+  auto* grad_merge_rows = &grad_merge.rows();
+  paddle::framework::MixVector mixv_grad_merge_rows(grad_merge_rows);
+  const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
+  auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
+
+  funcs::SparseAdamFunctor functor(
+      beta1_,
+      beta2_,
+      epsilon_,
+      beta1_pow.data(),
+      beta2_pow.data(),
+      moment1.data(),
+      dev_ctx.template Alloc(moment1_out),
+      moment2.data(),
+      dev_ctx.template Alloc(moment2_out),
+      learning_rate.data(),
+      grad_data,
+      param.data(),
+      dev_ctx.template Alloc(param_out),
+      rows,
+      row_numel,
+      grad_merge.rows().size(),
+      lazy_mode);
+  // update beta1 and beta2
+  if (!use_global_beta_pow) {
+    dev_ctx.template Alloc(beta1_pow_out)[0] =
+        beta1_ * beta1_pow.data()[0];
+    dev_ctx.template Alloc(beta2_pow_out)[0] =
+        beta2_ * beta2_pow.data()[0];
+  }
+  if (lazy_mode) {
+    VLOG(3) << "run cpu lazy mode";
+    size_t row_count = grad_merge.rows().size();
+    std::vector cpu_rows(grad_merge.rows());
+    for (size_t row_index = 0; row_index < row_count; ++row_index) {
+      for (size_t offset = 0; offset < row_numel; ++offset) {
+        size_t i = cpu_rows[row_index] * row_numel + offset;
+        functor.adam_update(i, grad_data[row_index * row_numel + offset]);
+      }
+    }
+  }
+#ifndef _WIN32
+  else if (FLAGS_inner_op_parallelism > 1 &&  // NOLINT
+           min_row_size_to_use_multithread > 0 &&
+           param.dims()[0] > min_row_size_to_use_multithread) {
+    VLOG(3) << "use multi thread, inner_op_parallelism="
+            << FLAGS_inner_op_parallelism
+            << " min_row_size_to_use_multithread="
+            << min_row_size_to_use_multithread;
+    if (FLAGS_inner_op_parallelism > 10) {
+      VLOG(1) << "FLAGS_inner_op_parallelism " << FLAGS_inner_op_parallelism
+              << " is too large!";
+    }
+    auto& grad_rows = grad_merge.rows();
+    std::unordered_map row_id_to_grad_row_offset;
+    size_t param_row_count = param.numel() / row_numel;
+    if (param_row_count < 1000) {
+      VLOG(1) << "param_row_count should be larger than 1000 to use "
+                 "multi thread, currently "
+              << param_row_count;
+    }
+    for (size_t i = 0; i < grad_rows.size(); ++i) {
+      row_id_to_grad_row_offset[grad_rows[i]] = i;
+    }
+    std::vector> fs;
+    int64_t line_in_each_thread =
+        param_row_count / FLAGS_inner_op_parallelism + 1;
+    for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
+      int64_t start = i * line_in_each_thread;
+      int64_t end = (i + 1) * line_in_each_thread;
+      if (start >= static_cast(param_row_count)) {
+        break;
+      }
+      if (end > static_cast(param_row_count)) {
+        end = static_cast(param_row_count);
+      }
+      fs.push_back(paddle::framework::Async([&functor,
+                                             &row_id_to_grad_row_offset,
+                                             &grad_data,
+                                             row_numel,
+                                             start,
+                                             end]() {
+        for (int64_t row_id = start; row_id < end; ++row_id) {
+          auto iter = row_id_to_grad_row_offset.find(row_id);
+          if (iter != row_id_to_grad_row_offset.end()) {
+            for (size_t row_offset = 0U; row_offset < row_numel;
+                 ++row_offset) {
+              functor.adam_update(
+                  row_id * row_numel + row_offset,
+
grad_data[iter->second * row_numel + row_offset]); + } + } else { + for (size_t row_offset = 0U; row_offset < row_numel; ++row_offset) { + functor.adam_update(row_id * row_numel + row_offset, 0); + } + } + } + })); + } + for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); + } +#endif // !_WIN32 + else { // NOLINT + functor(param.numel()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(adam, CPU, ALL_LAYOUT, phi::AdamDenseKernel, float, double) { +} + +PD_REGISTER_KERNEL(adam_dense_param_sparse_grad, + CPU, + ALL_LAYOUT, + phi::AdamDenseParamSparseGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/adamw_kernel.cc b/paddle/phi/kernels/cpu/adamw_kernel.cc new file mode 100644 index 0000000000000..6c1d3fdc2d1ca --- /dev/null +++ b/paddle/phi/kernels/cpu/adamw_kernel.cc @@ -0,0 +1,248 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/phi/kernels/adam_kernel.h" +#include "paddle/phi/kernels/adamw_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/adam_functors.h" + +namespace phi { + +template +void AdamwDenseKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + bool skip_update_ = false; + if (skip_update.is_initialized()) { + PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + VLOG(3) << "Skip update" << skip_update_; + + if (skip_update_ || !with_decay) { + AdamDenseKernel(dev_ctx, + param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); + return; + } + + auto* param_ = + master_param.is_initialized() ? 
master_param.get_ptr() : ¶m; + T coeff_ = static_cast(coeff); + T lr_ratio_ = static_cast(lr_ratio); + + funcs::AdamWFunctor functor( + coeff_, + lr_ratio_, + learning_rate.data(), + const_cast(param_->data())); + functor(param_->numel()); + + AdamDenseKernel(dev_ctx, + param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); +} + +template +void AdamwDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + bool skip_update_ = false; + if (skip_update.is_initialized()) { + PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + VLOG(3) << "Skip update" << skip_update_; + + if (skip_update_ || !with_decay) { + AdamDenseParamSparseGradKernel(dev_ctx, + param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); + return; + } + + auto* param_ = + master_param.is_initialized() ? master_param.get_ptr() : ¶m; + T coeff_ = static_cast(coeff); + T lr_ratio_ = static_cast(lr_ratio); + funcs::AdamWFunctor functor( + coeff_, + lr_ratio_, + learning_rate.data(), + const_cast(param_->data())); + functor(param_->numel()); + + AdamDenseParamSparseGradKernel(dev_ctx, + param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + adamw, CPU, ALL_LAYOUT, phi::AdamwDenseKernel, float, double) {} + +PD_REGISTER_KERNEL(adamw_dense_param_sparse_grad, + CPU, + ALL_LAYOUT, + phi::AdamwDenseParamSparseGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/funcs/adam_functors.h b/paddle/phi/kernels/funcs/adam_functors.h new file mode 100644 index 0000000000000..2f706f0ef1c36 --- /dev/null +++ b/paddle/phi/kernels/funcs/adam_functors.h @@ -0,0 +1,548 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // for sqrt in CPU and CUDA +#include + +#include "paddle/phi/kernels/funcs/algorithm.h" + +namespace phi { +namespace funcs { + +struct GPUAdam; +struct CPUAdam; + +template +class AdamFunctor; + +template +class AdamFunctor { + private: + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + public: + AdamFunctor(T beta1, + T beta2, + T epsilon, + const T* beta1_pow, + const T* beta2_pow, + const T* mom1, + T* mom1_out, + const T* mom2, + T* mom2_out, + const T* lr, + const T* grad, + const T* param, + T* param_out) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out) {} + + inline HOSTDEVICE void operator()(size_t i) const { + // Merge all memory access together. + T g = grad_[i]; + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[i]; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow))); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = p; + } +}; + +template +class AdamFunctor { + private: + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + public: + AdamFunctor(T beta1, + T beta2, + T epsilon, + const T* beta1_pow, + const T* beta2_pow, + const T* mom1, + T* mom1_out, + const T* mom2, + T* mom2_out, + const T* lr, + const T* grad, + const T* param, + T* param_out) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out) {} + + void operator()(size_t numel) const { + Eigen::Map> g{ + grad_, static_cast(numel)}; + Eigen::Map> mom1{ + moment1_, static_cast(numel)}; + Eigen::Map> mom2{ + moment2_, static_cast(numel)}; + Eigen::Map> param{ + param_, static_cast(numel)}; + + Eigen::Map> param_out{ + param_out_, static_cast(numel)}; + Eigen::Map> moment1_out{ + moment1_out_, static_cast(numel)}; + Eigen::Map> moment2_out{ + moment2_out_, static_cast(numel)}; + + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + + moment1_out = beta1_ * mom1 + (1 - 
beta1_) * g; + moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g; + param_out = param - + lr * (moment1_out / + (moment2_out.sqrt() + epsilon_ * sqrt(1 - beta2_pow))); + } +}; + +template +class SparseAdamFunctor; + +template +class SparseAdamFunctor { + private: + MT beta1_; + MT beta2_; + MT epsilon_; + + const MT* beta1_pow_; + const MT* beta2_pow_; + const MT* moment1_; + MT* moment1_out_; + const MT* moment2_; + MT* moment2_out_; + const MT* lr_; + const T* grad_; + const T* param_; + T* param_out_; + const MT* master_param_; + MT* master_param_out_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t row_count_; + bool lazy_mode_; + + public: + SparseAdamFunctor(MT beta1, + MT beta2, + MT epsilon, + const MT* beta1_pow, + const MT* beta2_pow, + const MT* mom1, + MT* mom1_out, + const MT* mom2, + MT* mom2_out, + const MT* lr, + const T* grad, + const T* param, + T* param_out, + const MT* master_param, + MT* master_param_out, + const int64_t* rows, + int64_t row_numel, + int64_t row_count, + bool lazy_mode) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + master_param_(master_param), + master_param_out_(master_param_out), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count), + lazy_mode_(lazy_mode) {} + + inline HOSTDEVICE void adam_update(size_t i, MT g) const { + // The following code is the same as dense + MT mom1 = moment1_[i]; + MT mom2 = moment2_[i]; + MT lr = *lr_; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + MT p = master_param_ ? master_param_[i] : static_cast(param_[i]); + + // Calculation + lr *= sqrt(static_cast(1.0) - beta2_pow) / + (static_cast(1.0) - beta1_pow); + + mom1 = beta1_ * mom1 + (static_cast(1.0) - beta1_) * g; + mom2 = beta2_ * mom2 + (static_cast(1.0) - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + + epsilon_ * sqrt(static_cast(1.0) - beta2_pow))); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = static_cast(p); + if (master_param_out_) { + master_param_out_[i] = p; + } + } + + inline HOSTDEVICE void operator()(size_t i) const { + auto row_idx = + phi::funcs::BinarySearch(rows_, row_count_, i / row_numel_); + if (lazy_mode_ && row_idx < 0) { + return; + } else { + MT g = row_idx >= 0 + ? 
static_cast(grad_[row_idx * row_numel_ + i % row_numel_]) + : static_cast(0); + adam_update(i, g); + } + } +}; + +template +class SparseAdamFunctor { + private: + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t row_count_; + + public: + SparseAdamFunctor(T beta1, + T beta2, + T epsilon, + const T* beta1_pow, + const T* beta2_pow, + const T* mom1, + T* mom1_out, + const T* mom2, + T* mom2_out, + const T* lr, + const T* grad, + const T* param, + T* param_out, + const int64_t* rows, + int64_t row_numel, + int64_t row_count, + bool lazy_mode) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count) {} + + inline HOSTDEVICE void adam_update(size_t i, T g) const { + // The following code is the same as dense + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[i]; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow))); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = p; + } + + inline void operator()(size_t numel) const { + // lr could be reuse + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + int64_t row_count = static_cast(numel / row_numel_); + + for (int64_t i = 0, j = 0; i != row_count; ++i) { + if (i == *(rows_ + j)) { + for (int64_t k = 0; k != row_numel_; ++k) { + T g = grad_[j * row_numel_ + k]; + adam_update(i * row_numel_ + k, g); + } + ++j; + } else { + for (int64_t k = 0; k != row_numel_; ++k) { + T mom1 = moment1_[i * row_numel_ + k]; + T mom2 = moment2_[i * row_numel_ + k]; + T p = param_[i * row_numel_ + k]; + + mom1 = beta1_ * mom1; + mom2 = beta2_ * mom2; + + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + // Write back to global memory + moment1_out_[i * row_numel_ + k] = mom1; + moment2_out_[i * row_numel_ + k] = mom2; + param_out_[i * row_numel_ + k] = p; + } + } + } + } +}; + +struct GPUAdamW; +struct CPUAdamW; + +template +class AdamWFunctor; + +template +class AdamWFunctor { + private: + const T coeff_; + const T lr_ratio_; + const T* lr_; + T* param_; + + public: + AdamWFunctor(const T coeff, const T lr_ratio, const T* lr, T* param) + : coeff_(coeff), lr_ratio_(lr_ratio), lr_(lr), param_(param) {} + + inline HOSTDEVICE void operator()(size_t numel) const { + Eigen::Map> param{ + param_, static_cast(numel)}; + + T lr = *lr_; + + // Calculation + param -= lr * lr_ratio_ * coeff_ * param; + } +}; + +template +class SparseAdamWFunctor; + +template +class SparseAdamWFunctor { + private: + MT beta1_; + MT beta2_; + MT epsilon_; + MT coeff_; + MT lr_ratio_; + + const MT* beta1_pow_; + const MT* beta2_pow_; + const MT* moment1_; + MT* moment1_out_; + const MT* moment2_; + MT* moment2_out_; + const MT* lr_; + const T* grad_; + const T* param_; + T* param_out_; + const MT* master_param_; + MT* 
master_param_out_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t row_count_; + bool lazy_mode_; + + public: + SparseAdamWFunctor(MT beta1, + MT beta2, + MT epsilon, + MT coeff, + MT lr_ratio, + const MT* beta1_pow, + const MT* beta2_pow, + const MT* mom1, + MT* mom1_out, + const MT* mom2, + MT* mom2_out, + const MT* lr, + const T* grad, + const T* param, + T* param_out, + const MT* master_param, + MT* master_param_out, + const int64_t* rows, + int64_t row_numel, + int64_t row_count, + bool lazy_mode) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + coeff_(coeff), + lr_ratio_(lr_ratio), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + master_param_(master_param), + master_param_out_(master_param_out), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count), + lazy_mode_(lazy_mode) {} + + inline HOSTDEVICE void adamw_update(size_t i, MT g) const { + // The following code is the same as dense + MT mom1 = moment1_[i]; + MT mom2 = moment2_[i]; + MT lr = *lr_ * lr_ratio_; + MT lr_orig = lr; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + MT p = master_param_ ? master_param_[i] : static_cast(param_[i]); + + // Calculation + lr *= sqrt(static_cast(1.0) - beta2_pow) / + (static_cast(1.0) - beta1_pow); + + mom1 = beta1_ * mom1 + (static_cast(1.0) - beta1_) * g; + mom2 = beta2_ * mom2 + (static_cast(1.0) - beta2_) * g * g; + p -= lr_orig * coeff_ * p; + p -= lr * (mom1 / (sqrt(mom2) + + epsilon_ * sqrt(static_cast(1.0) - beta2_pow))); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = static_cast(p); + if (master_param_out_) { + master_param_out_[i] = p; + } + } + + inline HOSTDEVICE void operator()(size_t i) const { + auto row_idx = + phi::funcs::BinarySearch(rows_, row_count_, i / row_numel_); + if (lazy_mode_ && row_idx < 0) { + return; + } else { + MT g = row_idx >= 0 + ? static_cast(grad_[row_idx * row_numel_ + i % row_numel_]) + : static_cast(0); + adamw_update(i, g); + } + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu new file mode 100644 index 0000000000000..f051c1d642a28 --- /dev/null +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -0,0 +1,532 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
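//
// NOTE: the kernels in this file fold the bias corrections into the
// denominator instead of pre-scaling the learning rate:
//
//   denom = sqrt(mom2) / sqrt(1 - beta2_pow) + epsilon
//   p    += (mom1 / denom) * (-(lr / (1 - beta1_pow)))
//
// which is algebraically equivalent to the form used by AdamFunctor in
// funcs/adam_functors.h:
//
//   lr' = lr * sqrt(1 - beta2_pow) / (1 - beta1_pow)
//   p  -= lr' * mom1 / (sqrt(mom2) + epsilon * sqrt(1 - beta2_pow))
//
// For reference, a minimal scalar sketch of one Adam step under these
// definitions; beta1_pow/beta2_pow hold beta1^t and beta2^t and are advanced
// elsewhere (UpdateBetaPow below). AdamStep and its argument names are
// illustrative only, not part of this codebase:
//
//   #include <cmath>
//   double AdamStep(double p, double g, double& m1, double& m2, double lr,
//                   double b1, double b2, double b1_pow, double b2_pow,
//                   double eps) {
//     m1 = b1 * m1 + (1 - b1) * g;        // first-moment EMA
//     m2 = b2 * m2 + (1 - b2) * g * g;    // second-moment EMA
//     double denom = std::sqrt(m2) / std::sqrt(1 - b2_pow) + eps;
//     return p - (lr / (1 - b1_pow)) * (m1 / denom);
//   }
//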
+ +#include // for sqrt in CPU and CUDA +#include +#include +#include +#include + +#include "paddle/phi/kernels/adam_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/adam_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { + +template +__global__ void AdamKernelREG(MT beta1, + MT beta2, + MT epsilon, + MT beta1_pow_, + MT beta2_pow_, + const MT* moment1, + MT* moment1_out, + const MT* moment2, + MT* moment2_out, + const MT* lr_, + const T* grad, + const T* param, + T* param_out, + const MT* master_param, + MT* master_param_out, + int ndim) { + MT lr = *lr_; + MT beta1_pow = beta1_pow_; + MT beta2_pow = beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} + +template +__global__ void AdamKernelMEM(MT beta1, + MT beta2, + MT epsilon, + const MT* beta1_pow_, + const MT* beta2_pow_, + const MT* moment1, + MT* moment1_out, + const MT* moment2, + MT* moment2_out, + const MT* lr_, + const T* grad, + const T* param, + T* param_out, + const MT* master_param, + MT* master_param_out, + int ndim) { + MT lr = *lr_; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? 
master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} + +template +__global__ void UpdateBetaPow(T beta1, + T beta2, + const T* beta1_pow_, + const T* beta2_pow_, + T* beta1_pow_out, + T* beta2_pow_out) { + *beta1_pow_out = beta1 * beta1_pow_[0]; + *beta2_pow_out = beta2 * beta2_pow_[0]; +} + +template +__global__ void SparseAdamCUDAKernelREG(MT beta1, + MT beta2, + MT epsilon, + const MT beta1_pow, + const MT beta2_pow, + const MT* mom1_, + MT* mom1_out_, + const MT* mom2_, + MT* mom2_out_, + const MT* lr_, + const T* grad_, + const T* param_, + T* param_out_, + const MT* master_param, + MT* master_param_out, + const int64_t* rows_, + int64_t row_numel, + int64_t row_count, + bool lazy_mode, + int ndim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + MT lr = *lr_; + + for (; id < ndim; id += blockDim.x * gridDim.x) { + auto row_idx = + phi::funcs::BinarySearch(rows_, row_count, id / row_numel); + if (lazy_mode && row_idx < 0) { + return; + } else { + MT mom1 = mom1_[id]; + MT mom2 = mom2_[id]; + MT p = master_param ? master_param[id] : static_cast(param_[id]); + MT g = row_idx >= 0 + ? static_cast(grad_[row_idx * row_numel + id % row_numel]) + : static_cast(0); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = + (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + // Write back to global memory + mom1_out_[id] = mom1; + mom2_out_[id] = mom2; + param_out_[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } + } +} + +template +void AdamDenseKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + using MPDType = typename paddle::operators::details::MPTypeTrait::Type; + + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + bool skip_update_ = false; + if (skip_update.is_initialized()) { + PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update_) { + VLOG(4) << "Adam skip update"; + paddle::framework::TensorCopy( + param, 
dev_ctx.GetPlace(), dev_ctx, param_out); + paddle::framework::TensorCopy( + moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); + paddle::framework::TensorCopy( + moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); + paddle::framework::TensorCopy( + beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); + paddle::framework::TensorCopy( + beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + return; + } + + MPDType beta1_ = beta1.to(); + MPDType beta2_ = beta2.to(); + MPDType epsilon_ = epsilon.to(); + VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel() + << "beta2_pow.numel() : " << beta2_pow.numel(); + VLOG(3) << "param.numel(): " << param.numel(); + PADDLE_ENFORCE_EQ( + beta1_pow_out->numel(), + 1, + errors::InvalidArgument("beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ( + beta2_pow_out->numel(), + 1, + errors::InvalidArgument("beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision ? dev_ctx.template Alloc(master_param_outs) + : nullptr; + + // update param and moment + int threads = 512; + int blocks = (param.numel() + threads - 1) / threads; + + if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) { + // Compute with betapow in REG + AdamKernelREG<<>>( + beta1_, + beta2_, + epsilon_, + *beta1_pow.data(), + *beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad.data(), + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + param.numel()); + if (!use_global_beta_pow) { + // Cpu update + dev_ctx.template HostAlloc(beta1_pow_out)[0] = + beta1_ * beta1_pow.data()[0]; + dev_ctx.template HostAlloc(beta2_pow_out)[0] = + beta2_ * beta2_pow.data()[0]; + } + } else { + AdamKernelMEM<<>>( + beta1_, + beta2_, + epsilon_, + beta1_pow.data(), + beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad.data(), + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + param.numel()); + if (!use_global_beta_pow) { + // Update with gpu + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1_, + beta2_, + beta1_pow.data(), + beta2_pow.data(), + dev_ctx.template Alloc(beta1_pow_out), + dev_ctx.template Alloc(beta2_pow_out)); + } + } +} + +template +void AdamDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + using MPDType = typename paddle::operators::details::MPTypeTrait::Type; + + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + bool skip_update_ = false; + if (skip_update.is_initialized()) { + 
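    // NOTE: SkipUpdate is expected to be a one-element bool tensor; it is
    // validated here and copied to the host so that the early-return branch
    // below is decided on the CPU before any device kernel is launched.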
PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update_) { + VLOG(4) << "Adam skip update"; + paddle::framework::TensorCopy( + param, dev_ctx.GetPlace(), dev_ctx, param_out); + paddle::framework::TensorCopy( + moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); + paddle::framework::TensorCopy( + moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); + paddle::framework::TensorCopy( + beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); + paddle::framework::TensorCopy( + beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + return; + } + + MPDType beta1_ = beta1.to(); + MPDType beta2_ = beta2.to(); + MPDType epsilon_ = epsilon.to(); + VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel() + << "beta2_pow.numel() : " << beta2_pow.numel(); + VLOG(3) << "param.numel(): " << param.numel(); + PADDLE_ENFORCE_EQ( + beta1_pow_out->numel(), + 1, + errors::InvalidArgument("beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ( + beta2_pow_out->numel(), + 1, + errors::InvalidArgument("beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision ? dev_ctx.template Alloc(master_param_outs) + : nullptr; + + if (grad.rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } + + std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); + bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + if (cpu_rows[i - 1] >= cpu_rows[i]) { + is_strict_sorted = false; + break; + } + } + + phi::SelectedRows tmp_grad_merge; + const phi::SelectedRows* grad_merge_ptr; + if (is_strict_sorted) { + grad_merge_ptr = &grad; + } else { + // merge duplicated rows if any. 
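      // NOTE: the strict-sorted scan above lets the kernels consume the
      // gradient rows directly when they are already unique and ascending;
      // the BinarySearch used per element requires a sorted, duplicate-free
      // row list, so anything else must be merged first.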
+ // The rows of grad_merge have been sorted inside MergeAdd functor + paddle::operators::math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, grad, &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; + } + auto& grad_merge = *grad_merge_ptr; + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows(grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace()); + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + + if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) { + int threads = 512; + int ndim = param.numel(); + int blocks = (ndim + threads - 1) / threads; + + SparseAdamCUDAKernelREG<<>>( + beta1_, + beta2_, + epsilon_, + *beta1_pow.data(), + *beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad_data, + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + rows, + row_numel, + grad_merge.rows().size(), + lazy_mode, + ndim); + if (!use_global_beta_pow) { + // Update with cpu + dev_ctx.template HostAlloc(beta1_pow_out)[0] = + beta1_ * beta1_pow.data()[0]; + dev_ctx.template HostAlloc(beta2_pow_out)[0] = + beta2_ * beta2_pow.data()[0]; + } + } else { + funcs::SparseAdamFunctor functor( + beta1_, + beta2_, + epsilon_, + beta1_pow.data(), + beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad_data, + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + rows, + row_numel, + grad_merge.rows().size(), + lazy_mode); + + // FIXME(minqiyang): remove BinarySearch in GPU later + funcs::ForRange for_range(dev_ctx, param.numel()); + for_range(functor); + if (!use_global_beta_pow) { + // update beta1 and beta2 + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1_, + beta2_, + beta1_pow.data(), + beta2_pow.data(), + dev_ctx.template Alloc(beta1_pow_out), + dev_ctx.template Alloc(beta2_pow_out)); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(adam, + GPU, + ALL_LAYOUT, + phi::AdamDenseKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(adam_dense_param_sparse_grad, + GPU, + ALL_LAYOUT, + phi::AdamDenseParamSparseGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu new file mode 100644 index 0000000000000..fc68d59c90faa --- /dev/null +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -0,0 +1,581 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
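//
// NOTE: AdamW differs from the Adam kernels only by the decoupled weight
// decay applied directly to the parameter before the moment update,
// p *= (1 - lr * coeff) with lr = *lr_ * lr_ratio; when with_decay is false
// the callers set coeff to zero, which reduces AdamW to plain Adam. A minimal
// scalar sketch (AdamWStep and its argument names are illustrative only, not
// part of this codebase):
//
//   #include <cmath>
//   double AdamWStep(double p, double g, double& m1, double& m2, double lr,
//                    double coeff, double b1, double b2, double b1_pow,
//                    double b2_pow, double eps) {
//     p *= 1.0 - lr * coeff;              // decoupled weight decay
//     m1 = b1 * m1 + (1 - b1) * g;
//     m2 = b2 * m2 + (1 - b2) * g * g;
//     double denom = std::sqrt(m2) / std::sqrt(1 - b2_pow) + eps;
//     return p - (lr / (1 - b1_pow)) * (m1 / denom);
//   }
//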
+ +#include // for sqrt in CPU and CUDA +#include +#include +#include +#include + +#include "paddle/phi/kernels/adamw_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/adam_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +template +__global__ void AdamWKernelREG(MT beta1, + MT beta2, + MT epsilon, + MT coeff, + MT lr_ratio, + MT beta1_pow_, + MT beta2_pow_, + const MT* moment1, + MT* moment1_out, + const MT* moment2, + MT* moment2_out, + const MT* lr_, + const T* grad, + const T* param, + T* param_out, + const MT* master_param, + MT* master_param_out, + int ndim) { + MT lr = *lr_ * lr_ratio; + MT beta1_pow = beta1_pow_; + MT beta2_pow = beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + + p *= (static_cast(1.0) - lr * coeff); + + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} + +template +__global__ void AdamWKernelMEM(MT beta1, + MT beta2, + MT epsilon, + MT coeff, + MT lr_ratio, + const MT* beta1_pow_, + const MT* beta2_pow_, + const MT* moment1, + MT* moment1_out, + const MT* moment2, + MT* moment2_out, + const MT* lr_, + const T* grad, + const T* param, + T* param_out, + const MT* master_param, + MT* master_param_out, + int ndim) { + MT lr = *lr_ * lr_ratio; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? 
master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + + p *= (static_cast(1.0) - lr * coeff); + + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} +template +__global__ void UpdateAdamWBetaPow(T beta1, + T beta2, + const T* beta1_pow_, + const T* beta2_pow_, + T* beta1_pow_out, + T* beta2_pow_out) { + *beta1_pow_out = beta1 * beta1_pow_[0]; + *beta2_pow_out = beta2 * beta2_pow_[0]; +} + +template +__global__ void SparseAdamWCUDAKernelREG(MT beta1, + MT beta2, + MT epsilon, + MT coeff, + MT lr_ratio, + const MT beta1_pow, + const MT beta2_pow, + const MT* mom1_, + MT* mom1_out_, + const MT* mom2_, + MT* mom2_out_, + const MT* lr_, + const T* grad_, + const T* param_, + T* param_out_, + const MT* master_param, + MT* master_param_out, + const int64_t* rows_, + int64_t row_numel, + int64_t row_count, + bool lazy_mode, + int ndim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + MT lr = *lr_ * lr_ratio; + + for (; id < ndim; id += blockDim.x * gridDim.x) { + auto row_idx = + phi::funcs::BinarySearch(rows_, row_count, id / row_numel); + if (lazy_mode && row_idx < 0) { + return; + } else { + MT mom1 = static_cast(mom1_[id]); + MT mom2 = static_cast(mom2_[id]); + + MT p = master_param ? master_param[id] : static_cast(param_[id]); + MT g = row_idx >= 0 + ? static_cast(grad_[row_idx * row_numel + id % row_numel]) + : static_cast(0); + + p *= (static_cast(1.0) - lr * coeff); + + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = + (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + // Write back to global memory + mom1_out_[id] = mom1; + mom2_out_[id] = mom2; + param_out_[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } + } +} + +template +void AdamwDenseKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + using MPDType = typename paddle::operators::details::MPTypeTrait::Type; + + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + MPDType coeff_ = static_cast(coeff); + MPDType lr_ratio_ = static_cast(lr_ratio); + + bool skip_update_ = false; + if (skip_update.is_initialized()) { + PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + 
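    // NOTE: as in the Adam kernels, TensorToVector below performs a
    // device-to-host copy so that the SkipUpdate flag can gate the early
    // return on the CPU side.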
paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update_) { + VLOG(4) << "Adamw skip update"; + paddle::framework::TensorCopy( + param, dev_ctx.GetPlace(), dev_ctx, param_out); + paddle::framework::TensorCopy( + moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); + paddle::framework::TensorCopy( + moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); + paddle::framework::TensorCopy( + beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); + paddle::framework::TensorCopy( + beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + return; + } + + // if with_decay = false, coeff = 0 + if (!with_decay) { + coeff_ = static_cast(0.0); + } + + MPDType beta1_ = beta1.to(); + MPDType beta2_ = beta2.to(); + MPDType epsilon_ = epsilon.to(); + VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel() + << "beta2_pow.numel() : " << beta2_pow.numel(); + VLOG(3) << "param.numel(): " << param.numel(); + PADDLE_ENFORCE_EQ( + beta1_pow_out->numel(), + 1, + errors::InvalidArgument("beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ( + beta2_pow_out->numel(), + 1, + errors::InvalidArgument("beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision ? dev_ctx.template Alloc(master_param_outs) + : nullptr; + + // update param and moment + int threads = 512; + int blocks = (param.numel() + threads - 1) / threads; + + if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) { + // Compute with betapow in REG + AdamWKernelREG<<>>( + beta1_, + beta2_, + epsilon_, + coeff_, + lr_ratio_, + *beta1_pow.data(), + *beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad.data(), + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + param.numel()); + if (!use_global_beta_pow) { + // Cpu update + dev_ctx.template HostAlloc(beta1_pow_out)[0] = + beta1_ * beta1_pow.data()[0]; + dev_ctx.template HostAlloc(beta2_pow_out)[0] = + beta2_ * beta2_pow.data()[0]; + } + } else { + AdamWKernelMEM<<>>( + beta1_, + beta2_, + epsilon_, + coeff_, + lr_ratio_, + beta1_pow.data(), + beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad.data(), + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + param.numel()); + if (!use_global_beta_pow) { + // Update with gpu + UpdateAdamWBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1_, + beta2_, + beta1_pow.data(), + beta2_pow.data(), + dev_ctx.template Alloc(beta1_pow_out), + dev_ctx.template Alloc(beta2_pow_out)); + } + } +} + +template +void AdamwDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, 
+ bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + using MPDType = typename paddle::operators::details::MPTypeTrait::Type; + + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + MPDType coeff_ = static_cast(coeff); + MPDType lr_ratio_ = static_cast(lr_ratio); + + bool skip_update_ = false; + if (skip_update.is_initialized()) { + PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update_) { + VLOG(4) << "Adamw skip update"; + paddle::framework::TensorCopy( + param, dev_ctx.GetPlace(), dev_ctx, param_out); + paddle::framework::TensorCopy( + moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); + paddle::framework::TensorCopy( + moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); + paddle::framework::TensorCopy( + beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); + paddle::framework::TensorCopy( + beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + return; + } + + // if with_decay = false, coeff = 0 + if (!with_decay) { + coeff_ = static_cast(0.0); + } + + MPDType beta1_ = beta1.to(); + MPDType beta2_ = beta2.to(); + MPDType epsilon_ = epsilon.to(); + VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel() + << "beta2_pow.numel() : " << beta2_pow.numel(); + VLOG(3) << "param.numel(): " << param.numel(); + PADDLE_ENFORCE_EQ( + beta1_pow_out->numel(), + 1, + errors::InvalidArgument("beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ( + beta2_pow_out->numel(), + 1, + errors::InvalidArgument("beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision ? dev_ctx.template Alloc(master_param_outs) + : nullptr; + + if (grad.rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } + + std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); + bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + if (cpu_rows[i - 1] >= cpu_rows[i]) { + is_strict_sorted = false; + break; + } + } + + phi::SelectedRows tmp_grad_merge; + const phi::SelectedRows* grad_merge_ptr; + if (is_strict_sorted) { + grad_merge_ptr = &grad; + } else { + // merge duplicated rows if any. 
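      // NOTE: same reasoning as in the Adam sparse kernel: the
      // BinarySearch-based lookup needs sorted, duplicate-free rows, so an
      // unsorted gradient is merged before launch.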
+ // The rows of grad_merge have been sorted inside MergeAdd functor + paddle::operators::math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, grad, &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; + } + auto& grad_merge = *grad_merge_ptr; + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows(grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace()); + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + + if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) { + int threads = 512; + int ndim = param.numel(); + int blocks = (ndim + threads - 1) / threads; + + SparseAdamWCUDAKernelREG<<>>( + beta1_, + beta2_, + epsilon_, + coeff_, + lr_ratio_, + *beta1_pow.data(), + *beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad_data, + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + rows, + row_numel, + grad_merge.rows().size(), + lazy_mode, + ndim); + if (!use_global_beta_pow) { + // Update with cpu + dev_ctx.template HostAlloc(beta1_pow_out)[0] = + beta1_ * beta1_pow.data()[0]; + dev_ctx.template HostAlloc(beta2_pow_out)[0] = + beta2_ * beta2_pow.data()[0]; + } + } else { + funcs::SparseAdamWFunctor functor( + beta1_, + beta2_, + epsilon_, + coeff_, + lr_ratio_, + beta1_pow.data(), + beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad_data, + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + rows, + row_numel, + grad_merge.rows().size(), + lazy_mode); + + // FIXME(minqiyang): remove BinarySearch in GPU later + funcs::ForRange for_range(dev_ctx, param.numel()); + for_range(functor); + if (!use_global_beta_pow) { + // update beta1 and beta2 + UpdateAdamWBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1_, + beta2_, + beta1_pow.data(), + beta2_pow.data(), + dev_ctx.template Alloc(beta1_pow_out), + dev_ctx.template Alloc(beta2_pow_out)); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(adamw, + GPU, + ALL_LAYOUT, + phi::AdamwDenseKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(adamw_dense_param_sparse_grad, + GPU, + ALL_LAYOUT, + phi::AdamwDenseParamSparseGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/ops/compat/adam_sig.cc b/paddle/phi/ops/compat/adam_sig.cc new file mode 100644 index 0000000000000..0bc31cd28cb6c --- /dev/null +++ b/paddle/phi/ops/compat/adam_sig.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
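//
// NOTE: an argument-mapping function translates the legacy fluid operator's
// named inputs, outputs and attributes into a phi KernelSignature. Two
// conventions below are worth calling out: a tensor input is preferred over
// the corresponding scalar attribute whenever it is fed, e.g.
//
//   attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor"
//                                                       : "beta1");
//
// and the kernel name is picked at runtime from the type of "Grad": the
// "adam_dense_param_sparse_grad" kernel when the gradient is a SelectedRows,
// the dense "adam" kernel otherwise.
//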
+#include + +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +KernelSignature AdamOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::SmallVector in_names = {"Param", + "Grad", + "LearningRate", + "Moment1", + "Moment2", + "Beta1Pow", + "Beta2Pow", + "MasterParam", + "SkipUpdate"}; + paddle::SmallVector out_names = {"ParamOut", + "Moment1Out", + "Moment2Out", + "Beta1PowOut", + "Beta2PowOut", + "MasterParamOut"}; + paddle::SmallVector attr_names; + + attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor" + : "beta1"); + attr_names.emplace_back(ctx.HasInput("Beta2Tensor") ? "Beta2Tensor" + : "beta2"); + attr_names.emplace_back(ctx.HasInput("EpsilonTensor") ? "EpsilonTensor" + : "epsilon"); + attr_names.emplace_back("lazy_mode"); + attr_names.emplace_back("min_row_size_to_use_multithread"); + attr_names.emplace_back("multi_precision"); + attr_names.emplace_back("use_global_beta_pow"); + + if (ctx.IsSelectedRowsInput("Grad")) { + return KernelSignature("adam_dense_param_sparse_grad", + std::move(in_names), + std::move(attr_names), + std::move(out_names)); + } else if (ctx.IsDenseTensorInput("Grad")) { + return KernelSignature("adam", + std::move(in_names), + std::move(attr_names), + std::move(out_names)); + } else { + return KernelSignature("unregistered", {}, {}, {}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(adam, phi::AdamOpArgumentMapping); diff --git a/paddle/phi/ops/compat/adamw_sig.cc b/paddle/phi/ops/compat/adamw_sig.cc new file mode 100644 index 0000000000000..763304bdf3511 --- /dev/null +++ b/paddle/phi/ops/compat/adamw_sig.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include + +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +KernelSignature AdamwOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::SmallVector in_names = {"Param", + "Grad", + "LearningRate", + "Moment1", + "Moment2", + "Beta1Pow", + "Beta2Pow", + "MasterParam", + "SkipUpdate"}; + paddle::SmallVector out_names = {"ParamOut", + "Moment1Out", + "Moment2Out", + "Beta1PowOut", + "Beta2PowOut", + "MasterParamOut"}; + paddle::SmallVector attr_names; + + attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor" + : "beta1"); + attr_names.emplace_back(ctx.HasInput("Beta2Tensor") ? "Beta2Tensor" + : "beta2"); + attr_names.emplace_back(ctx.HasInput("EpsilonTensor") ? 
"EpsilonTensor" + : "epsilon"); + attr_names.emplace_back("lr_ratio"); + attr_names.emplace_back("coeff"); + attr_names.emplace_back("with_decay"); + attr_names.emplace_back("lazy_mode"); + attr_names.emplace_back("min_row_size_to_use_multithread"); + attr_names.emplace_back("multi_precision"); + attr_names.emplace_back("use_global_beta_pow"); + + if (ctx.IsSelectedRowsInput("Grad")) { + return KernelSignature("adamw_dense_param_sparse_grad", + std::move(in_names), + std::move(attr_names), + std::move(out_names)); + } else if (ctx.IsDenseTensorInput("Grad")) { + return KernelSignature("adamw", + std::move(in_names), + std::move(attr_names), + std::move(out_names)); + } else { + return KernelSignature("unregistered", {}, {}, {}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(adamw, phi::AdamwOpArgumentMapping); From 231e6078c7fad11bd7c9e8a890c542d9390fc225 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 9 Mar 2022 07:04:01 +0000 Subject: [PATCH 02/16] fix compile error and unittest ok --- paddle/fluid/framework/operator.cc | 2 -- paddle/fluid/operators/optimizers/merged_adam_op.h | 1 + paddle/fluid/platform/flags.cc | 3 +++ paddle/phi/kernels/CMakeLists.txt | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index eff6d9a9102d2..b045537556d7c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -58,8 +58,6 @@ class DenseTensor; DECLARE_bool(benchmark); DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); -PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, - "number of threads for inner op"); DECLARE_bool(run_kp_kernel); namespace paddle { diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.h b/paddle/fluid/operators/optimizers/merged_adam_op.h index b7aad68e44940..3ad30f54eaa2f 100644 --- a/paddle/fluid/operators/optimizers/merged_adam_op.h +++ b/paddle/fluid/operators/optimizers/merged_adam_op.h @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/phi/kernels/funcs/adam_functors.h" namespace paddle { diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index baf043e860be4..e194187e22d08 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -84,6 +84,9 @@ PADDLE_DEFINE_EXPORTED_bool( "input and output must be half precision) and recurrent neural networks " "(RNNs)."); +PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, + "number of threads for inner op"); + /** * CUDA related FLAG * Name: FLAGS_selected_gpus diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 14b99337151de..4346399fb553b 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -32,7 +32,7 @@ kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) -kernel_library(adam_kernel DEPS ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper gflags glog) +kernel_library(adam_kernel DEPS ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper flags) kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) # 4. 
auto parse and build kernel targets by cmake From 5ad17105a59a99261b832199ecc56027402525ca Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 9 Mar 2022 10:50:40 +0000 Subject: [PATCH 03/16] fix compile error and unittest ok --- paddle/phi/kernels/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 0eed9996b21db..dd3ce1fec486c 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -29,11 +29,12 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # In this case, you need to manually generate them here. set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel maxout_kernel maxout_grad_kernel put_along_axis_kernel put_along_axis_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel) set(MANUAL_BUILD_KERNELS ${MANUAL_BUILD_KERNELS} adam_kernel adamw_kernel) + kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) -kernel_library(adam_kernel DEPS ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper flags) +kernel_library(adam_kernel DEPS flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) From 1f62ee34bafc4e6c3ca8df1a88544d24719aa17d Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 9 Mar 2022 11:47:19 +0000 Subject: [PATCH 04/16] fix undefined reference to fLI::FLAGS --- paddle/phi/kernels/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index bc0ddc68738ca..f1400818c2fc8 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -34,7 +34,7 @@ kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) -kernel_library(adam_kernel DEPS flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) +kernel_library(adam_kernel DEPS gflags glog flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) From 7e012b2ce526c0f200a48a3ce5127879d58e2e81 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 9 Mar 2022 13:25:34 +0000 Subject: [PATCH 05/16] test depend on operator --- paddle/fluid/framework/operator.cc | 2 ++ paddle/fluid/platform/flags.cc | 3 --- paddle/phi/kernels/CMakeLists.txt | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b045537556d7c..d21d191b630fd 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -57,6 +57,8 @@ class 
DenseTensor; DECLARE_bool(benchmark); DECLARE_bool(check_nan_inf); +PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, + "number of threads for inner op"); DECLARE_bool(enable_unused_var_check); DECLARE_bool(run_kp_kernel); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index e194187e22d08..baf043e860be4 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -84,9 +84,6 @@ PADDLE_DEFINE_EXPORTED_bool( "input and output must be half precision) and recurrent neural networks " "(RNNs)."); -PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, - "number of threads for inner op"); - /** * CUDA related FLAG * Name: FLAGS_selected_gpus diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index f1400818c2fc8..2f90b663d59ce 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -34,7 +34,7 @@ kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) -kernel_library(adam_kernel DEPS gflags glog flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) +kernel_library(adam_kernel DEPS gflags glog operator ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) From 70fd083d950a599a139eb7c641b041e6bda837be Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 10 Mar 2022 11:35:27 +0000 Subject: [PATCH 06/16] fix cmake --- paddle/fluid/framework/operator.cc | 2 -- paddle/fluid/platform/flags.cc | 3 +++ paddle/phi/kernels/CMakeLists.txt | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d21d191b630fd..b045537556d7c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -57,8 +57,6 @@ class DenseTensor; DECLARE_bool(benchmark); DECLARE_bool(check_nan_inf); -PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, - "number of threads for inner op"); DECLARE_bool(enable_unused_var_check); DECLARE_bool(run_kp_kernel); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index baf043e860be4..e417b4fd8694b 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -32,6 +32,9 @@ ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() { } // namespace platform } // namespace paddle +PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, + "number of threads for inner op"); + /** * NOTE(paddle-dev): This file is designed to define all public FLAGS. 
*/ diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 2f90b663d59ce..f1400818c2fc8 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -34,7 +34,7 @@ kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) -kernel_library(adam_kernel DEPS gflags glog operator ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) +kernel_library(adam_kernel DEPS gflags glog flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) From 97eaddf557095b9d5672c0bc5acec46ccefc5d07 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 11 Mar 2022 02:30:15 +0000 Subject: [PATCH 07/16] fix xpu compile --- .../operators/optimizers/adam_op_functor.h | 43 +++++++++++++++++++ .../fluid/operators/optimizers/adam_op_xpu.cc | 2 +- .../operators/optimizers/adamw_op_xpu.cc | 1 + 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/optimizers/adam_op_functor.h diff --git a/paddle/fluid/operators/optimizers/adam_op_functor.h b/paddle/fluid/operators/optimizers/adam_op_functor.h new file mode 100644 index 0000000000000..e2c1c9abd6dcb --- /dev/null +++ b/paddle/fluid/operators/optimizers/adam_op_functor.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +namespace scatter = paddle::operators::math::scatter; + +static inline float GetAttrFromTensor(const framework::Tensor* tensor) { + const float* tensor_data = tensor->data(); + framework::Tensor cpu_tensor; + if (platform::is_gpu_place(tensor->place())) { + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); + tensor_data = cpu_tensor.data(); + } + if (platform::is_xpu_place(tensor->place())) { + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); + tensor_data = cpu_tensor.data(); + } + return tensor_data[0]; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 00613f6825ffa..6ea0b2054cdea 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "gflags/gflags.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/operators/optimizers/adam_op_functor.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc index 5f2bcac31fa25..d86d2bd2ffb4a 100644 --- a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/optimizers/adam_op_functor.h" namespace paddle { namespace operators { From cc4020c21ef3b88474aaab08c133bea980869cfa Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 11 Mar 2022 08:40:22 +0000 Subject: [PATCH 08/16] fix infrt --- paddle/phi/infermeta/unary.cc | 1 + paddle/phi/kernels/cpu/adam_kernel.cc | 33 +++++++------------ paddle/phi/kernels/gpu/adam_kernel.cu | 32 +++++++----------- paddle/phi/kernels/gpu/adamw_kernel.cu | 32 +++++++----------- .../phi/kernels/impl/tile_grad_kernel_impl.h | 5 ++- tools/infrt/get_compat_kernel_signature.py | 2 ++ 6 files changed, 39 insertions(+), 66 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index d6d4efad9fae2..8128e1d550b49 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -461,6 +461,7 @@ void TileInferMeta(const MetaTensor& x, if (out_shape[0] == x_dims[0]) { out->share_lod(x); } + out->set_dtype(x.data_type()); } void ReshapeInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/cpu/adam_kernel.cc b/paddle/phi/kernels/cpu/adam_kernel.cc index 2fb2ea743c8e8..532e596b73132 100644 --- a/paddle/phi/kernels/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/cpu/adam_kernel.cc @@ -18,12 +18,12 @@ #include "paddle/phi/kernels/adam_kernel.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/adam_functors.h" DECLARE_int32(inner_op_parallelism); @@ -71,16 +71,12 @@ void AdamDenseKernel(const Context& dev_ctx, // mutable_data if (skip_update_) { VLOG(4) << "Adam skip update"; - paddle::framework::TensorCopy( - param, dev_ctx.GetPlace(), dev_ctx, param_out); - paddle::framework::TensorCopy( - moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); - paddle::framework::TensorCopy( - moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); - paddle::framework::TensorCopy( - beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); - paddle::framework::TensorCopy( - beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); + phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); + phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); + phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); + return; } @@ -216,16 +212,11 @@ void AdamDenseParamSparseGradKernel( // mutable_data if (skip_update_) { VLOG(4) << "Adam skip update"; - paddle::framework::TensorCopy( - param, dev_ctx.GetPlace(), dev_ctx, param_out); - 
paddle::framework::TensorCopy( - moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); - paddle::framework::TensorCopy( - moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); - paddle::framework::TensorCopy( - beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); - paddle::framework::TensorCopy( - beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); + phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); + phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); + phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); return; } diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index f051c1d642a28..01d5e44becdea 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -20,12 +20,12 @@ #include "paddle/phi/kernels/adam_kernel.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/adam_functors.h" #include "paddle/phi/kernels/funcs/for_range.h" @@ -225,16 +225,11 @@ void AdamDenseKernel(const Context& dev_ctx, // mutable_data if (skip_update_) { VLOG(4) << "Adam skip update"; - paddle::framework::TensorCopy( - param, dev_ctx.GetPlace(), dev_ctx, param_out); - paddle::framework::TensorCopy( - moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); - paddle::framework::TensorCopy( - moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); - paddle::framework::TensorCopy( - beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); - paddle::framework::TensorCopy( - beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); + phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); + phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); + phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); return; } @@ -369,16 +364,11 @@ void AdamDenseParamSparseGradKernel( // mutable_data if (skip_update_) { VLOG(4) << "Adam skip update"; - paddle::framework::TensorCopy( - param, dev_ctx.GetPlace(), dev_ctx, param_out); - paddle::framework::TensorCopy( - moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); - paddle::framework::TensorCopy( - moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); - paddle::framework::TensorCopy( - beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); - paddle::framework::TensorCopy( - beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); + phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); + phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); + phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); return; } diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index fc68d59c90faa..b2fbfc4cda9e6 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ 
b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -20,12 +20,12 @@ #include "paddle/phi/kernels/adamw_kernel.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/adam_functors.h" #include "paddle/phi/kernels/funcs/for_range.h" @@ -249,16 +249,11 @@ void AdamwDenseKernel(const Context& dev_ctx, // mutable_data if (skip_update_) { VLOG(4) << "Adamw skip update"; - paddle::framework::TensorCopy( - param, dev_ctx.GetPlace(), dev_ctx, param_out); - paddle::framework::TensorCopy( - moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); - paddle::framework::TensorCopy( - moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); - paddle::framework::TensorCopy( - beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); - paddle::framework::TensorCopy( - beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); + phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); + phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); + phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); return; } @@ -409,16 +404,11 @@ void AdamwDenseParamSparseGradKernel( // mutable_data if (skip_update_) { VLOG(4) << "Adamw skip update"; - paddle::framework::TensorCopy( - param, dev_ctx.GetPlace(), dev_ctx, param_out); - paddle::framework::TensorCopy( - moment1, dev_ctx.GetPlace(), dev_ctx, moment1_out); - paddle::framework::TensorCopy( - moment2, dev_ctx.GetPlace(), dev_ctx, moment2_out); - paddle::framework::TensorCopy( - beta1_pow, dev_ctx.GetPlace(), dev_ctx, beta1_pow_out); - paddle::framework::TensorCopy( - beta2_pow, dev_ctx.GetPlace(), dev_ctx, beta2_pow_out); + phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); + phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); + phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); + phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); return; } diff --git a/paddle/phi/kernels/impl/tile_grad_kernel_impl.h b/paddle/phi/kernels/impl/tile_grad_kernel_impl.h index a2c2720244fe8..b373855eee50a 100644 --- a/paddle/phi/kernels/impl/tile_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/tile_grad_kernel_impl.h @@ -16,7 +16,7 @@ #include #include -#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/tile_grad_kernel.h" @@ -90,8 +90,7 @@ void TileGradKernel(const Context& dev_ctx, if (just_copy) { dev_ctx.template Alloc(x_grad); - paddle::framework::TensorCopy( - out_grad, dev_ctx.GetPlace(), dev_ctx, x_grad); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); // TensorCopy may change the dims of dx x_grad->Resize(x_dims); } else { diff --git a/tools/infrt/get_compat_kernel_signature.py b/tools/infrt/get_compat_kernel_signature.py index 78d59c2aef10b..332eb8dc1ad31 100644 --- a/tools/infrt/get_compat_kernel_signature.py +++ 
b/tools/infrt/get_compat_kernel_signature.py @@ -16,6 +16,8 @@ import re import json +skip_list = ["adam_sig.cc", "adamw_sig.cc"] + def parse_compat_registry(kernel_info): name, inputs_str, attrs_str, outputs_str = kernel_info.split(",{") From 6eed365004761010c94fc60114040cbf02093e79 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 11 Mar 2022 09:32:19 +0000 Subject: [PATCH 09/16] fix amp_type_traits --- paddle/phi/infermeta/unary.cc | 2 +- paddle/phi/kernels/gpu/adam_kernel.cu | 6 +++--- paddle/phi/kernels/gpu/adamw_kernel.cu | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 8128e1d550b49..da37af6a91125 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -461,7 +461,7 @@ void TileInferMeta(const MetaTensor& x, if (out_shape[0] == x_dims[0]) { out->share_lod(x); } - out->set_dtype(x.data_type()); + out->set_dtype(x.dtype()); } void ReshapeInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index 01d5e44becdea..64f9df7288bc3 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -20,9 +20,9 @@ #include "paddle/phi/kernels/adam_kernel.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" @@ -206,7 +206,7 @@ void AdamDenseKernel(const Context& dev_ctx, DenseTensor* beta1_pow_out, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs) { - using MPDType = typename paddle::operators::details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; @@ -345,7 +345,7 @@ void AdamDenseParamSparseGradKernel( DenseTensor* beta1_pow_out, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs) { - using MPDType = typename paddle::operators::details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index b2fbfc4cda9e6..b5a86bb980ba0 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -20,9 +20,9 @@ #include "paddle/phi/kernels/adamw_kernel.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" @@ -226,7 +226,7 @@ void AdamwDenseKernel(const Context& dev_ctx, DenseTensor* beta1_pow_out, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs) { - using MPDType = typename paddle::operators::details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; @@ -381,7 +381,7 @@ void AdamwDenseParamSparseGradKernel( DenseTensor* beta1_pow_out, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs) { - using MPDType = typename paddle::operators::details::MPTypeTrait::Type; + using MPDType = typename 
phi::dtype::MPTypeTrait::Type; VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; From f11a13b96cc95f945e45b9428bfc973a7853987c Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 11 Mar 2022 09:36:25 +0000 Subject: [PATCH 10/16] fix amp_type_traits --- paddle/phi/infermeta/multiary.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 64c2be5f835d7..2a85d8c387046 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -42,6 +42,7 @@ void AdamaxInferMeta(const MetaTensor& param, MetaTensor* param_out, MetaTensor* moment_out, MetaTensor* inf_norm_out); + void AdamInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& learning_rate, From 042324c6a6dd05f3f7cb0c6d3871aeb0b28fe13f Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 14 Mar 2022 05:52:48 +0000 Subject: [PATCH 11/16] modify according reviewer --- paddle/phi/kernels/adam_kernel.h | 26 -- paddle/phi/kernels/adamw_kernel.h | 29 -- paddle/phi/kernels/cpu/adam_kernel.cc | 222 +------------ paddle/phi/kernels/cpu/adamw_kernel.cc | 119 +------ paddle/phi/kernels/gpu/adamw_kernel.cu | 277 +--------------- .../phi/kernels/selected_rows/adam_kernel.cc | 242 ++++++++++++++ .../phi/kernels/selected_rows/adam_kernel.cu | 286 ++++++++++++++++ .../phi/kernels/selected_rows/adam_kernel.h | 51 +++ .../phi/kernels/selected_rows/adamw_kernel.cc | 140 ++++++++ .../phi/kernels/selected_rows/adamw_kernel.cu | 312 ++++++++++++++++++ .../phi/kernels/selected_rows/adamw_kernel.h | 54 +++ 11 files changed, 1095 insertions(+), 663 deletions(-) create mode 100644 paddle/phi/kernels/selected_rows/adam_kernel.cc create mode 100644 paddle/phi/kernels/selected_rows/adam_kernel.cu create mode 100644 paddle/phi/kernels/selected_rows/adam_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/adamw_kernel.cc create mode 100644 paddle/phi/kernels/selected_rows/adamw_kernel.cu create mode 100644 paddle/phi/kernels/selected_rows/adamw_kernel.h diff --git a/paddle/phi/kernels/adam_kernel.h b/paddle/phi/kernels/adam_kernel.h index 1d1f2d81e0c4b..eadd0e6da699d 100644 --- a/paddle/phi/kernels/adam_kernel.h +++ b/paddle/phi/kernels/adam_kernel.h @@ -45,30 +45,4 @@ void AdamDenseKernel(const Context& dev_ctx, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs); -template -void AdamDenseParamSparseGradKernel( - const Context& dev_ctx, - const DenseTensor& param, - const SelectedRows& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs); - } // namespace phi diff --git a/paddle/phi/kernels/adamw_kernel.h b/paddle/phi/kernels/adamw_kernel.h index 088128158965d..cda90102c003d 100644 --- a/paddle/phi/kernels/adamw_kernel.h +++ b/paddle/phi/kernels/adamw_kernel.h @@ -48,33 +48,4 @@ void AdamwDenseKernel(const Context& dev_ctx, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs); -template -void AdamwDenseParamSparseGradKernel( - const Context& dev_ctx, - const DenseTensor& param, - const SelectedRows& 
grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - float lr_ratio, - float coeff, - bool with_decay, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs); - } // namespace phi diff --git a/paddle/phi/kernels/cpu/adam_kernel.cc b/paddle/phi/kernels/cpu/adam_kernel.cc index 532e596b73132..661d30095d2cd 100644 --- a/paddle/phi/kernels/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/cpu/adam_kernel.cc @@ -12,15 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include - #include "paddle/phi/kernels/adam_kernel.h" -#include "paddle/fluid/framework/threadpool.h" +#include + +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" @@ -170,220 +167,7 @@ void AdamDenseKernel(const Context& dev_ctx, } } -template -void AdamDenseParamSparseGradKernel( - const Context& dev_ctx, - const DenseTensor& param, - const SelectedRows& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs) { - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - bool skip_update_ = false; - if (skip_update.is_initialized()) { - PADDLE_ENFORCE_EQ( - skip_update->numel(), - 1, - errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", - skip_update->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); - skip_update_ = skip_update_vec[0]; - } - // skip_update=true, just copy input to output, and TensorCopy will call - // mutable_data - if (skip_update_) { - VLOG(4) << "Adam skip update"; - phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); - phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); - phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); - return; - } - - T beta1_ = beta1.to(); - T beta2_ = beta2.to(); - T epsilon_ = epsilon.to(); - - VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel(); - VLOG(3) << "beta2_pow.numel() : " << beta2_pow.numel(); - VLOG(3) << "param.numel(): " << param.numel(); - - PADDLE_ENFORCE_EQ( - beta1_pow_out->numel(), - 1, - errors::InvalidArgument("beta1 pow output size should be 1, but 
received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ( - beta2_pow_out->numel(), - 1, - errors::InvalidArgument("beta2 pow output size should be 1, but received " - "value is:%d.", - beta2_pow_out->numel())); - - if (grad.rows().size() == 0) { - VLOG(3) << "grad row size is 0!!"; - return; - } - - std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); - bool is_strict_sorted = true; - for (size_t i = 1; i < cpu_rows.size(); ++i) { - if (cpu_rows[i - 1] >= cpu_rows[i]) { - is_strict_sorted = false; - break; - } - } - - phi::SelectedRows tmp_grad_merge; - const phi::SelectedRows* grad_merge_ptr; - if (is_strict_sorted) { - grad_merge_ptr = &grad; - } else { - // merge duplicated rows if any. - // The rows of grad_merge have been sorted inside MergeAdd functor - paddle::operators::math::scatter::MergeAdd merge_func; - merge_func(dev_ctx, grad, &tmp_grad_merge, true); - grad_merge_ptr = &tmp_grad_merge; - } - - auto& grad_merge = *grad_merge_ptr; - auto& grad_tensor = grad_merge.value(); - const T* grad_data = grad_tensor.template data(); - auto* grad_merge_rows = &grad_merge.rows(); - paddle::framework::MixVector mixv_grad_merge_rows(grad_merge_rows); - const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace()); - auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); - - funcs::SparseAdamFunctor functor( - beta1_, - beta2_, - epsilon_, - beta1_pow.data(), - beta2_pow.data(), - moment1.data(), - dev_ctx.template Alloc(moment1_out), - moment2.data(), - dev_ctx.template Alloc(moment2_out), - learning_rate.data(), - grad_data, - param.data(), - dev_ctx.template Alloc(param_out), - rows, - row_numel, - grad_merge.rows().size(), - lazy_mode); - // update beta1 and beta2 - if (!use_global_beta_pow) { - dev_ctx.template Alloc(beta1_pow_out)[0] = - beta1_ * beta1_pow.data()[0]; - dev_ctx.template Alloc(beta2_pow_out)[0] = - beta2_ * beta2_pow.data()[0]; - } - if (lazy_mode) { - VLOG(3) << "run cpu lazy mode"; - size_t row_count = grad_merge.rows().size(); - std::vector cpu_rows(grad_merge.rows()); - for (size_t row_index = 0; row_index < row_count; ++row_index) { - for (size_t offset = 0; offset < row_numel; ++offset) { - size_t i = cpu_rows[row_index] * row_numel + offset; - functor.adam_update(i, grad_data[row_index * row_numel + offset]); - } - } - } -#ifndef _WIN32 - else if (FLAGS_inner_op_parallelism > 1 && // NOLINT - min_row_size_to_use_multithread > 0 && - param.dims()[0] > min_row_size_to_use_multithread) { - VLOG(3) << "use multi thread, inner_op_parallelism=" - << FLAGS_inner_op_parallelism << " min_row_size_to_use_multithread=" - << min_row_size_to_use_multithread; - if (FLAGS_inner_op_parallelism > 10) { - VLOG(1) << "FLAGS_inner_op_parallelism " << FLAGS_inner_op_parallelism - << " is two large!"; - } - auto& grad_rows = grad_merge.rows(); - std::unordered_map row_id_to_grad_row_offset; - size_t param_row_count = param.numel() / row_numel; - if (param_row_count < 1000) { - VLOG(1) << "param_row_count should be larger then 1000 to use " - "multi thread, currently " - << param_row_count; - } - for (size_t i = 0; i < grad_rows.size(); ++i) { - row_id_to_grad_row_offset[grad_rows[i]] = i; - } - std::vector> fs; - int64_t line_in_each_thread = - param_row_count / FLAGS_inner_op_parallelism + 1; - for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) { - int64_t start = i * line_in_each_thread; - int64_t end = (i + 1) * line_in_each_thread; - if (start >= static_cast(param_row_count)) { - break; - } - if (end > static_cast(param_row_count)) { 
- end = static_cast(param_row_count); - } - fs.push_back(paddle::framework::Async([&functor, - &row_id_to_grad_row_offset, - &grad_data, - row_numel, - start, - end]() { - for (int64_t row_id = start; row_id < end; ++row_id) { - auto iter = row_id_to_grad_row_offset.find(row_id); - if (iter != row_id_to_grad_row_offset.end()) { - for (size_t row_offset = 0U; row_offset < row_numel; ++row_offset) { - functor.adam_update( - row_id * row_numel + row_offset, - grad_data[iter->second * row_numel + row_offset]); - } - } else { - for (size_t row_offset = 0U; row_offset < row_numel; ++row_offset) { - functor.adam_update(row_id * row_numel + row_offset, 0); - } - } - } - })); - } - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } -#endif // !_WIN32 - else { // NOLINT - functor(param.numel()); - } -} - } // namespace phi PD_REGISTER_KERNEL(adam, CPU, ALL_LAYOUT, phi::AdamDenseKernel, float, double) { } - -PD_REGISTER_KERNEL(adam_dense_param_sparse_grad, - CPU, - ALL_LAYOUT, - phi::AdamDenseParamSparseGradKernel, - float, - double) {} diff --git a/paddle/phi/kernels/cpu/adamw_kernel.cc b/paddle/phi/kernels/cpu/adamw_kernel.cc index 6c1d3fdc2d1ca..3a7869a062cf1 100644 --- a/paddle/phi/kernels/cpu/adamw_kernel.cc +++ b/paddle/phi/kernels/cpu/adamw_kernel.cc @@ -12,17 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include - -#include "paddle/phi/kernels/adam_kernel.h" #include "paddle/phi/kernels/adamw_kernel.h" +#include + #include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/adam_kernel.h" #include "paddle/phi/kernels/funcs/adam_functors.h" namespace phi { @@ -131,118 +129,7 @@ void AdamwDenseKernel(const Context& dev_ctx, master_param_outs); } -template -void AdamwDenseParamSparseGradKernel( - const Context& dev_ctx, - const DenseTensor& param, - const SelectedRows& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - float lr_ratio, - float coeff, - bool with_decay, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs) { - bool skip_update_ = false; - if (skip_update.is_initialized()) { - PADDLE_ENFORCE_EQ( - skip_update->numel(), - 1, - errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", - skip_update->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); - skip_update_ = skip_update_vec[0]; - } - VLOG(3) << "Skip update" << skip_update_; - - if (skip_update_ || !with_decay) { - AdamDenseParamSparseGradKernel(dev_ctx, - param, - grad, - learning_rate, - moment1, - moment2, - beta1_pow, - beta2_pow, - master_param, - skip_update, - beta1, - beta2, - epsilon, - lazy_mode, - min_row_size_to_use_multithread, - multi_precision, - use_global_beta_pow, - param_out, - moment1_out, - moment2_out, - beta1_pow_out, - beta2_pow_out, - master_param_outs); - return; - } - - auto* param_ = - 
master_param.is_initialized() ? master_param.get_ptr() : ¶m; - T coeff_ = static_cast(coeff); - T lr_ratio_ = static_cast(lr_ratio); - funcs::AdamWFunctor functor( - coeff_, - lr_ratio_, - learning_rate.data(), - const_cast(param_->data())); - functor(param_->numel()); - - AdamDenseParamSparseGradKernel(dev_ctx, - param, - grad, - learning_rate, - moment1, - moment2, - beta1_pow, - beta2_pow, - master_param, - skip_update, - beta1, - beta2, - epsilon, - lazy_mode, - min_row_size_to_use_multithread, - multi_precision, - use_global_beta_pow, - param_out, - moment1_out, - moment2_out, - beta1_pow_out, - beta2_pow_out, - master_param_outs); -} - } // namespace phi PD_REGISTER_KERNEL( adamw, CPU, ALL_LAYOUT, phi::AdamwDenseKernel, float, double) {} - -PD_REGISTER_KERNEL(adamw_dense_param_sparse_grad, - CPU, - ALL_LAYOUT, - phi::AdamwDenseParamSparseGradKernel, - float, - double) {} diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index b5a86bb980ba0..8fef101383bb0 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -12,14 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/adamw_kernel.h" + #include // for sqrt in CPU and CUDA -#include -#include -#include #include -#include "paddle/phi/kernels/adamw_kernel.h" - +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" @@ -127,6 +125,7 @@ __global__ void AdamWKernelMEM(MT beta1, } } } + template __global__ void UpdateAdamWBetaPow(T beta1, T beta2, @@ -138,67 +137,6 @@ __global__ void UpdateAdamWBetaPow(T beta1, *beta2_pow_out = beta2 * beta2_pow_[0]; } -template -__global__ void SparseAdamWCUDAKernelREG(MT beta1, - MT beta2, - MT epsilon, - MT coeff, - MT lr_ratio, - const MT beta1_pow, - const MT beta2_pow, - const MT* mom1_, - MT* mom1_out_, - const MT* mom2_, - MT* mom2_out_, - const MT* lr_, - const T* grad_, - const T* param_, - T* param_out_, - const MT* master_param, - MT* master_param_out, - const int64_t* rows_, - int64_t row_numel, - int64_t row_count, - bool lazy_mode, - int ndim) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - MT lr = *lr_ * lr_ratio; - - for (; id < ndim; id += blockDim.x * gridDim.x) { - auto row_idx = - phi::funcs::BinarySearch(rows_, row_count, id / row_numel); - if (lazy_mode && row_idx < 0) { - return; - } else { - MT mom1 = static_cast(mom1_[id]); - MT mom2 = static_cast(mom2_[id]); - - MT p = master_param ? master_param[id] : static_cast(param_[id]); - MT g = row_idx >= 0 - ? 
static_cast(grad_[row_idx * row_numel + id % row_numel]) - : static_cast(0); - - p *= (static_cast(1.0) - lr * coeff); - - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = - (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - // Write back to global memory - mom1_out_[id] = mom1; - mom2_out_[id] = mom2; - param_out_[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } - } -} - template void AdamwDenseKernel(const Context& dev_ctx, const DenseTensor& param, @@ -353,205 +291,6 @@ void AdamwDenseKernel(const Context& dev_ctx, } } -template -void AdamwDenseParamSparseGradKernel( - const Context& dev_ctx, - const DenseTensor& param, - const SelectedRows& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - float lr_ratio, - float coeff, - bool with_decay, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs) { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - MPDType coeff_ = static_cast(coeff); - MPDType lr_ratio_ = static_cast(lr_ratio); - - bool skip_update_ = false; - if (skip_update.is_initialized()) { - PADDLE_ENFORCE_EQ( - skip_update->numel(), - 1, - errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", - skip_update->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); - skip_update_ = skip_update_vec[0]; - } - - // skip_update=true, just copy input to output, and TensorCopy will call - // mutable_data - if (skip_update_) { - VLOG(4) << "Adamw skip update"; - phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); - phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); - phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); - return; - } - - // if with_decay = false, coeff = 0 - if (!with_decay) { - coeff_ = static_cast(0.0); - } - - MPDType beta1_ = beta1.to(); - MPDType beta2_ = beta2.to(); - MPDType epsilon_ = epsilon.to(); - VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel() - << "beta2_pow.numel() : " << beta2_pow.numel(); - VLOG(3) << "param.numel(): " << param.numel(); - PADDLE_ENFORCE_EQ( - beta1_pow_out->numel(), - 1, - errors::InvalidArgument("beta1 pow output size should be 1, but received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ( - beta2_pow_out->numel(), - 1, - errors::InvalidArgument("beta2 pow output size should be 1, but received " - "value is:%d.", - beta2_pow_out->numel())); - - const MPDType* master_in_data = - multi_precision ? master_param->data() : nullptr; - MPDType* master_out_data = - multi_precision ? 
dev_ctx.template Alloc(master_param_outs) - : nullptr; - - if (grad.rows().size() == 0) { - VLOG(3) << "grad row size is 0!!"; - return; - } - - std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); - bool is_strict_sorted = true; - for (size_t i = 1; i < cpu_rows.size(); ++i) { - if (cpu_rows[i - 1] >= cpu_rows[i]) { - is_strict_sorted = false; - break; - } - } - - phi::SelectedRows tmp_grad_merge; - const phi::SelectedRows* grad_merge_ptr; - if (is_strict_sorted) { - grad_merge_ptr = &grad; - } else { - // merge duplicated rows if any. - // The rows of grad_merge have been sorted inside MergeAdd functor - paddle::operators::math::scatter::MergeAdd merge_func; - merge_func(dev_ctx, grad, &tmp_grad_merge, true); - grad_merge_ptr = &tmp_grad_merge; - } - auto& grad_merge = *grad_merge_ptr; - auto& grad_tensor = grad_merge.value(); - const T* grad_data = grad_tensor.template data(); - auto* grad_merge_rows = &grad_merge.rows(); - paddle::framework::MixVector mixv_grad_merge_rows(grad_merge_rows); - const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace()); - auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); - - if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) { - int threads = 512; - int ndim = param.numel(); - int blocks = (ndim + threads - 1) / threads; - - SparseAdamWCUDAKernelREG<<>>( - beta1_, - beta2_, - epsilon_, - coeff_, - lr_ratio_, - *beta1_pow.data(), - *beta2_pow.data(), - moment1.data(), - dev_ctx.template Alloc(moment1_out), - moment2.data(), - dev_ctx.template Alloc(moment2_out), - learning_rate.data(), - grad_data, - param.data(), - dev_ctx.template Alloc(param_out), - master_in_data, - master_out_data, - rows, - row_numel, - grad_merge.rows().size(), - lazy_mode, - ndim); - if (!use_global_beta_pow) { - // Update with cpu - dev_ctx.template HostAlloc(beta1_pow_out)[0] = - beta1_ * beta1_pow.data()[0]; - dev_ctx.template HostAlloc(beta2_pow_out)[0] = - beta2_ * beta2_pow.data()[0]; - } - } else { - funcs::SparseAdamWFunctor functor( - beta1_, - beta2_, - epsilon_, - coeff_, - lr_ratio_, - beta1_pow.data(), - beta2_pow.data(), - moment1.data(), - dev_ctx.template Alloc(moment1_out), - moment2.data(), - dev_ctx.template Alloc(moment2_out), - learning_rate.data(), - grad_data, - param.data(), - dev_ctx.template Alloc(param_out), - master_in_data, - master_out_data, - rows, - row_numel, - grad_merge.rows().size(), - lazy_mode); - - // FIXME(minqiyang): remove BinarySearch in GPU later - funcs::ForRange for_range(dev_ctx, param.numel()); - for_range(functor); - if (!use_global_beta_pow) { - // update beta1 and beta2 - UpdateAdamWBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1_, - beta2_, - beta1_pow.data(), - beta2_pow.data(), - dev_ctx.template Alloc(beta1_pow_out), - dev_ctx.template Alloc(beta2_pow_out)); - } - } -} - } // namespace phi PD_REGISTER_KERNEL(adamw, @@ -561,11 +300,3 @@ PD_REGISTER_KERNEL(adamw, float, double, phi::dtype::float16) {} - -PD_REGISTER_KERNEL(adamw_dense_param_sparse_grad, - GPU, - ALL_LAYOUT, - phi::AdamwDenseParamSparseGradKernel, - float, - double, - phi::dtype::float16) {} diff --git a/paddle/phi/kernels/selected_rows/adam_kernel.cc b/paddle/phi/kernels/selected_rows/adam_kernel.cc new file mode 100644 index 0000000000000..57e33beb95e3e --- /dev/null +++ b/paddle/phi/kernels/selected_rows/adam_kernel.cc @@ -0,0 +1,242 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
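
A scalar reference of the update that the selected_rows Adam kernels below implement may help when reading them. This is a self-contained sketch of the standard bias-corrected Adam rule, factored exactly as in these kernels (the correction terms folded into the denominator and the learning rate); it is illustrative code, not part of the patch:

    #include <cmath>
    #include <cstdio>

    struct AdamState {
      double m = 0.0, v = 0.0;                  // moment1 / moment2
      double beta1_pow = 1.0, beta2_pow = 1.0;  // running beta^t products
    };

    double AdamStep(double p, double g, AdamState* s, double lr = 1e-3,
                    double beta1 = 0.9, double beta2 = 0.999,
                    double eps = 1e-8) {
      s->beta1_pow *= beta1;
      s->beta2_pow *= beta2;
      s->m = beta1 * s->m + (1.0 - beta1) * g;
      s->v = beta2 * s->v + (1.0 - beta2) * g * g;
      // Same factoring as the kernels: bias correction folded into denom/lr.
      double denom = std::sqrt(s->v) / std::sqrt(1.0 - s->beta2_pow) + eps;
      return p - (s->m / denom) * (lr / (1.0 - s->beta1_pow));
    }

    int main() {
      AdamState s;
      double p = 1.0;
      for (int t = 0; t < 3; ++t) p = AdamStep(p, 0.5, &s);
      std::printf("p after 3 steps: %.6f\n", p);
      return 0;
    }
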
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/adam_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/adam_functors.h" + +namespace phi { +namespace sr { + +template +void AdamDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + bool skip_update_ = false; + if (skip_update.is_initialized()) { + PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update_) { + VLOG(4) << "Adam skip update"; + phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); + phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); + phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); + phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); + return; + } + + T beta1_ = beta1.to(); + T beta2_ = beta2.to(); + T epsilon_ = epsilon.to(); + + VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel(); + VLOG(3) << "beta2_pow.numel() : " << beta2_pow.numel(); + VLOG(3) << "param.numel(): " << param.numel(); + + PADDLE_ENFORCE_EQ( + beta1_pow_out->numel(), + 1, + errors::InvalidArgument("beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ( + beta2_pow_out->numel(), + 1, + errors::InvalidArgument("beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + if (grad.rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } + + std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); + bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + if (cpu_rows[i - 1] >= 
cpu_rows[i]) {
+ is_strict_sorted = false;
+ break;
+ }
+ }
+
+ phi::SelectedRows tmp_grad_merge;
+ const phi::SelectedRows* grad_merge_ptr;
+ if (is_strict_sorted) {
+ grad_merge_ptr = &grad;
+ } else {
+ // merge duplicated rows if any.
+ // The rows of grad_merge have been sorted inside MergeAdd functor
+ paddle::operators::math::scatter::MergeAdd merge_func;
+ merge_func(dev_ctx, grad, &tmp_grad_merge, true);
+ grad_merge_ptr = &tmp_grad_merge;
+ }
+
+ auto& grad_merge = *grad_merge_ptr;
+ auto& grad_tensor = grad_merge.value();
+ const T* grad_data = grad_tensor.template data();
+ auto* grad_merge_rows = &grad_merge.rows();
+ paddle::framework::MixVector mixv_grad_merge_rows(grad_merge_rows);
+ const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
+ auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
+
+ funcs::SparseAdamFunctor functor(
+ beta1_,
+ beta2_,
+ epsilon_,
+ beta1_pow.data(),
+ beta2_pow.data(),
+ moment1.data(),
+ dev_ctx.template Alloc(moment1_out),
+ moment2.data(),
+ dev_ctx.template Alloc(moment2_out),
+ learning_rate.data(),
+ grad_data,
+ param.data(),
+ dev_ctx.template Alloc(param_out),
+ rows,
+ row_numel,
+ grad_merge.rows().size(),
+ lazy_mode);
+ // update beta1 and beta2
+ if (!use_global_beta_pow) {
+ dev_ctx.template Alloc(beta1_pow_out)[0] =
+ beta1_ * beta1_pow.data()[0];
+ dev_ctx.template Alloc(beta2_pow_out)[0] =
+ beta2_ * beta2_pow.data()[0];
+ }
+ if (lazy_mode) {
+ VLOG(3) << "run cpu lazy mode";
+ size_t row_count = grad_merge.rows().size();
+ std::vector cpu_rows(grad_merge.rows());
+ for (size_t row_index = 0; row_index < row_count; ++row_index) {
+ for (size_t offset = 0; offset < row_numel; ++offset) {
+ size_t i = cpu_rows[row_index] * row_numel + offset;
+ functor.adam_update(i, grad_data[row_index * row_numel + offset]);
+ }
+ }
+ }
+#ifndef _WIN32
+ else if (FLAGS_inner_op_parallelism > 1 && // NOLINT
+ min_row_size_to_use_multithread > 0 &&
+ param.dims()[0] > min_row_size_to_use_multithread) {
+ VLOG(3) << "use multi thread, inner_op_parallelism="
+ << FLAGS_inner_op_parallelism << " min_row_size_to_use_multithread="
+ << min_row_size_to_use_multithread;
+ if (FLAGS_inner_op_parallelism > 10) {
+ VLOG(1) << "FLAGS_inner_op_parallelism " << FLAGS_inner_op_parallelism
+ << " is too large!";
+ }
+ auto& grad_rows = grad_merge.rows();
+ std::unordered_map row_id_to_grad_row_offset;
+ size_t param_row_count = param.numel() / row_numel;
+ if (param_row_count < 1000) {
+ VLOG(1) << "param_row_count should be larger than 1000 to use "
+ "multi thread, currently "
+ << param_row_count;
+ }
+ for (size_t i = 0; i < grad_rows.size(); ++i) {
+ row_id_to_grad_row_offset[grad_rows[i]] = i;
+ }
+ std::vector> fs;
+ int64_t line_in_each_thread =
+ param_row_count / FLAGS_inner_op_parallelism + 1;
+ for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
+ int64_t start = i * line_in_each_thread;
+ int64_t end = (i + 1) * line_in_each_thread;
+ if (start >= static_cast(param_row_count)) {
+ break;
+ }
+ if (end > static_cast(param_row_count)) {
+ end = static_cast(param_row_count);
+ }
+ fs.push_back(paddle::framework::Async([&functor,
+ &row_id_to_grad_row_offset,
+ &grad_data,
+ row_numel,
+ start,
+ end]() {
+ for (int64_t row_id = start; row_id < end; ++row_id) {
+ auto iter = row_id_to_grad_row_offset.find(row_id);
+ if (iter != row_id_to_grad_row_offset.end()) {
+ for (size_t row_offset = 0U; row_offset < row_numel; ++row_offset) {
+ functor.adam_update(
+ row_id * row_numel + row_offset,
+ grad_data[iter->second *
row_numel + row_offset]); + } + } else { + for (size_t row_offset = 0U; row_offset < row_numel; ++row_offset) { + functor.adam_update(row_id * row_numel + row_offset, 0); + } + } + } + })); + } + for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); + } +#endif // !_WIN32 + else { // NOLINT + functor(param.numel()); + } +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_KERNEL(adam_dense_param_sparse_grad, + CPU, + ALL_LAYOUT, + phi::sr::AdamDenseParamSparseGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/selected_rows/adam_kernel.cu b/paddle/phi/kernels/selected_rows/adam_kernel.cu new file mode 100644 index 0000000000000..672399b9a7c5d --- /dev/null +++ b/paddle/phi/kernels/selected_rows/adam_kernel.cu @@ -0,0 +1,286 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/adam_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/adam_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +namespace sr { + +template +__global__ void UpdateBetaPow(T beta1, + T beta2, + const T* beta1_pow_, + const T* beta2_pow_, + T* beta1_pow_out, + T* beta2_pow_out) { + *beta1_pow_out = beta1 * beta1_pow_[0]; + *beta2_pow_out = beta2 * beta2_pow_[0]; +} + +template +__global__ void SparseAdamCUDAKernelREG(MT beta1, + MT beta2, + MT epsilon, + const MT beta1_pow, + const MT beta2_pow, + const MT* mom1_, + MT* mom1_out_, + const MT* mom2_, + MT* mom2_out_, + const MT* lr_, + const T* grad_, + const T* param_, + T* param_out_, + const MT* master_param, + MT* master_param_out, + const int64_t* rows_, + int64_t row_numel, + int64_t row_count, + bool lazy_mode, + int ndim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + MT lr = *lr_; + + for (; id < ndim; id += blockDim.x * gridDim.x) { + auto row_idx = + phi::funcs::BinarySearch(rows_, row_count, id / row_numel); + if (lazy_mode && row_idx < 0) { + return; + } else { + MT mom1 = mom1_[id]; + MT mom2 = mom2_[id]; + MT p = master_param ? master_param[id] : static_cast(param_[id]); + MT g = row_idx >= 0 + ? 
static_cast(grad_[row_idx * row_numel + id % row_numel]) + : static_cast(0); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = + (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + // Write back to global memory + mom1_out_[id] = mom1; + mom2_out_[id] = mom2; + param_out_[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } + } +} + +template +void AdamDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + using MPDType = typename phi::dtype::MPTypeTrait::Type; + + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + bool skip_update_ = false; + if (skip_update.is_initialized()) { + PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update_) { + VLOG(4) << "Adam skip update"; + phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); + phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); + phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); + phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); + return; + } + + MPDType beta1_ = beta1.to(); + MPDType beta2_ = beta2.to(); + MPDType epsilon_ = epsilon.to(); + VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel() + << "beta2_pow.numel() : " << beta2_pow.numel(); + VLOG(3) << "param.numel(): " << param.numel(); + PADDLE_ENFORCE_EQ( + beta1_pow_out->numel(), + 1, + errors::InvalidArgument("beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ( + beta2_pow_out->numel(), + 1, + errors::InvalidArgument("beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision ? dev_ctx.template Alloc(master_param_outs) + : nullptr; + + if (grad.rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } + + std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); + bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + if (cpu_rows[i - 1] >= cpu_rows[i]) { + is_strict_sorted = false; + break; + } + } + + phi::SelectedRows tmp_grad_merge; + const phi::SelectedRows* grad_merge_ptr; + if (is_strict_sorted) { + grad_merge_ptr = &grad; + } else { + // merge duplicated rows if any. 
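
What MergeAdd does at this step, sketched with mock types (illustrative code, not the scatter::MergeAdd implementation): duplicate row ids in the sparse gradient are combined by summing their dense rows, and the result comes back sorted by row id, as the surrounding comments state.

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <vector>

    struct SelectedRowsMock {
      std::vector<int64_t> rows;                // row ids; may repeat, unsorted
      std::vector<std::vector<double>> values;  // one dense row per entry
    };

    SelectedRowsMock MergeAddSketch(const SelectedRowsMock& in) {
      std::map<int64_t, std::vector<double>> acc;  // ordered map -> sorted rows
      for (size_t i = 0; i < in.rows.size(); ++i) {
        auto& dst = acc[in.rows[i]];
        if (dst.empty()) dst.assign(in.values[i].size(), 0.0);
        for (size_t j = 0; j < in.values[i].size(); ++j) dst[j] += in.values[i][j];
      }
      SelectedRowsMock out;
      for (const auto& kv : acc) {
        out.rows.push_back(kv.first);
        out.values.push_back(kv.second);
      }
      return out;
    }

    int main() {
      SelectedRowsMock grad{{3, 1, 3}, {{1, 1}, {2, 2}, {4, 4}}};
      SelectedRowsMock merged = MergeAddSketch(grad);
      // merged.rows == {1, 3}; merged.values == {{2, 2}, {5, 5}}
      std::printf("%zu unique rows\n", merged.rows.size());
      return 0;
    }
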
+ // The rows of grad_merge have been sorted inside MergeAdd functor + paddle::operators::math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, grad, &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; + } + auto& grad_merge = *grad_merge_ptr; + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows(grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace()); + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + + if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) { + int threads = 512; + int ndim = param.numel(); + int blocks = (ndim + threads - 1) / threads; + + SparseAdamCUDAKernelREG<<>>( + beta1_, + beta2_, + epsilon_, + *beta1_pow.data(), + *beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad_data, + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + rows, + row_numel, + grad_merge.rows().size(), + lazy_mode, + ndim); + if (!use_global_beta_pow) { + // Update with cpu + dev_ctx.template HostAlloc(beta1_pow_out)[0] = + beta1_ * beta1_pow.data()[0]; + dev_ctx.template HostAlloc(beta2_pow_out)[0] = + beta2_ * beta2_pow.data()[0]; + } + } else { + funcs::SparseAdamFunctor functor( + beta1_, + beta2_, + epsilon_, + beta1_pow.data(), + beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad_data, + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + rows, + row_numel, + grad_merge.rows().size(), + lazy_mode); + + // FIXME(minqiyang): remove BinarySearch in GPU later + funcs::ForRange for_range(dev_ctx, param.numel()); + for_range(functor); + if (!use_global_beta_pow) { + // update beta1 and beta2 + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1_, + beta2_, + beta1_pow.data(), + beta2_pow.data(), + dev_ctx.template Alloc(beta1_pow_out), + dev_ctx.template Alloc(beta2_pow_out)); + } + } +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_KERNEL(adam_dense_param_sparse_grad, + GPU, + ALL_LAYOUT, + phi::sr::AdamDenseParamSparseGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/selected_rows/adam_kernel.h b/paddle/phi/kernels/selected_rows/adam_kernel.h new file mode 100644 index 0000000000000..2e13d29d17284 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/adam_kernel.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
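
In the CUDA kernels above, each flat parameter index is mapped back to a row of the merged sparse gradient by binary search over the sorted row ids; a negative result means the parameter row has no gradient, which lazy_mode skips outright. A standalone sketch of that lookup (FindRow is my own stand-in for phi::funcs::BinarySearch):

    #include <cassert>
    #include <cstdint>

    // Returns the index of row_id in the sorted array rows, or -1 if absent.
    int64_t FindRow(const int64_t* rows, int64_t row_count, int64_t row_id) {
      int64_t lo = 0, hi = row_count - 1;
      while (lo <= hi) {
        int64_t mid = lo + (hi - lo) / 2;
        if (rows[mid] == row_id) return mid;
        if (rows[mid] < row_id) lo = mid + 1; else hi = mid - 1;
      }
      return -1;
    }

    int main() {
      const int64_t rows[] = {1, 3, 7};  // sorted unique row ids after MergeAdd
      const int64_t row_numel = 4;       // elements per dense row
      int64_t id = 13;                   // flat index into the parameter tensor
      int64_t row_idx = FindRow(rows, 3, id / row_numel);  // 13 / 4 -> row 3
      assert(row_idx == 1);
      // Gradient element: grad[row_idx * row_numel + id % row_numel].
      // lazy_mode: row_idx < 0 means this parameter row has no gradient
      // and its update is skipped entirely.
      return 0;
    }
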
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void AdamDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/adamw_kernel.cc b/paddle/phi/kernels/selected_rows/adamw_kernel.cc new file mode 100644 index 0000000000000..a52bca761108c --- /dev/null +++ b/paddle/phi/kernels/selected_rows/adamw_kernel.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
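
The AdamW kernels that follow differ from Adam only by decoupled weight decay: the parameter is first scaled by (1 - lr * coeff), where lr already includes lr_ratio and coeff is forced to 0 when with_decay is false, and then the ordinary Adam step runs. A scalar sketch of the fused form used by the GPU kernel (the CPU path instead applies AdamWFunctor to the parameter and delegates to the Adam kernel); reference code only:

    #include <cmath>
    #include <cstdio>

    double AdamwStep(double p, double g, double* m, double* v,
                     double beta1_pow, double beta2_pow, double lr,
                     double coeff, double lr_ratio = 1.0, double beta1 = 0.9,
                     double beta2 = 0.999, double eps = 1e-8) {
      lr *= lr_ratio;         // as in the kernels: lr already carries lr_ratio
      p *= 1.0 - lr * coeff;  // decoupled decay; never enters m/v
      *m = beta1 * *m + (1.0 - beta1) * g;
      *v = beta2 * *v + (1.0 - beta2) * g * g;
      double denom = std::sqrt(*v) / std::sqrt(1.0 - beta2_pow) + eps;
      return p - (*m / denom) * (lr / (1.0 - beta1_pow));
    }

    int main() {
      double m = 0.0, v = 0.0;
      double p = AdamwStep(1.0, 0.5, &m, &v, 0.9, 0.999, 1e-3, 0.01);
      std::printf("p = %.6f\n", p);
      return 0;
    }

Keeping the decay out of the moment estimates is the point of AdamW: the same (1 - lr * coeff) shrinkage applied through the gradient would be rescaled by the adaptive denominator.
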
+ +#include "paddle/phi/kernels/selected_rows/adamw_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/adam_kernel.h" +#include "paddle/phi/kernels/funcs/adam_functors.h" +#include "paddle/phi/kernels/selected_rows/adam_kernel.h" + +namespace phi { +namespace sr { + +template +void AdamwDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + bool skip_update_ = false; + if (skip_update.is_initialized()) { + PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + VLOG(3) << "Skip update" << skip_update_; + + if (skip_update_ || !with_decay) { + AdamDenseParamSparseGradKernel(dev_ctx, + param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); + return; + } + + auto* param_ = + master_param.is_initialized() ? master_param.get_ptr() : ¶m; + T coeff_ = static_cast(coeff); + T lr_ratio_ = static_cast(lr_ratio); + funcs::AdamWFunctor functor( + coeff_, + lr_ratio_, + learning_rate.data(), + const_cast(param_->data())); + functor(param_->numel()); + + AdamDenseParamSparseGradKernel(dev_ctx, + param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + beta1, + beta2, + epsilon, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_KERNEL(adamw_dense_param_sparse_grad, + CPU, + ALL_LAYOUT, + phi::sr::AdamwDenseParamSparseGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/selected_rows/adamw_kernel.cu b/paddle/phi/kernels/selected_rows/adamw_kernel.cu new file mode 100644 index 0000000000000..89f5db74e6550 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/adamw_kernel.cu @@ -0,0 +1,312 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/adamw_kernel.h" + +#include // for sqrt in CPU and CUDA +#include + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/adam_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +namespace sr { + +template +__global__ void UpdateAdamWBetaPow(T beta1, + T beta2, + const T* beta1_pow_, + const T* beta2_pow_, + T* beta1_pow_out, + T* beta2_pow_out) { + *beta1_pow_out = beta1 * beta1_pow_[0]; + *beta2_pow_out = beta2 * beta2_pow_[0]; +} + +template +__global__ void SparseAdamWCUDAKernelREG(MT beta1, + MT beta2, + MT epsilon, + MT coeff, + MT lr_ratio, + const MT beta1_pow, + const MT beta2_pow, + const MT* mom1_, + MT* mom1_out_, + const MT* mom2_, + MT* mom2_out_, + const MT* lr_, + const T* grad_, + const T* param_, + T* param_out_, + const MT* master_param, + MT* master_param_out, + const int64_t* rows_, + int64_t row_numel, + int64_t row_count, + bool lazy_mode, + int ndim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + MT lr = *lr_ * lr_ratio; + + for (; id < ndim; id += blockDim.x * gridDim.x) { + auto row_idx = + phi::funcs::BinarySearch(rows_, row_count, id / row_numel); + if (lazy_mode && row_idx < 0) { + return; + } else { + MT mom1 = static_cast(mom1_[id]); + MT mom2 = static_cast(mom2_[id]); + + MT p = master_param ? master_param[id] : static_cast(param_[id]); + MT g = row_idx >= 0 + ? 
static_cast(grad_[row_idx * row_numel + id % row_numel]) + : static_cast(0); + + p *= (static_cast(1.0) - lr * coeff); + + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = + (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + // Write back to global memory + mom1_out_[id] = mom1; + mom2_out_[id] = mom2; + param_out_[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } + } +} + +xtemplate void AdamwDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + using MPDType = typename phi::dtype::MPTypeTrait::Type; + + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + MPDType coeff_ = static_cast(coeff); + MPDType lr_ratio_ = static_cast(lr_ratio); + + bool skip_update_ = false; + if (skip_update.is_initialized()) { + PADDLE_ENFORCE_EQ( + skip_update->numel(), + 1, + errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", + skip_update->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); + skip_update_ = skip_update_vec[0]; + } + + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update_) { + VLOG(4) << "Adamw skip update"; + phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); + phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); + phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); + phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); + return; + } + + // if with_decay = false, coeff = 0 + if (!with_decay) { + coeff_ = static_cast(0.0); + } + + MPDType beta1_ = beta1.to(); + MPDType beta2_ = beta2.to(); + MPDType epsilon_ = epsilon.to(); + VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel() + << "beta2_pow.numel() : " << beta2_pow.numel(); + VLOG(3) << "param.numel(): " << param.numel(); + PADDLE_ENFORCE_EQ( + beta1_pow_out->numel(), + 1, + errors::InvalidArgument("beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ( + beta2_pow_out->numel(), + 1, + errors::InvalidArgument("beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision ? 
dev_ctx.template Alloc(master_param_outs) + : nullptr; + + if (grad.rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } + + std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); + bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + if (cpu_rows[i - 1] >= cpu_rows[i]) { + is_strict_sorted = false; + break; + } + } + + phi::SelectedRows tmp_grad_merge; + const phi::SelectedRows* grad_merge_ptr; + if (is_strict_sorted) { + grad_merge_ptr = &grad; + } else { + // merge duplicated rows if any. + // The rows of grad_merge have been sorted inside MergeAdd functor + paddle::operators::math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, grad, &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; + } + auto& grad_merge = *grad_merge_ptr; + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows(grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace()); + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + + if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) { + int threads = 512; + int ndim = param.numel(); + int blocks = (ndim + threads - 1) / threads; + + SparseAdamWCUDAKernelREG<<>>( + beta1_, + beta2_, + epsilon_, + coeff_, + lr_ratio_, + *beta1_pow.data(), + *beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad_data, + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + rows, + row_numel, + grad_merge.rows().size(), + lazy_mode, + ndim); + if (!use_global_beta_pow) { + // Update with cpu + dev_ctx.template HostAlloc(beta1_pow_out)[0] = + beta1_ * beta1_pow.data()[0]; + dev_ctx.template HostAlloc(beta2_pow_out)[0] = + beta2_ * beta2_pow.data()[0]; + } + } else { + funcs::SparseAdamWFunctor functor( + beta1_, + beta2_, + epsilon_, + coeff_, + lr_ratio_, + beta1_pow.data(), + beta2_pow.data(), + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad_data, + param.data(), + dev_ctx.template Alloc(param_out), + master_in_data, + master_out_data, + rows, + row_numel, + grad_merge.rows().size(), + lazy_mode); + + // FIXME(minqiyang): remove BinarySearch in GPU later + funcs::ForRange for_range(dev_ctx, param.numel()); + for_range(functor); + if (!use_global_beta_pow) { + // update beta1 and beta2 + UpdateAdamWBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1_, + beta2_, + beta1_pow.data(), + beta2_pow.data(), + dev_ctx.template Alloc(beta1_pow_out), + dev_ctx.template Alloc(beta2_pow_out)); + } + } +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_KERNEL(adamw_dense_param_sparse_grad, + GPU, + ALL_LAYOUT, + phi::sr::AdamwDenseParamSparseGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/selected_rows/adamw_kernel.h b/paddle/phi/kernels/selected_rows/adamw_kernel.h new file mode 100644 index 0000000000000..ddb155ce4504e --- /dev/null +++ b/paddle/phi/kernels/selected_rows/adamw_kernel.h @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void AdamwDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + paddle::optional master_param, + paddle::optional skip_update, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + float lr_ratio, + float coeff, + bool with_decay, + bool lazy_mode, + int64_t min_row_size_to_use_multithread, + bool multi_precision, + bool use_global_beta_pow, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs); + +} // namespace sr +} // namespace phi From 0ee26f6e3f0014b3b3a9550beafbb9ac962f9780 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 14 Mar 2022 05:53:09 +0000 Subject: [PATCH 12/16] modify according reviewer --- paddle/phi/kernels/gpu/adam_kernel.cu | 253 +------------------------- 1 file changed, 3 insertions(+), 250 deletions(-) diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index 64f9df7288bc3..68bf1757a9912 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -12,14 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/adam_kernel.h" + #include // for sqrt in CPU and CUDA -#include -#include -#include #include -#include "paddle/phi/kernels/adam_kernel.h" - +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" @@ -128,60 +126,6 @@ __global__ void UpdateBetaPow(T beta1, *beta2_pow_out = beta2 * beta2_pow_[0]; } -template -__global__ void SparseAdamCUDAKernelREG(MT beta1, - MT beta2, - MT epsilon, - const MT beta1_pow, - const MT beta2_pow, - const MT* mom1_, - MT* mom1_out_, - const MT* mom2_, - MT* mom2_out_, - const MT* lr_, - const T* grad_, - const T* param_, - T* param_out_, - const MT* master_param, - MT* master_param_out, - const int64_t* rows_, - int64_t row_numel, - int64_t row_count, - bool lazy_mode, - int ndim) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - MT lr = *lr_; - - for (; id < ndim; id += blockDim.x * gridDim.x) { - auto row_idx = - phi::funcs::BinarySearch(rows_, row_count, id / row_numel); - if (lazy_mode && row_idx < 0) { - return; - } else { - MT mom1 = mom1_[id]; - MT mom2 = mom2_[id]; - MT p = master_param ? master_param[id] : static_cast(param_[id]); - MT g = row_idx >= 0 - ? 
static_cast(grad_[row_idx * row_numel + id % row_numel]) - : static_cast(0); - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = - (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - // Write back to global memory - mom1_out_[id] = mom1; - mom2_out_[id] = mom2; - param_out_[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } - } -} - template void AdamDenseKernel(const Context& dev_ctx, const DenseTensor& param, @@ -320,189 +264,6 @@ void AdamDenseKernel(const Context& dev_ctx, } } -template -void AdamDenseParamSparseGradKernel( - const Context& dev_ctx, - const DenseTensor& param, - const SelectedRows& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs) { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - bool skip_update_ = false; - if (skip_update.is_initialized()) { - PADDLE_ENFORCE_EQ( - skip_update->numel(), - 1, - errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", - skip_update->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); - skip_update_ = skip_update_vec[0]; - } - // skip_update=true, just copy input to output, and TensorCopy will call - // mutable_data - if (skip_update_) { - VLOG(4) << "Adam skip update"; - phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); - phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); - phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); - return; - } - - MPDType beta1_ = beta1.to(); - MPDType beta2_ = beta2.to(); - MPDType epsilon_ = epsilon.to(); - VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel() - << "beta2_pow.numel() : " << beta2_pow.numel(); - VLOG(3) << "param.numel(): " << param.numel(); - PADDLE_ENFORCE_EQ( - beta1_pow_out->numel(), - 1, - errors::InvalidArgument("beta1 pow output size should be 1, but received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ( - beta2_pow_out->numel(), - 1, - errors::InvalidArgument("beta2 pow output size should be 1, but received " - "value is:%d.", - beta2_pow_out->numel())); - - const MPDType* master_in_data = - multi_precision ? master_param->data() : nullptr; - MPDType* master_out_data = - multi_precision ? 
dev_ctx.template Alloc(master_param_outs) - : nullptr; - - if (grad.rows().size() == 0) { - VLOG(3) << "grad row size is 0!!"; - return; - } - - std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); - bool is_strict_sorted = true; - for (size_t i = 1; i < cpu_rows.size(); ++i) { - if (cpu_rows[i - 1] >= cpu_rows[i]) { - is_strict_sorted = false; - break; - } - } - - phi::SelectedRows tmp_grad_merge; - const phi::SelectedRows* grad_merge_ptr; - if (is_strict_sorted) { - grad_merge_ptr = &grad; - } else { - // merge duplicated rows if any. - // The rows of grad_merge have been sorted inside MergeAdd functor - paddle::operators::math::scatter::MergeAdd merge_func; - merge_func(dev_ctx, grad, &tmp_grad_merge, true); - grad_merge_ptr = &tmp_grad_merge; - } - auto& grad_merge = *grad_merge_ptr; - auto& grad_tensor = grad_merge.value(); - const T* grad_data = grad_tensor.template data(); - auto* grad_merge_rows = &grad_merge.rows(); - paddle::framework::MixVector mixv_grad_merge_rows(grad_merge_rows); - const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace()); - auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); - - if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) { - int threads = 512; - int ndim = param.numel(); - int blocks = (ndim + threads - 1) / threads; - - SparseAdamCUDAKernelREG<<>>( - beta1_, - beta2_, - epsilon_, - *beta1_pow.data(), - *beta2_pow.data(), - moment1.data(), - dev_ctx.template Alloc(moment1_out), - moment2.data(), - dev_ctx.template Alloc(moment2_out), - learning_rate.data(), - grad_data, - param.data(), - dev_ctx.template Alloc(param_out), - master_in_data, - master_out_data, - rows, - row_numel, - grad_merge.rows().size(), - lazy_mode, - ndim); - if (!use_global_beta_pow) { - // Update with cpu - dev_ctx.template HostAlloc(beta1_pow_out)[0] = - beta1_ * beta1_pow.data()[0]; - dev_ctx.template HostAlloc(beta2_pow_out)[0] = - beta2_ * beta2_pow.data()[0]; - } - } else { - funcs::SparseAdamFunctor functor( - beta1_, - beta2_, - epsilon_, - beta1_pow.data(), - beta2_pow.data(), - moment1.data(), - dev_ctx.template Alloc(moment1_out), - moment2.data(), - dev_ctx.template Alloc(moment2_out), - learning_rate.data(), - grad_data, - param.data(), - dev_ctx.template Alloc(param_out), - master_in_data, - master_out_data, - rows, - row_numel, - grad_merge.rows().size(), - lazy_mode); - - // FIXME(minqiyang): remove BinarySearch in GPU later - funcs::ForRange for_range(dev_ctx, param.numel()); - for_range(functor); - if (!use_global_beta_pow) { - // update beta1 and beta2 - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1_, - beta2_, - beta1_pow.data(), - beta2_pow.data(), - dev_ctx.template Alloc(beta1_pow_out), - dev_ctx.template Alloc(beta2_pow_out)); - } - } -} - } // namespace phi PD_REGISTER_KERNEL(adam, @@ -512,11 +273,3 @@ PD_REGISTER_KERNEL(adam, float, double, phi::dtype::float16) {} - -PD_REGISTER_KERNEL(adam_dense_param_sparse_grad, - GPU, - ALL_LAYOUT, - phi::AdamDenseParamSparseGradKernel, - float, - double, - phi::dtype::float16) {} From 21be3e34826a2f787ed618b8d3650fa75b099a70 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 15 Mar 2022 01:52:37 +0000 Subject: [PATCH 13/16] fix dtype float16 --- paddle/phi/kernels/selected_rows/adam_kernel.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/selected_rows/adam_kernel.cu b/paddle/phi/kernels/selected_rows/adam_kernel.cu index 672399b9a7c5d..32c05765a9ab0 100644 --- a/paddle/phi/kernels/selected_rows/adam_kernel.cu 
+++ b/paddle/phi/kernels/selected_rows/adam_kernel.cu @@ -283,4 +283,5 @@ PD_REGISTER_KERNEL(adam_dense_param_sparse_grad, ALL_LAYOUT, phi::sr::AdamDenseParamSparseGradKernel, float, - double) {} + double, + phi::dtype::float16) {} From c657b596c8f6ca9a4124ebf46d7303f66cee554a Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 15 Mar 2022 08:18:22 +0000 Subject: [PATCH 14/16] fix typo --- paddle/phi/kernels/selected_rows/adamw_kernel.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/selected_rows/adamw_kernel.cu b/paddle/phi/kernels/selected_rows/adamw_kernel.cu index 89f5db74e6550..2e48b8235ed72 100644 --- a/paddle/phi/kernels/selected_rows/adamw_kernel.cu +++ b/paddle/phi/kernels/selected_rows/adamw_kernel.cu @@ -102,7 +102,8 @@ __global__ void SparseAdamWCUDAKernelREG(MT beta1, } } -xtemplate void AdamwDenseParamSparseGradKernel( +template +void AdamwDenseParamSparseGradKernel( const Context& dev_ctx, const DenseTensor& param, const SelectedRows& grad, From 42aa15ef81b45c3dd8ca5c518924f31eff7202e3 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 24 Mar 2022 03:39:37 +0000 Subject: [PATCH 15/16] fix Cmake --- cmake/phi.cmake | 2 +- paddle/phi/kernels/adam_kernel.h | 1 - paddle/phi/kernels/adamw_kernel.h | 1 - paddle/phi/kernels/selected_rows/CMakeLists.txt | 4 ++-- paddle/phi/kernels/selected_rows/{ => cpu}/adam_kernel.cc | 0 paddle/phi/kernels/selected_rows/{ => cpu}/adamw_kernel.cc | 0 paddle/phi/kernels/selected_rows/{ => gpu}/adam_kernel.cu | 0 paddle/phi/kernels/selected_rows/{ => gpu}/adamw_kernel.cu | 0 8 files changed, 3 insertions(+), 5 deletions(-) rename paddle/phi/kernels/selected_rows/{ => cpu}/adam_kernel.cc (100%) rename paddle/phi/kernels/selected_rows/{ => cpu}/adamw_kernel.cc (100%) rename paddle/phi/kernels/selected_rows/{ => gpu}/adam_kernel.cu (100%) rename paddle/phi/kernels/selected_rows/{ => gpu}/adamw_kernel.cu (100%) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index f1241aaa66bb8..8cdde4a2ea61d 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -113,7 +113,7 @@ function(kernel_library TARGET) # used for cc_library selected_rows dir target set(target_suffix "") - if ("${kernel_library_SUB_DIR}" STREQUAL "selected_rows_kernel") + if ("${kernel_library_SUB_DIR}" STREQUAL "selected_rows") set(target_suffix "_sr") endif() diff --git a/paddle/phi/kernels/adam_kernel.h b/paddle/phi/kernels/adam_kernel.h index eadd0e6da699d..f144d40d2b666 100644 --- a/paddle/phi/kernels/adam_kernel.h +++ b/paddle/phi/kernels/adam_kernel.h @@ -16,7 +16,6 @@ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/selected_rows.h" namespace phi { diff --git a/paddle/phi/kernels/adamw_kernel.h b/paddle/phi/kernels/adamw_kernel.h index cda90102c003d..d7b072adda4a2 100644 --- a/paddle/phi/kernels/adamw_kernel.h +++ b/paddle/phi/kernels/adamw_kernel.h @@ -16,7 +16,6 @@ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/selected_rows.h" namespace phi { diff --git a/paddle/phi/kernels/selected_rows/CMakeLists.txt b/paddle/phi/kernels/selected_rows/CMakeLists.txt index 4e6c110c670b4..c6fb621ffc075 100644 --- a/paddle/phi/kernels/selected_rows/CMakeLists.txt +++ b/paddle/phi/kernels/selected_rows/CMakeLists.txt @@ -1,3 +1,3 @@ -set(SELECTED_ROWS_KERNEL_DEPS dense_tensor selected_rows sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel) -register_kernels(DEPS 
${SELECTED_ROWS_KERNEL_DEPS} SUB_DIR "selected_rows_kernel") +set(SELECTED_ROWS_KERNEL_DEPS dense_tensor selected_rows selected_rows_functor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel) +register_kernels(DEPS ${SELECTED_ROWS_KERNEL_DEPS} SUB_DIR "selected_rows") diff --git a/paddle/phi/kernels/selected_rows/adam_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc similarity index 100% rename from paddle/phi/kernels/selected_rows/adam_kernel.cc rename to paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc diff --git a/paddle/phi/kernels/selected_rows/adamw_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc similarity index 100% rename from paddle/phi/kernels/selected_rows/adamw_kernel.cc rename to paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc diff --git a/paddle/phi/kernels/selected_rows/adam_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu similarity index 100% rename from paddle/phi/kernels/selected_rows/adam_kernel.cu rename to paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu diff --git a/paddle/phi/kernels/selected_rows/adamw_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu similarity index 100% rename from paddle/phi/kernels/selected_rows/adamw_kernel.cu rename to paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu From c5e9cc1a0e7848b7157f160a40babf3c7c4b51ff Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 24 Mar 2022 06:14:49 +0000 Subject: [PATCH 16/16] fix code style --- paddle/phi/infermeta/multiary.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7c5d7cbee97ce..6d6c8656e3dc1 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -251,7 +251,7 @@ void AdamwInferMeta(const MetaTensor& param, beta1_pow_out, beta2_pow_out, master_param_outs); - } +} void AucInferMeta(const MetaTensor& input, const MetaTensor& label, @@ -333,7 +333,6 @@ void AucInferMeta(const MetaTensor& input, } } - void BatchNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias,
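
Note on the update rule implemented by the kernels in this series: SparseAdamWCUDAKernelREG and the funcs::SparseAdamWFunctor path compute the same per-element AdamW step and differ only in how the beta-pow scalars are read (by value after a host-side dereference, or by device pointer). The plain C++ sketch below is a minimal single-precision paraphrase of that step, without master weights; the function name and container types are illustrative assumptions, not part of the Paddle API. It also assumes the gradient rows arrive sorted and deduplicated, which the kernels guarantee by routing unsorted input through MergeAdd first.

    // Reference sketch only (hypothetical name, not a Paddle API): one AdamW
    // step over a dense parameter whose gradient is sparse. grad_values holds
    // rows.size() x row_numel entries; the i-th block of row_numel values is
    // the gradient of parameter row rows[i]. rows is sorted and unique.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    void SparseAdamWReference(float beta1, float beta2, float epsilon,
                              float coeff, float lr_ratio, float lr,
                              float beta1_pow, float beta2_pow,
                              const std::vector<int64_t>& rows,
                              const std::vector<float>& grad_values,
                              int64_t row_numel, bool lazy_mode,
                              std::vector<float>* param,
                              std::vector<float>* mom1,
                              std::vector<float>* mom2) {
      lr *= lr_ratio;
      const int64_t n = static_cast<int64_t>(param->size());
      for (int64_t id = 0; id < n; ++id) {
        // Counterpart of phi::funcs::BinarySearch in the CUDA kernel: find
        // the gradient row (if any) covering this parameter element.
        auto it = std::lower_bound(rows.begin(), rows.end(), id / row_numel);
        const bool hit = (it != rows.end() && *it == id / row_numel);
        if (lazy_mode && !hit) continue;  // lazy_mode: absent rows stay put
        const float g =
            hit ? grad_values[(it - rows.begin()) * row_numel + id % row_numel]
                : 0.0f;

        // Decoupled weight decay: shrink the parameter before the Adam step.
        (*param)[id] *= 1.0f - lr * coeff;

        (*mom1)[id] = beta1 * (*mom1)[id] + (1.0f - beta1) * g;
        (*mom2)[id] = beta2 * (*mom2)[id] + (1.0f - beta2) * g * g;

        // Bias-corrected step, arranged exactly as in the kernel:
        // p -= lr/(1-beta1_pow) * m1 / (sqrt(m2)/sqrt(1-beta2_pow) + eps)
        const float denom =
            std::sqrt((*mom2)[id]) / std::sqrt(1.0f - beta2_pow) + epsilon;
        (*param)[id] -= (lr / (1.0f - beta1_pow)) * ((*mom1)[id] / denom);
      }
    }

Two design points visible in the diffs are worth spelling out. First, the decay handling differs by backend: the GPU SelectedRows kernel keeps a single code path and simply forces coeff to zero when with_decay is false, while the CPU SelectedRows kernel applies funcs::AdamWFunctor for the decay term and then delegates the remaining Adam update to AdamDenseParamSparseGradKernel. Second, the beta-pow accumulators advance by one multiply per step; this happens on the host when the tensors live on CPU and in the single-thread UpdateAdamWBetaPow kernel otherwise, and it is skipped entirely under use_global_beta_pow, which (as the flag name suggests) leaves their maintenance to the caller.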