From 6f36a929001a50880a98b6f9ed9158955dd99086 Mon Sep 17 00:00:00 2001
From: chentianyu03 <chentianyu03@baidu.com>
Date: Tue, 29 Mar 2022 11:57:53 +0000
Subject: [PATCH 1/6] add concat_grad kernel

---
 paddle/fluid/operators/concat_op.cc           | 15 ----
 paddle/fluid/operators/concat_op.cu.cc        | 36 ---------
 paddle/fluid/operators/concat_op.h            | 56 -------------
 paddle/phi/kernels/concat_grad_kernel.h       | 30 +++++++
 paddle/phi/kernels/cpu/concat_grad_kernel.cc  | 35 +++++++++
 paddle/phi/kernels/gpu/concat_grad_kernel.cu  | 37 +++++++++
 .../kernels/impl/concat_grad_kernel_impl.h    | 78 +++++++++++++++++++
 paddle/phi/ops/compat/concat_sig.cc           | 12 +++
 8 files changed, 192 insertions(+), 107 deletions(-)
 delete mode 100644 paddle/fluid/operators/concat_op.cu.cc
 create mode 100644 paddle/phi/kernels/concat_grad_kernel.h
 create mode 100644 paddle/phi/kernels/cpu/concat_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/gpu/concat_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/impl/concat_grad_kernel_impl.h
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 059fafa3e7f4d..a467f2dbee7c9 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -216,18 +216,3 @@ REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad,
                   ops::ConcatDoubleGradOpMaker<paddle::framework::OpDesc>,
                   ops::ConcatDoubleGradOpMaker<paddle::imperative::OpBase>,
                   ops::ConcatOpGradNoNeedBufferVarInferer);
-
-REGISTER_OP_CPU_KERNEL(
-    concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext,
-                          paddle::platform::float16>,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext,
-                          paddle::platform::complex<float>>,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext,
-                          paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc
deleted file mode 100644
index f7b64f16e2d8b..0000000000000
--- a/paddle/fluid/operators/concat_op.cu.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/concat_op.h"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext,
-                          plat::complex<float>>,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext,
-                          plat::complex<double>>);
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index ec43e2ad374db..50aca54c12dec 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -39,62 +39,6 @@ static inline int64_t ComputeAxis(int64_t axis, int64_t rank) {
   }
   return axis > 0 ? axis : 0;
 }
-template <typename DeviceContext, typename T>
-class ConcatGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto ins = ctx.MultiInput<framework::LoDTensor>("X");
-    auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
-    auto outs =
-        ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
-
-    {
-      auto dx = outs;
-      auto x = ins;
-      for (size_t i = 0; i < dx.size(); ++i) {
-        if (dx[i] != nullptr) {
-          dx[i]->set_lod(x[i]->lod());
-        }
-      }
-    }
-    PADDLE_ENFORCE_NOT_NULL(ins[0],
-                            platform::errors::NotFound(
-                                "The first input tensor is not initalized."));
-
-    auto axis = ctx.Attr<int>("axis");
-    if (ctx.HasInput("AxisTensor")) {
-      auto* axis_tensor = ctx.Input<framework::Tensor>("AxisTensor");
-      axis = GetDataFromTensor<int>(axis_tensor)[0];
-    }
-    axis = ComputeAxis(static_cast<int64_t>(axis),
-                       static_cast<int64_t>(ins[0]->dims().size()));
-    // get output tensor that the name is not kEmptyVarName
-    std::vector<framework::Tensor*> outputs;
-    for (size_t j = 0; j < outs.size(); ++j) {
-      if (out_var_names[j] != framework::kEmptyVarName &&
-          outs[j]->numel() != 0UL) {
-        outs[j]->mutable_data<T>(ctx.GetPlace());
-        outputs.push_back(outs[j]);
-      } else {
-        outputs.push_back(nullptr);
-      }
-    }
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
-    // Sometimes direct copies will be faster, this maybe need deeply analysis.
-    if (axis == 0 && outs.size() < 10) {
-      std::vector<const framework::Tensor*> ref_shape;
-      ref_shape.insert(ref_shape.begin(), ins.begin(), ins.end());
-      StridedMemcpyWithAxis0<T>(dev_ctx, *out_grad, ref_shape, &outputs);
-    } else {
-      math::SplitFunctor<DeviceContext, T> split_functor;
-      split_functor(dev_ctx, *out_grad, ctx.MultiInput<framework::Tensor>("X"),
-                    static_cast<int>(axis), &outputs);
-    }
-  }
-};
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/phi/kernels/concat_grad_kernel.h b/paddle/phi/kernels/concat_grad_kernel.h
new file mode 100644
index 0000000000000..e407d73bb49ee
--- /dev/null
+++ b/paddle/phi/kernels/concat_grad_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/multiary.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+namespace phi {
+
+template <typename T, typename Context>
+void ConcatGradKernel(const Context& dev_ctx,
+                      const std::vector<const DenseTensor*>& x,
+                      const DenseTensor& out_grad,
+                      const Scalar& axis_scalar,
+                      std::vector<DenseTensor*> x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/concat_grad_kernel.cc b/paddle/phi/kernels/cpu/concat_grad_kernel.cc
new file mode 100644
index 0000000000000..2aaf230d8b608
--- /dev/null
+++ b/paddle/phi/kernels/cpu/concat_grad_kernel.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/concat_grad_kernel.h"
+
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/concat_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(concat_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ConcatGradKernel,
+                   double,
+                   float,
+                   bool,
+                   int64_t,
+                   int,
+                   uint8_t,
+                   phi::dtype::float16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/concat_grad_kernel.cu b/paddle/phi/kernels/gpu/concat_grad_kernel.cu
new file mode 100644
index 0000000000000..bb2d7aa4fd717
--- /dev/null
+++ b/paddle/phi/kernels/gpu/concat_grad_kernel.cu
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/concat_grad_kernel.h"
+
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/concat_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(concat_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ConcatGradKernel,
+                   float,
+                   double,
+                   bool,
+                   int64_t,
+                   int,
+                   uint8_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
new file mode 100644
index 0000000000000..100d0352b7c91
--- /dev/null
+++ b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "paddle/phi/kernels/concat_grad_kernel.h"
+
+#include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
+#include "paddle/phi/kernels/funcs/concat_funcs.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConcatGradKernel(const Context& dev_ctx,
+                      const std::vector<const DenseTensor*>& x,
+                      const DenseTensor& out_grad,
+                      const Scalar& axis_scalar,
+                      std::vector<DenseTensor*> x_grad) {
+  auto outs = x_grad;
+  {
+    auto dx = x_grad;
+    for (size_t i = 0; i < dx.size(); ++i) {
+      if (dx[i] != nullptr) {
+        dx[i]->set_lod(x[i]->lod());
+      }
+    }
+  }
+  PADDLE_ENFORCE_NOT_NULL(
+      x[0], phi::errors::NotFound("The first input tensor is not initalized."));
+
+  auto axis = axis_scalar.to<int>();
+  /*
+  if (axis_scalar.FromTensor()) {
+    auto* axis_tensor = ctx.Input<framework::Tensor>("AxisTensor");
+    axis = GetDataFromTensor<int>(axis_tensor)[0];
+  }
+  */
+  axis = funcs::ComputeAxis(static_cast<int64_t>(axis),
+                            static_cast<int64_t>(x[0]->dims().size()));
+  // get output tensor that the name is not kEmptyVarName
+  std::vector<DenseTensor*> outputs;
+  for (size_t j = 0; j < outs.size(); ++j) {
+    // if (out_var_names[j] != framework::kEmptyVarName && outs[j]->numel() !=
+    // 0UL) {
+    if (outs[j]->numel() != 0UL) {
+      // outs[j]->mutable_data<T>(ctx.GetPlace());
+      dev_ctx.template Alloc<T>(outs[j]);
+
+      outputs.push_back(outs[j]);
+    } else {
+      outputs.push_back(nullptr);
+    }
+  }
+
+  // Direct copies are sometimes faster; this may need deeper analysis.
+  if (axis == 0 && outs.size() < 10) {
+    std::vector<const DenseTensor*> ref_shape;
+    ref_shape.insert(ref_shape.begin(), x.begin(), x.end());
+    paddle::operators::StridedMemcpyWithAxis0<T>(
+        dev_ctx, out_grad, ref_shape, &outputs);
+  } else {
+    phi::funcs::SplitFunctor<Context, T> split_functor;
+    split_functor(dev_ctx, out_grad, x, static_cast<int>(axis), &outputs);
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/concat_sig.cc b/paddle/phi/ops/compat/concat_sig.cc
index 21e653ccfe90f..c192f7fc69afc 100644
--- a/paddle/phi/ops/compat/concat_sig.cc
+++ b/paddle/phi/ops/compat/concat_sig.cc
@@ -23,6 +23,18 @@ KernelSignature ConcatOpArgumentMapping(const ArgumentMappingContext& ctx) {
   return KernelSignature("concat", {"X"}, {"axis"}, {"Out"});
 }
 
+KernelSignature ConcatGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("AxisTensor")) {
+    return KernelSignature("concat_grad",
+                           {"X", {GradVarName("Out")}},
+                           {"AxisTensor"},
+                           {{GradVarName("X")}});
+  }
+  return KernelSignature(
+      "concat", {"X", {GradVarName("Out")}}, {"axis"}, {{GradVarName("X")}});
+}
+
 }  // namespace phi
 
 PD_REGISTER_ARG_MAPPING_FN(concat, phi::ConcatOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(concat_grad, phi::ConcatGradOpArgumentMapping);

From 319a50072dffa8deff06e7e732ba0938235d1234 Mon Sep 17 00:00:00 2001
From: chentianyu03 <chentianyu03@baidu.com>
Date: Tue, 29 Mar 2022 12:46:48 +0000
Subject: [PATCH 2/6] fix concat_grad kernel name in arg mapping; use USE_OP_ITSELF in test

---
 .../framework/new_executor/standalone_executor_test.cc      | 2 +-
 paddle/phi/ops/compat/concat_sig.cc                         | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
index d3adccff73337..8f1cf81ea043e 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -46,7 +46,7 @@ USE_OP_ITSELF(elementwise_add_grad);
 USE_OP_ITSELF(matmul_grad);
 USE_OP_ITSELF(square);
 USE_OP_ITSELF(transpose2_grad);
-USE_OP(concat_grad);
+USE_OP_ITSELF(concat_grad);
 USE_OP_ITSELF(elementwise_mul_grad);
 USE_OP_ITSELF(sigmoid_grad);
 USE_OP_ITSELF(tanh_grad);
diff --git a/paddle/phi/ops/compat/concat_sig.cc b/paddle/phi/ops/compat/concat_sig.cc
index c192f7fc69afc..d443f521c6146 100644
--- a/paddle/phi/ops/compat/concat_sig.cc
+++ b/paddle/phi/ops/compat/concat_sig.cc
@@ -30,8 +30,10 @@ KernelSignature ConcatGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
                            {"AxisTensor"},
                            {{GradVarName("X")}});
   }
-  return KernelSignature(
-      "concat", {"X", {GradVarName("Out")}}, {"axis"}, {{GradVarName("X")}});
+  return KernelSignature("concat_grad",
+                         {"X", {GradVarName("Out")}},
+                         {"axis"},
+                         {{GradVarName("X")}});
 }
 
 }  // namespace phi

From 9ac68392de94ed7b9317217e8c3a2a3ac7595487 Mon Sep 17 00:00:00 2001
From: chentianyu03 <chentianyu03@baidu.com>
Date: Tue, 29 Mar 2022 12:48:45 +0000
Subject: [PATCH 3/6] remove commented-out code

---
 paddle/phi/kernels/impl/concat_grad_kernel_impl.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
index 100d0352b7c91..07baa10372a74 100644
--- a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
@@ -40,21 +40,12 @@ void ConcatGradKernel(const Context& dev_ctx,
       x[0], phi::errors::NotFound("The first input tensor is not initalized."));
 
   auto axis = axis_scalar.to<int>();
-  /*
-  if (axis_scalar.FromTensor()) {
-    auto* axis_tensor = ctx.Input<framework::Tensor>("AxisTensor");
-    axis = GetDataFromTensor<int>(axis_tensor)[0];
-  }
-  */
   axis = funcs::ComputeAxis(static_cast<int64_t>(axis),
                             static_cast<int64_t>(x[0]->dims().size()));
   // get output tensor that the name is not kEmptyVarName
   std::vector<DenseTensor*> outputs;
   for (size_t j = 0; j < outs.size(); ++j) {
-    // if (out_var_names[j] != framework::kEmptyVarName && outs[j]->numel() !=
-    // 0UL) {
     if (outs[j]->numel() != 0UL) {
-      // outs[j]->mutable_data<T>(ctx.GetPlace());
       dev_ctx.template Alloc<T>(outs[j]);
 
       outputs.push_back(outs[j]);

From 52e6539fbc5e3f3e7a4e3ec8beff245997da3f85 Mon Sep 17 00:00:00 2001
From: chentianyu03 <chentianyu03@baidu.com>
Date: Wed, 30 Mar 2022 03:45:56 +0000
Subject: [PATCH 4/6] fix nullptr dereference of outs entries

---
 paddle/phi/kernels/impl/concat_grad_kernel_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
index 07baa10372a74..e89920340ff18 100644
--- a/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/concat_grad_kernel_impl.h
@@ -45,7 +45,7 @@ void ConcatGradKernel(const Context& dev_ctx,
   // get output tensor that the name is not kEmptyVarName
   std::vector<DenseTensor*> outputs;
   for (size_t j = 0; j < outs.size(); ++j) {
-    if (outs[j]->numel() != 0UL) {
+    if (outs[j] && outs[j]->numel() != 0UL) {
       dev_ctx.template Alloc<T>(outs[j]);
 
       outputs.push_back(outs[j]);

From b9d3c250bb804fd144506ce975f2bdb0f99306a0 Mon Sep 17 00:00:00 2001
From: chentianyu03 <chentianyu03@baidu.com>
Date: Wed, 30 Mar 2022 03:54:12 +0000
Subject: [PATCH 5/6] switch to phi headers

---
 paddle/phi/kernels/cpu/concat_grad_kernel.cc | 4 ++--
 paddle/phi/kernels/gpu/concat_grad_kernel.cu | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/phi/kernels/cpu/concat_grad_kernel.cc b/paddle/phi/kernels/cpu/concat_grad_kernel.cc
index 2aaf230d8b608..56ed95769fef4 100644
--- a/paddle/phi/kernels/cpu/concat_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/concat_grad_kernel.cc
@@ -14,9 +14,9 @@
 
 #include "paddle/phi/kernels/concat_grad_kernel.h"
 
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/concat_grad_kernel_impl.h"
 
diff --git a/paddle/phi/kernels/gpu/concat_grad_kernel.cu b/paddle/phi/kernels/gpu/concat_grad_kernel.cu
index bb2d7aa4fd717..2445978daca46 100644
--- a/paddle/phi/kernels/gpu/concat_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/concat_grad_kernel.cu
@@ -14,10 +14,10 @@
 
 #include "paddle/phi/kernels/concat_grad_kernel.h"
 
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/concat_grad_kernel_impl.h"
 

From c08f9ada016c40f9c154e19dd6ae039ae25bd74b Mon Sep 17 00:00:00 2001
From: chentianyu03 <chentianyu03@baidu.com>
Date: Thu, 31 Mar 2022 02:02:31 +0000
Subject: [PATCH 6/6] add concat_grad kernel declaration for standalone_executor_test

---
 paddle/fluid/framework/new_executor/standalone_executor_test.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
index 8f1cf81ea043e..ae57b68ad57ee 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -67,6 +67,7 @@ PD_DECLARE_KERNEL(transpose, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(reshape, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(concat_grad, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_raw, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);