[DCU] fix compile error on develop (#62832)
* [DCU] fix build error, test=develop

* fix py3 cpu ci build error
qili93 authored Mar 20, 2024
1 parent 4f06a9c commit 6925c9d
Showing 38 changed files with 1,204 additions and 132 deletions.
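Every hunk below makes the same change: guards spelled `#ifdef PADDLE_WITH_CUDA` are widened to `#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)`, so code shared by the CUDA and ROCm/HIP backends (Paddle's DCU builds define `PADDLE_WITH_HIP`) is compiled for both. A minimal, self-contained sketch of the pattern; the names below are illustrative, not Paddle's:

```cpp
#include <iostream>

// Shared GPU code must be visible whenever EITHER backend macro is
// defined; a CUDA-only #ifdef hides it from HIP builds, and the DCU
// compile then fails on the symbols it expected to find.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
constexpr const char* kBackend = "gpu";  // NVIDIA CUDA or DCU/ROCm HIP
#else
constexpr const char* kBackend = "cpu";  // CPU-only build (e.g. the py3 CPU CI)
#endif

int main() {
  // Build with -DPADDLE_WITH_HIP (or -DPADDLE_WITH_CUDA) to take the GPU path.
  std::cout << "built for: " << kBackend << "\n";
  return 0;
}
```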
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -70,7 +70,7 @@ static void RunProgramDescs(const ProgramDescs &programs,

FetchResultType ScopeBufferedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors, bool return_merged) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
strategy_.num_iteration_per_drop_scope_ =
std::numeric_limits<size_t>::max();
12 changes: 6 additions & 6 deletions paddle/fluid/framework/new_executor/pir_interpreter.cc
@@ -145,7 +145,7 @@ PirInterpreter::PirInterpreter(const platform::Place& place,
<< std::chrono::high_resolution_clock::now().time_since_epoch().count();
BuildScope(*ir_block_, ss.str(), value_exe_info_.get());

-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
calculate_stream_timer_ = std::make_unique<phi::CalculateStreamTimer>(place);
#endif
}
@@ -299,7 +299,7 @@ void PirInterpreter::ShareBuildResultsFrom(const InterpreterBaseImpl& src) {

std::tuple<double, double> PirInterpreter::InterpreterRunTime() {
double start_time = 0, end_time = 0;
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
start_time = calculate_stream_timer_->StartTime();
end_time = calculate_stream_timer_->EndTime();
#endif
@@ -337,7 +337,7 @@ std::shared_ptr<interpreter::AsyncWorkQueue> PirInterpreter::GetWorkQueue() {

void PirInterpreter::PrepareForCUDAGraphCapture() {
if (!FLAGS_new_executor_use_cuda_graph) return;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
platform::IsCUDAGraphCapturing(),
false,
@@ -362,7 +362,7 @@ void PirInterpreter::PrepareForCUDAGraphCapture() {

void PirInterpreter::CheckCUDAGraphBeforeRun(
const std::vector<std::string>& feed_names) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(
feed_names.empty(),
@@ -1724,7 +1724,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) {

try {
instr_node->WaitEvent(cur_place);
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (enable_job_schedule_profiler_) {
std::string op_name = instr_node->Name();
::pir::Operation* op = instr_node->Operation();
@@ -1772,7 +1772,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) {
}
VLOG(5) << "after run kernel";
instr_node->RecordEvent(cur_place);
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (enable_job_schedule_profiler_) {
if (instr_node->Id() == last_calculate_instr_id_ &&
calculate_stream_timer_->IsStarted()) {
4 changes: 2 additions & 2 deletions paddle/fluid/framework/new_executor/pir_interpreter.h
@@ -18,7 +18,7 @@
#include "paddle/fluid/framework/new_executor/interpreter_base_impl.h"
#include "paddle/pir/include/core/value.h"

-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/phi/kernels/autotune/gpu_timer.h"
#endif

@@ -274,7 +274,7 @@ class PirInterpreter : public InterpreterBaseImpl {
// belongs to a parameter and cannot GC.
std::unordered_set<std::string> parameter_var_names_;

-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::unique_ptr<phi::CalculateStreamTimer> calculate_stream_timer_;
#endif
size_t last_calculate_instr_id_;
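The header hunk above is the crux of this kind of compile error: `calculate_stream_timer_` is declared under a guard, and every use of it in pir_interpreter.cc must sit behind the same guard. A reduced sketch of that invariant; the class and timer here are stand-ins, not Paddle's actual types:

```cpp
#include <memory>

// Stand-in for phi::CalculateStreamTimer; illustrative only.
struct StreamTimer {
  double StartTime() const { return 0.0; }
  double EndTime() const { return 0.0; }
};

class Interpreter {
 public:
  double RunTime() const {
    double start = 0.0, end = 0.0;
    // This guard must match the member's guard below: if the uses are
    // widened to HIP but the declaration stays CUDA-only, HIP builds
    // fail with "no member named 'timer_'".
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    start = timer_->StartTime();
    end = timer_->EndTime();
#endif
    return end - start;
  }

 private:
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  std::unique_ptr<StreamTimer> timer_ = std::make_unique<StreamTimer>();
#endif
};
```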
10 changes: 5 additions & 5 deletions paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -191,7 +191,7 @@ FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names,
if (fetch_var) {
auto fetch_list =
std::move(*fetch_var->GetMutable<framework::FetchList>());
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_list.empty(),
true,
@@ -269,7 +269,7 @@ FetchList ProgramInterpreter::Run(
if (fetch_var) {
auto fetch_list =
std::move(*fetch_var->GetMutable<framework::FetchList>());
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_list.empty(),
true,
@@ -533,7 +533,7 @@ void ProgramInterpreter::BuildInplace() {

void ProgramInterpreter::PrepareForCUDAGraphCapture() {
if (!FLAGS_new_executor_use_cuda_graph) return;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
platform::IsCUDAGraphCapturing(),
false,
@@ -579,7 +579,7 @@ void ProgramInterpreter::PrepareForCUDAGraphCapture() {

void ProgramInterpreter::CheckCUDAGraphBeforeRun(
const std::vector<std::string>& feed_names) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(
feed_names.empty(),
@@ -862,7 +862,7 @@ void ProgramInterpreter::BuildOpFuncNode(
auto& op_func_node = nodes[op_idx];
stream_analyzer_.SetForceEventsToWaitInfo(force_events_to_wait_);
auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_new_executor_use_cuda_graph) {
auto& op = op_func_node.operator_base_;
auto& op_type = op->Type();
4 changes: 2 additions & 2 deletions paddle/fluid/framework/parallel_executor.cc
@@ -1416,7 +1416,7 @@ void ParallelExecutor::PreludeToRun(
platform::RecordEvent record_run(
"ParallelExecutor::Run", platform::TracerEventType::UserDefined, 1);
VLOG(3) << "enter ParallelExecutor Run";
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_tensors.empty(),
true,
@@ -1804,7 +1804,7 @@ const ir::Graph &ParallelExecutor::Graph() const {
void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) {
const auto &build_strategy = member_->build_strategy_;
if (!build_strategy.allow_cuda_graph_capture_) return;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
build_strategy.async_mode_,
false,
22 changes: 14 additions & 8 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -2691,7 +2691,7 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() {
int32_tensor.data<int>(),
int32_tensor.numel() * sizeof(int));
} else if (platform::is_gpu_place(tensor->place())) {
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto *dev_ctx = pool.Get(tensor->place());
auto &int32_tensor = *tensor;
if (tensor->dtype() == phi::DataType::INT64) {
@@ -2914,7 +2914,7 @@ bool AnalysisPredictor::LoadParameters() {
}

uint64_t AnalysisPredictor::TryShrinkMemory() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (config_.use_gpu()) {
paddle::platform::EmptyCache();
}
@@ -3607,39 +3607,39 @@ bool InternalUtils::RunWithRuntimeConfig(paddle_infer::Predictor *p,

void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c,
bool with_interleaved) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
c->trt_with_interleaved_ = with_interleaved;
#endif
}

void InternalUtils::SetTransformerPosid(
paddle_infer::Config *c, const std::string &tensorrt_transformer_posid) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
c->tensorrt_transformer_posid_ = tensorrt_transformer_posid;
#endif
}

void InternalUtils::SetTransformerMaskid(
paddle_infer::Config *c, const std::string &tensorrt_transformer_maskid) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
c->tensorrt_transformer_maskid_ = tensorrt_transformer_maskid;
#endif
}

void InternalUtils::DisableTensorRtHalfOps(
paddle_infer::Config *c, const std::unordered_set<std::string> &ops) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
c->trt_ops_run_float_ = ops;
#endif
}

void InternalUtils::SyncStream(paddle_infer::Predictor *p) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto *pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get());
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto *dev_ctx = reinterpret_cast<phi::GPUContext *>(pool.Get(pred->place_));
-  cudaStreamSynchronize(dev_ctx->stream());
+  paddle::gpuStreamSynchronize(dev_ctx->stream());
#endif
}
void InternalUtils::SyncStream(cudaStream_t stream) {
@@ -3648,5 +3648,11 @@ void InternalUtils::SyncStream(cudaStream_t stream) {
#endif
}

+void InternalUtils::SyncStream(hipStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+  hipStreamSynchronize(stream);
+#endif
+}

} // namespace experimental
} // namespace paddle_infer
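The `SyncStream` hunk compiles on both backends because it goes through a backend-neutral `paddle::gpuStreamSynchronize` alias instead of calling `cudaStreamSynchronize` directly. Paddle keeps its real aliases in its GPU compatibility headers; the mapping below is only a sketch of the idea, not Paddle's actual definitions:

```cpp
// Illustrative backend-neutral aliases; the real spellings and locations
// in Paddle's GPU type headers may differ.
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
inline hipError_t gpuStreamSynchronize(gpuStream_t s) {
  return hipStreamSynchronize(s);  // blocks until all work on s finishes
}
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
inline cudaError_t gpuStreamSynchronize(gpuStream_t s) {
  return cudaStreamSynchronize(s);  // blocks until all work on s finishes
}
#endif
```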
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/paddle_api.h
@@ -523,6 +523,7 @@ class PD_INFER_DECL InternalUtils {

static void SyncStream(paddle_infer::Predictor* pred);
static void SyncStream(cudaStream_t stream);
+  static void SyncStream(hipStream_t stream);
template <typename T>
static void CopyFromCpuWithIoStream(paddle_infer::Tensor* t,
const T* data,
20 changes: 13 additions & 7 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -39,8 +39,10 @@
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA)
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
+#elif defined(PADDLE_WITH_HIP)
+#include "paddle/phi/backends/gpu/rocm/hip_graph.h"
#endif

#if CUDA_VERSION >= 10020
@@ -49,6 +51,10 @@
#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#endif

+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/memory/allocation/cuda_malloc_async_allocator.h" // NOLINT
+#endif
#endif

#ifdef PADDLE_WITH_XPU
@@ -107,7 +113,7 @@ namespace paddle {
namespace memory {
namespace allocation {

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class CUDAGraphAllocator
: public Allocator,
public std::enable_shared_from_this<CUDAGraphAllocator> {
@@ -158,7 +164,7 @@ class CUDAGraphAllocator
#endif

static bool IsCUDAGraphCapturing() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing());
#else
return false;
@@ -329,7 +335,7 @@ class AllocatorFacadePrivate {

CheckAllocThreadSafe();

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator
if (!is_stream_safe_cuda_allocator_used_ &&
UNLIKELY(IsCUDAGraphCapturing())) {
@@ -1120,7 +1126,7 @@ class AllocatorFacadePrivate {
allocator = std::make_shared<StatAllocator>(allocator);
}

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void WrapCUDAGraphAllocator() {
for (auto& item : allocators_) {
auto& allocator = item.second;
@@ -1511,7 +1517,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
}

AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// if we use cuda_malloc_async_allocator, we don't need to open a private pool
// for each graph
if (UNLIKELY(IsCUDAGraphCapturing()) &&
@@ -1702,7 +1708,7 @@ void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
}
}

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(int64_t id) {
PADDLE_ENFORCE_EQ(GetAllocatorStrategy(),
AllocatorStrategy::kAutoGrowth,
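`WrapCUDAGraphAllocator()` above swaps each allocator for a graph-aware decorator while a capture is in flight, so allocations made during capture stay valid when the graph replays. A reduced sketch of the decorator idea, with a deliberately simplified interface rather than Paddle's actual `Allocator` API:

```cpp
#include <cstddef>
#include <memory>
#include <vector>

// Simplified allocator interface; Paddle's real Allocator differs.
struct Allocator {
  virtual ~Allocator() = default;
  virtual void* Allocate(std::size_t n) = 0;
  virtual void Free(void* p) = 0;
};

// Decorator that retains every allocation made during graph capture so
// the memory outlives the capture, mirroring a per-graph memory pool.
class GraphAllocator : public Allocator {
 public:
  explicit GraphAllocator(std::shared_ptr<Allocator> underlying)
      : underlying_(std::move(underlying)) {}

  ~GraphAllocator() override {
    // Blocks are released only when the graph (and this wrapper) die.
    for (void* p : retained_) underlying_->Free(p);
  }

  void* Allocate(std::size_t n) override {
    void* p = underlying_->Allocate(n);
    retained_.push_back(p);  // keep alive for the lifetime of the graph
    return p;
  }

  void Free(void* /*p*/) override {
    // Deliberately deferred: see the destructor.
  }

 private:
  std::shared_ptr<Allocator> underlying_;
  std::vector<void*> retained_;
};
```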
4 changes: 2 additions & 2 deletions paddle/fluid/memory/allocation/allocator_facade.h
@@ -95,7 +95,7 @@ class AllocatorFacade {
void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream);
#endif

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void PrepareMemoryPoolForCUDAGraph(int64_t id);
void RemoveMemoryPoolOfCUDAGraph(int64_t id);
#endif
@@ -116,7 +116,7 @@
private:
AllocatorFacade();
AllocatorFacadePrivate* m_;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::unordered_map<int64_t, std::unique_ptr<AllocatorFacadePrivate>>
cuda_graph_map_;
std::unordered_map<int64_t, int64_t> cuda_graph_ref_cnt_;
9 changes: 4 additions & 5 deletions paddle/fluid/memory/allocation/cuda_ipc_allocator.cc
@@ -47,17 +47,16 @@ std::shared_ptr<void> GetIpcBasePtr(std::string handle) {
// The IpcMemHandle can only open once for the same handle,
// so here we cache it here.
void *baseptr = nullptr;
-  auto ipc_handle =
-      reinterpret_cast<const cudaIpcMemHandle_t *>(handle.c_str());
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle(
-      &baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+  auto ipc_handle = reinterpret_cast<const gpuIpcMemHandle_t *>(handle.c_str());
+  PADDLE_ENFORCE_GPU_SUCCESS(gpuIpcOpenMemHandle(
+      &baseptr, *ipc_handle, gpuIpcMemLazyEnablePeerAccess));
// Close ipc handle on the same device.
int device_id = platform::GetCurrentDeviceId();
// Add deleter to close ipc handle.
auto sp = std::shared_ptr<void>(baseptr, [handle, device_id](void *ptr) {
platform::CUDADeviceGuard guard(device_id);
std::lock_guard<std::mutex> lock(ipc_mutex_);
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr));
+    PADDLE_ENFORCE_GPU_SUCCESS(gpuIpcCloseMemHandle(ptr));
ipc_handle_to_baseptr_.erase(handle);
VLOG(6) << "cudaIpcCloseMemHandle for ptr:"
<< "\t" << ptr;
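The IPC hunk builds on both stacks because the `gpuIpc*` names resolve to the matching `cudaIpc*` or `hipIpc*` runtime symbols per backend. An illustrative mapping follows; Paddle's real aliases live in its GPU compatibility headers, so the exact spellings here are assumptions:

```cpp
// Hypothetical backend mapping for the gpuIpc* names used in the diff.
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuIpcMemHandle_t = hipIpcMemHandle_t;
#define gpuIpcOpenMemHandle hipIpcOpenMemHandle
#define gpuIpcCloseMemHandle hipIpcCloseMemHandle
#define gpuIpcMemLazyEnablePeerAccess hipIpcMemLazyEnablePeerAccess
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuIpcMemHandle_t = cudaIpcMemHandle_t;
#define gpuIpcOpenMemHandle cudaIpcOpenMemHandle
#define gpuIpcCloseMemHandle cudaIpcCloseMemHandle
#define gpuIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
#endif
```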
[Diff truncated: the remaining 28 changed files are not shown.]
