diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 9d275b0fd4c2e..355b179599ce9 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -70,7 +70,7 @@ static void RunProgramDescs(const ProgramDescs &programs, FetchResultType ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { strategy_.num_iteration_per_drop_scope_ = std::numeric_limits::max(); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 94ff108f7d61c..30df6f14e366d 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -145,7 +145,7 @@ PirInterpreter::PirInterpreter(const platform::Place& place, << std::chrono::high_resolution_clock::now().time_since_epoch().count(); BuildScope(*ir_block_, ss.str(), value_exe_info_.get()); -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) calculate_stream_timer_ = std::make_unique(place); #endif } @@ -299,7 +299,7 @@ void PirInterpreter::ShareBuildResultsFrom(const InterpreterBaseImpl& src) { std::tuple PirInterpreter::InterpreterRunTime() { double start_time = 0, end_time = 0; -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) start_time = calculate_stream_timer_->StartTime(); end_time = calculate_stream_timer_->EndTime(); #endif @@ -337,7 +337,7 @@ std::shared_ptr PirInterpreter::GetWorkQueue() { void PirInterpreter::PrepareForCUDAGraphCapture() { if (!FLAGS_new_executor_use_cuda_graph) return; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( platform::IsCUDAGraphCapturing(), false, @@ -362,7 +362,7 @@ void PirInterpreter::PrepareForCUDAGraphCapture() { void PirInterpreter::CheckCUDAGraphBeforeRun( const std::vector& feed_names) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ( feed_names.empty(), @@ -1724,7 +1724,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) { try { instr_node->WaitEvent(cur_place); -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (enable_job_schedule_profiler_) { std::string op_name = instr_node->Name(); ::pir::Operation* op = instr_node->Operation(); @@ -1772,7 +1772,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) { } VLOG(5) << "after run kernel"; instr_node->RecordEvent(cur_place); -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (enable_job_schedule_profiler_) { if (instr_node->Id() == last_calculate_instr_id_ && calculate_stream_timer_->IsStarted()) { diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.h b/paddle/fluid/framework/new_executor/pir_interpreter.h index daf6351bb6723..e28e418b9dd95 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.h +++ b/paddle/fluid/framework/new_executor/pir_interpreter.h @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" #include "paddle/pir/include/core/value.h" -#if defined(PADDLE_WITH_CUDA) 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/kernels/autotune/gpu_timer.h" #endif @@ -274,7 +274,7 @@ class PirInterpreter : public InterpreterBaseImpl { // belongs to a parameter and cannot GC. std::unordered_set parameter_var_names_; -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::unique_ptr calculate_stream_timer_; #endif size_t last_calculate_instr_id_; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 67a5c8c9d0b5b..136b8980dee90 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -191,7 +191,7 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, if (fetch_var) { auto fetch_list = std::move(*fetch_var->GetMutable()); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ(fetch_list.empty(), true, @@ -269,7 +269,7 @@ FetchList ProgramInterpreter::Run( if (fetch_var) { auto fetch_list = std::move(*fetch_var->GetMutable()); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ(fetch_list.empty(), true, @@ -533,7 +533,7 @@ void ProgramInterpreter::BuildInplace() { void ProgramInterpreter::PrepareForCUDAGraphCapture() { if (!FLAGS_new_executor_use_cuda_graph) return; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( platform::IsCUDAGraphCapturing(), false, @@ -579,7 +579,7 @@ void ProgramInterpreter::PrepareForCUDAGraphCapture() { void ProgramInterpreter::CheckCUDAGraphBeforeRun( const std::vector& feed_names) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ( feed_names.empty(), @@ -862,7 +862,7 @@ void ProgramInterpreter::BuildOpFuncNode( auto& op_func_node = nodes[op_idx]; stream_analyzer_.SetForceEventsToWaitInfo(force_events_to_wait_); auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_new_executor_use_cuda_graph) { auto& op = op_func_node.operator_base_; auto& op_type = op->Type(); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index c2b6c37e7dd6e..ccf2b718e535e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -1416,7 +1416,7 @@ void ParallelExecutor::PreludeToRun( platform::RecordEvent record_run( "ParallelExecutor::Run", platform::TracerEventType::UserDefined, 1); VLOG(3) << "enter ParallelExecutor Run"; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ(fetch_tensors.empty(), true, @@ -1804,7 +1804,7 @@ const ir::Graph &ParallelExecutor::Graph() const { void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) { const auto &build_strategy = member_->build_strategy_; if (!build_strategy.allow_cuda_graph_capture_) return; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( build_strategy.async_mode_, false, diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 
d09ec702c813c..2ea19823c5f4a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2691,7 +2691,7 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { int32_tensor.data(), int32_tensor.numel() * sizeof(int)); } else if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto *dev_ctx = pool.Get(tensor->place()); auto &int32_tensor = *tensor; if (tensor->dtype() == phi::DataType::INT64) { @@ -2914,7 +2914,7 @@ bool AnalysisPredictor::LoadParameters() { } uint64_t AnalysisPredictor::TryShrinkMemory() { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (config_.use_gpu()) { paddle::platform::EmptyCache(); } @@ -3607,39 +3607,39 @@ bool InternalUtils::RunWithRuntimeConfig(paddle_infer::Predictor *p, void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c, bool with_interleaved) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) c->trt_with_interleaved_ = with_interleaved; #endif } void InternalUtils::SetTransformerPosid( paddle_infer::Config *c, const std::string &tensorrt_transformer_posid) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) c->tensorrt_transformer_posid_ = tensorrt_transformer_posid; #endif } void InternalUtils::SetTransformerMaskid( paddle_infer::Config *c, const std::string &tensorrt_transformer_maskid) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) c->tensorrt_transformer_maskid_ = tensorrt_transformer_maskid; #endif } void InternalUtils::DisableTensorRtHalfOps( paddle_infer::Config *c, const std::unordered_set &ops) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) c->trt_ops_run_float_ = ops; #endif } void InternalUtils::SyncStream(paddle_infer::Predictor *p) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto *pred = dynamic_cast(p->predictor_.get()); paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); auto *dev_ctx = reinterpret_cast(pool.Get(pred->place_)); - cudaStreamSynchronize(dev_ctx->stream()); + paddle::gpuStreamSynchronize(dev_ctx->stream()); #endif } void InternalUtils::SyncStream(cudaStream_t stream) { @@ -3648,5 +3648,11 @@ void InternalUtils::SyncStream(cudaStream_t stream) { #endif } +void InternalUtils::SyncStream(hipStream_t stream) { +#ifdef PADDLE_WITH_HIP + hipStreamSynchronize(stream); +#endif +} + } // namespace experimental } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 8c66b66363603..b6931814ab9e7 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -523,6 +523,7 @@ class PD_INFER_DECL InternalUtils { static void SyncStream(paddle_infer::Predictor* pred); static void SyncStream(cudaStream_t stream); + static void SyncStream(hipStream_t stream); template static void CopyFromCpuWithIoStream(paddle_infer::Tensor* t, const T* data, diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9b30ca8308022..9df64154402e5 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -39,8 +39,10 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include 
"paddle/phi/backends/gpu/gpu_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" #endif #if CUDA_VERSION >= 10020 @@ -49,6 +51,10 @@ #include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" #include "paddle/fluid/platform/dynload/cuda_driver.h" #endif + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/memory/allocation/cuda_malloc_async_allocator.h" // NOLINT +#endif #endif #ifdef PADDLE_WITH_XPU @@ -107,7 +113,7 @@ namespace paddle { namespace memory { namespace allocation { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class CUDAGraphAllocator : public Allocator, public std::enable_shared_from_this { @@ -158,7 +164,7 @@ class CUDAGraphAllocator #endif static bool IsCUDAGraphCapturing() { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing()); #else return false; @@ -329,7 +335,7 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator if (!is_stream_safe_cuda_allocator_used_ && UNLIKELY(IsCUDAGraphCapturing())) { @@ -1120,7 +1126,7 @@ class AllocatorFacadePrivate { allocator = std::make_shared(allocator); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void WrapCUDAGraphAllocator() { for (auto& item : allocators_) { auto& allocator = item.second; @@ -1511,7 +1517,7 @@ AllocatorFacade& AllocatorFacade::Instance() { } AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // if we use cuda_malloc_async_allocator, we don't need to open a private pool // for each graph if (UNLIKELY(IsCUDAGraphCapturing()) && @@ -1702,7 +1708,7 @@ void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place, } } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(int64_t id) { PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index f0f321b887b59..de26eae6eb4ba 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -95,7 +95,7 @@ class AllocatorFacade { void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream); #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void PrepareMemoryPoolForCUDAGraph(int64_t id); void RemoveMemoryPoolOfCUDAGraph(int64_t id); #endif @@ -116,7 +116,7 @@ class AllocatorFacade { private: AllocatorFacade(); AllocatorFacadePrivate* m_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::unordered_map> cuda_graph_map_; std::unordered_map cuda_graph_ref_cnt_; diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc index df62c112681b1..be3f578f4942f 100644 --- a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc @@ -47,17 +47,16 @@ std::shared_ptr GetIpcBasePtr(std::string handle) { // 
The IpcMemHandle can only open once for the same handle, // so here we cache it here. void *baseptr = nullptr; - auto ipc_handle = - reinterpret_cast(handle.c_str()); - PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle( - &baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + auto ipc_handle = reinterpret_cast(handle.c_str()); + PADDLE_ENFORCE_GPU_SUCCESS(gpuIpcOpenMemHandle( + &baseptr, *ipc_handle, gpuIpcMemLazyEnablePeerAccess)); // Close ipc handle on the same device. int device_id = platform::GetCurrentDeviceId(); // Add deleter to close ipc handle. auto sp = std::shared_ptr(baseptr, [handle, device_id](void *ptr) { platform::CUDADeviceGuard guard(device_id); std::lock_guard lock(ipc_mutex_); - PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuIpcCloseMemHandle(ptr)); ipc_handle_to_baseptr_.erase(handle); VLOG(6) << "cudaIpcCloseMemHandle for ptr:" << "\t" << ptr; diff --git a/paddle/fluid/memory/allocation/cuda_malloc_async_allocator.cc b/paddle/fluid/memory/allocation/cuda_malloc_async_allocator.cc index cdc3f60da7c7e..7e0c513f5c81c 100644 --- a/paddle/fluid/memory/allocation/cuda_malloc_async_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_malloc_async_allocator.cc @@ -27,7 +27,11 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" +#if defined(PADDLE_WITH_CUDA) #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" +#endif namespace paddle { namespace memory { @@ -47,11 +51,11 @@ void CUDAMallocAsyncAllocation::RecordStreamWithNoGraphCapturing( if (event_map_.find(stream) == event_map_.end()) { gpuEvent_t event; PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); + gpuEventCreateWithFlags(&event, gpuEventDisableTiming)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuEventRecord(event, stream)); event_map_[stream] = event; } else { - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_map_[stream], stream)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuEventRecord(event_map_[stream], stream)); } } @@ -93,16 +97,16 @@ bool CUDAMallocAsyncAllocation::CanBeFreed(bool synchronize) { for (auto it = event_map_.begin(); it != event_map_.end();) { gpuEvent_t& event = it->second; if (synchronize) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuEventSynchronize(event)); } else { - gpuError_t err = cudaEventQuery(event); - if (err == cudaErrorNotReady) { + gpuError_t err = gpuEventQuery(event); + if (err == gpuErrorNotReady) { VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; return false; } PADDLE_ENFORCE_GPU_SUCCESS(err); } - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuEventDestroy(event)); VLOG(8) << "Destroy event " << event; it = event_map_.erase(it); } @@ -117,7 +121,7 @@ CUDAMallocAsyncAllocator::CUDAMallocAsyncAllocator( place_(place), default_stream_(default_stream) { PADDLE_ENFORCE_GPU_SUCCESS( - cudaStreamCreateWithPriority(&memory_stream_, cudaStreamNonBlocking, 0)); + gpuStreamCreateWithPriority(&memory_stream_, gpuStreamNonBlocking, 0)); } bool CUDAMallocAsyncAllocator::IsAllocThreadSafe() const { return true; } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 
9d82ca6ed1826..dfcb90dffecb1 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -18,8 +18,10 @@ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/backends/gpu/gpu_info.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" #endif namespace paddle { @@ -48,7 +50,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) { [this] { phi::backends::gpu::SetDeviceId(place_.device); }); std::lock_guard lock_guard(outstanding_event_map_lock_); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) { graph_capturing_stream_set_.insert(stream); return; @@ -66,7 +68,7 @@ void StreamSafeCUDAAllocation::EraseStream(gpuStream_t stream) { } bool StreamSafeCUDAAllocation::CanBeFreed() { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) { return graph_capturing_stream_set_.empty() && outstanding_event_map_.empty(); diff --git a/paddle/fluid/operators/cuda_graph_with_in_out.h b/paddle/fluid/operators/cuda_graph_with_in_out.h index 3f65450d30c0e..7547bdd436395 100644 --- a/paddle/fluid/operators/cuda_graph_with_in_out.h +++ b/paddle/fluid/operators/cuda_graph_with_in_out.h @@ -16,21 +16,21 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #endif namespace paddle { namespace operators { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class CUDAGraphWithInOuts { public: template CUDAGraphWithInOuts(Callable &&callable, platform::CUDAPlace place, const std::vector &in_ptrs, - cudaStreamCaptureMode mode, + gpuStreamCaptureMode mode, int64_t pool_id) { in_indices_.resize(in_ptrs.size()); ins_.reserve(in_ptrs.size()); @@ -102,7 +102,7 @@ static std::unique_ptr CaptureCUDAGraph( const framework::ExecutionContext &ctx, const std::vector &input_names, const std::vector &output_names, - cudaStreamCaptureMode mode, + gpuStreamCaptureMode mode, int64_t pool_id) { std::vector inputs; for (const auto &name : input_names) { diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 9e2d1fc4c97fb..6006d7556423c 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -34,7 +34,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_DNNL #include "paddle/fluid/platform/mkldnn_helper.h" #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/operators/cuda_graph_with_in_out.h" #endif #include "paddle/common/flags.h" @@ -196,6 +196,20 @@ static cudaStreamCaptureMode StringToCUDAGraphCaptureMode( "Unsupported CUDA Graph capture mode %s", mode)); } } +#elif defined(PADDLE_WITH_HIP) +static hipStreamCaptureMode StringToCUDAGraphCaptureMode( + const std::string &mode) { + if (mode == "global") { + return hipStreamCaptureModeGlobal; + } else if (mode == "thread_local") { + return hipStreamCaptureModeThreadLocal; + } else if (mode == "relaxed") { + return hipStreamCaptureModeRelaxed; + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Unsupported CUDA Graph capture mode %s", mode)); + } +} #endif } // namespace details @@ -211,7 +225,7 @@ class RunProgramOpKernel : public framework::OpKernel { return; } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto mode = details::StringToCUDAGraphCaptureMode(capture_mode); PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), @@ -408,7 +422,7 @@ class RunProgramGradOpKernel : public framework::OpKernel { return; } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto mode = details::StringToCUDAGraphCaptureMode(capture_mode); PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 5b5efb43f9096..9d522d8b2f0fe 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -25,7 +25,7 @@ COMMON_DECLARE_bool(new_executor_use_cuda_graph); namespace paddle { namespace platform { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void InitCUDNNRelatedHandle(phi::GPUContext* dev_ctx) { dev_ctx->cudnn_workspace_handle().ResetWorkspace(); @@ -82,7 +82,7 @@ phi::DeviceContext* SelectCUDAGraphDeviceContext(phi::GPUPlace place, } void BeginCUDAGraphCapture(phi::GPUPlace place, - cudaStreamCaptureMode mode, + gpuStreamCaptureMode mode, int64_t pool_id) { auto* mutable_dev_ctx = SelectCUDAGraphDeviceContext(place, &pool_id); auto* dev_ctx = reinterpret_cast(mutable_dev_ctx); diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h index c076d33c88682..a1eca67a9ee87 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.h +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/common/macros.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" @@ -23,17 +24,17 @@ namespace paddle { namespace platform { // NOTE: These APIs are not thread-safe. 
-#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using CUDAGraph = phi::backends::gpu::CUDAGraph; void BeginCUDAGraphCapture(phi::GPUPlace place, - cudaStreamCaptureMode mode, + gpuStreamCaptureMode mode, int64_t pool_id = CUDAGraph::kInvalidPoolID); std::unique_ptr EndCUDAGraphCapture(); #endif inline phi::GPUPlace CUDAGraphCapturingPlace() { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return CUDAGraph::CapturingPlace(); #else PADDLE_THROW(phi::errors::Unimplemented( @@ -52,8 +53,8 @@ class SkipCUDAGraphCaptureGuard { public: SkipCUDAGraphCaptureGuard() { -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10010 +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 10010 if (UNLIKELY(CUDAGraph::IsCapturing())) { CUDAGraph::EndSegmentCapture(); } @@ -62,8 +63,8 @@ class SkipCUDAGraphCaptureGuard { } ~SkipCUDAGraphCaptureGuard() { -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10010 +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 10010 if (UNLIKELY(CUDAGraph::IsCapturing())) { CUDAGraph::BeginSegmentCapture(); } diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 8fca9708b4b5d..36189cc7e4c90 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -35,6 +35,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" #else #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" @@ -44,6 +45,8 @@ limitations under the License. */ #if CUDA_VERSION >= 10020 #include "paddle/fluid/platform/dynload/cuda_driver.h" #endif +#else // PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/rocm_driver.h" #endif COMMON_DECLARE_double(fraction_of_gpu_memory_to_use); @@ -256,7 +259,8 @@ class RecordedGpuMallocHelper { * would be clear. 
*/ gpuError_t MallocAsync(void **ptr, size_t size, gpuStream_t stream) { -#if defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) +#if defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) LockGuardPtr lock(mtx_); if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { return gpuErrorOutOfMemory; @@ -264,19 +268,35 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); std::call_once(set_cudamempoolattr_once_flag_, [&]() { +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetDefaultMemPool(&memPool_, dev_id_)); +#else // PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipDeviceGetDefaultMemPool(&memPool_, dev_id_)); +#endif uint64_t thresholdVal = FLAGS_cuda_memory_async_pool_realease_threshold; VLOG(10) << "[cudaMallocAsync] set cudaMemPoolAttrReleaseThreshold to " << thresholdVal; +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaMemPoolSetAttribute(memPool_, cudaMemPoolAttrReleaseThreshold, reinterpret_cast(&thresholdVal))); +#else // PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemPoolSetAttribute(memPool_, + hipMemPoolAttrReleaseThreshold, + reinterpret_cast(&thresholdVal))); +#endif }); gpuError_t result; +#ifdef PADDLE_WITH_CUDA result = cudaMallocAsync(ptr, size, stream); +#else // PADDLE_WITH_HIP + result = hipMallocAsync(ptr, size, stream); +#endif VLOG(10) << "[cudaMallocAsync] ptr = " << (*ptr) << " size = " << static_cast(size) / (1 << 20) << " MB result = " << result << " stream = " << stream; @@ -343,18 +363,23 @@ class RecordedGpuMallocHelper { } void FreeAsync(void *ptr, size_t size, gpuStream_t stream) { -#if defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) +#if defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if // cudaFree succeeds. 
CUDADeviceGuard guard(dev_id_); +#ifdef PADDLE_WITH_CUDA auto err = cudaFreeAsync(ptr, stream); +#else // PADDLE_WITH_HIP + auto err = hipFreeAsync(ptr, stream); +#endif VLOG(10) << "[cudaFreeAsync] ptr = " << ptr << " size =" << static_cast(size) / (1 << 20) << " MB result = " << err << " stream = " << stream; - if (err != cudaErrorCudartUnloading) { + if (err != gpuErrorCudartUnloading) { PADDLE_ENFORCE_GPU_SUCCESS(err); cur_size_.fetch_sub(size); DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); @@ -449,6 +474,27 @@ class RecordedGpuMallocHelper { } #endif +#else // PADDLE_WITH_HIP + hipError_t MemCreate(hipMemGenericAllocationHandle_t *handle, + size_t size, + const hipMemAllocationProp *prop, + unsigned long long flags) { // NOLINT + auto result = + paddle::platform::dynload::hipMemCreate(handle, size, prop, flags); + if (result == hipSuccess) { + cur_size_.fetch_add(size); + } + return result; + } + + hipError_t MemRelease(hipMemGenericAllocationHandle_t handle, size_t size) { + auto result = paddle::platform::dynload::hipMemRelease(handle); + if (result == hipSuccess) { + cur_size_.fetch_sub(size); + } + return result; + } + #endif private: @@ -460,6 +506,10 @@ class RecordedGpuMallocHelper { cudaMemPool_t memPool_; static std::once_flag set_cudamempoolattr_once_flag_; #endif +#if defined(PADDLE_WITH_HIP) + hipMemPool_t memPool_; + static std::once_flag set_cudamempoolattr_once_flag_; +#endif mutable std::unique_ptr mtx_; static std::once_flag once_flag_; @@ -468,7 +518,8 @@ class RecordedGpuMallocHelper { std::once_flag RecordedGpuMallocHelper::once_flag_; -#if defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) +#if defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) std::once_flag RecordedGpuMallocHelper::set_cudamempoolattr_once_flag_; #endif @@ -516,6 +567,21 @@ CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); } #endif +#else // PADDLE_WITH_HIP +hipError_t RecordedGpuMemCreate(hipMemGenericAllocationHandle_t *handle, + size_t size, + const hipMemAllocationProp *prop, + unsigned long long flags, // NOLINT + int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate( + handle, size, prop, flags); +} + +hipError_t RecordedGpuMemRelease(hipMemGenericAllocationHandle_t handle, + size_t size, + int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); +} #endif bool RecordedGpuMemGetInfo(size_t *avail, diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index c9afafdef7166..8a192ba919cad 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -1,5 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -33,11 +32,13 @@ namespace paddle { +// Note(qili93): CUDA Runtime API supported by HIP +// https://github.com/ROCm/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md + #ifdef PADDLE_WITH_HIP #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#else // CDUA - +#else // PADDLE_WITH_CUDA #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif @@ -81,22 +82,22 @@ DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); - +DECLARE_TYPE_FOR_GPU(gpuIpcMemHandle_t, cudaIpcMemHandle_t, hipIpcMemHandle_t); DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); +DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, + cudaStreamCaptureMode, + hipStreamCaptureMode); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); -using CUDAGraphID = unsigned long long; // NOLINT - #undef DECLARE_TYPE_FOR_GPU #ifdef PADDLE_WITH_HIP #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = ROCM_CV; -#else // CDUA - +#else // PADDLE_WITH_CUDA #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif @@ -106,8 +107,64 @@ DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, hipErrorOutOfMemory); DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); +DECLARE_CONSTANT_FOR_GPU(gpuErrorCudartUnloading, + cudaErrorCudartUnloading, + hipErrorDeinitialized); +DECLARE_CONSTANT_FOR_GPU(gpuEventDisableTiming, + cudaEventDisableTiming, + hipEventDisableTiming); +DECLARE_CONSTANT_FOR_GPU(gpuStreamNonBlocking, + cudaStreamNonBlocking, + hipStreamNonBlocking); +DECLARE_CONSTANT_FOR_GPU(gpuIpcMemLazyEnablePeerAccess, + cudaIpcMemLazyEnablePeerAccess, + hipIpcMemLazyEnablePeerAccess); #undef DECLARE_CONSTANT_FOR_GPU -} // namespace paddle +#ifdef PADDLE_WITH_HIP +#define DECLARE_FUNCTION_FOR_GPU(GPU_FUNC, CUDA_FUNC, ROCM_FUNC) \ + const auto GPU_FUNC = ROCM_FUNC; +#else // PADDLE_WITH_CUDA +#define DECLARE_FUNCTION_FOR_GPU(GPU_FUNC, CUDA_FUNC, ROCM_FUNC) \ + const auto GPU_FUNC = CUDA_FUNC; #endif + +DECLARE_FUNCTION_FOR_GPU(gpuStreamCreateWithPriority, + cudaStreamCreateWithPriority, + hipStreamCreateWithPriority); +DECLARE_FUNCTION_FOR_GPU(gpuStreamBeginCapture, + cudaStreamBeginCapture, + hipStreamBeginCapture); +DECLARE_FUNCTION_FOR_GPU(gpuStreamEndCapture, + cudaStreamEndCapture, + hipStreamEndCapture); +DECLARE_FUNCTION_FOR_GPU(gpuStreamGetCaptureInfo, + cudaStreamGetCaptureInfo, + hipStreamGetCaptureInfo); +DECLARE_FUNCTION_FOR_GPU(gpuEventCreateWithFlags, + cudaEventCreateWithFlags, + hipEventCreateWithFlags); +DECLARE_FUNCTION_FOR_GPU(gpuEventRecord, cudaEventRecord, hipEventRecord); +DECLARE_FUNCTION_FOR_GPU(gpuEventDestroy, cudaEventDestroy, hipEventDestroy); +DECLARE_FUNCTION_FOR_GPU(gpuEventQuery, cudaEventQuery, hipEventQuery); +DECLARE_FUNCTION_FOR_GPU(gpuEventSynchronize, + cudaEventSynchronize, + hipEventSynchronize); +DECLARE_FUNCTION_FOR_GPU(gpuStreamSynchronize, + cudaStreamSynchronize, + hipStreamSynchronize); +DECLARE_FUNCTION_FOR_GPU(gpuIpcOpenMemHandle, + cudaIpcOpenMemHandle, + hipIpcOpenMemHandle); +DECLARE_FUNCTION_FOR_GPU(gpuIpcCloseMemHandle, + cudaIpcCloseMemHandle, + hipIpcCloseMemHandle); + +#undef DECLARE_FUNCTION_FOR_GPU 
+ +using CUDAGraphID = unsigned long long; // NOLINT + +} // namespace paddle + +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h index 5c8e18611c40a..5295ffb07c1d1 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.h +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -39,13 +39,33 @@ extern bool HasCUDADriver(); __macro(hipModuleLoadData); \ __macro(hipModuleGetFunction); \ __macro(hipModuleUnload); \ - /*rocm3.5 not support the function*/ \ + /* DTK not support the function*/ \ /* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \ __macro(hipModuleLaunchKernel); \ __macro(hipLaunchKernel); \ __macro(hipGetDevice); \ __macro(hipGetDeviceCount); \ - __macro(hipDevicePrimaryCtxGetState) + __macro(hipDevicePrimaryCtxGetState); \ + __macro(hipDeviceGetAttribute); \ + __macro(hipDeviceGet) + +#define ROCM_ROUTINE_EACH_VVM(__macro) \ + __macro(hipMemGetAllocationGranularity); \ + __macro(hipMemAddressReserve); \ + __macro(hipMemCreate); \ + __macro(hipMemMap); \ + __macro(hipMemSetAccess); \ + __macro(hipMemUnmap); \ + __macro(hipMemRelease); \ + __macro(hipMemAddressFree) + +#define ROCM_ROUTINE_EACH_GPU_GRAPH(__macro) \ + __macro(hipGraphNodeGetType); \ + __macro(hipGraphKernelNodeGetParams); \ + __macro(hipGraphExecKernelNodeSetParams) + +ROCM_ROUTINE_EACH_VVM(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP); +ROCM_ROUTINE_EACH_GPU_GRAPH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP); ROCM_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 86841a177d92e..8747b70414ddc 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -78,7 +78,7 @@ limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/prim/utils/utils.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" #endif #include "paddle/common/macros.h" @@ -978,12 +978,12 @@ PYBIND11_MODULE(libpaddle, m) { #endif m.def("is_cuda_graph_capturing", &platform::IsCUDAGraphCapturing); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) py::class_(m, "CUDAGraph") .def_static("begin_capture", [](platform::CUDAPlace place, int mode) { platform::BeginCUDAGraphCapture( - place, static_cast(mode)); + place, static_cast(mode)); }) .def_static("end_capture", &platform::EndCUDAGraphCapture) .def_static("gen_new_memory_pool_id", diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 50da99217b153..80d5f14e627a3 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -14,7 +14,7 @@ if(WITH_GPU OR WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc gpu/cuda/cuda_graph.cc) endif() if(WITH_ROCM) - list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) + list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc gpu/rocm/hip_graph.cc) endif() endif() diff --git a/paddle/phi/backends/dynload/rccl.cc b/paddle/phi/backends/dynload/rccl.cc index 95e171842527b..ee347af62fb79 100644 --- a/paddle/phi/backends/dynload/rccl.cc +++ b/paddle/phi/backends/dynload/rccl.cc @@ -14,11 +14,20 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/rccl.h" +ncclResult_t ncclCommInitRank2(ncclComm_t* newcomm, + int nranks, + ncclUniqueId commId, + int myrank, + int param) { + // fake impl for compilation + return ncclInvalidUsage; +} + namespace phi { namespace dynload { std::once_flag rccl_dso_flag; -void *rccl_dso_handle; +void* rccl_dso_handle; #define DEFINE_WRAP(__name) DynLoad__##__name __name diff --git a/paddle/phi/backends/dynload/rccl.h b/paddle/phi/backends/dynload/rccl.h index e1018a3f253fa..0123107cd230e 100644 --- a/paddle/phi/backends/dynload/rccl.h +++ b/paddle/phi/backends/dynload/rccl.h @@ -20,6 +20,18 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/backends/dynload/port.h" +#ifdef __cplusplus +extern "C" { +#endif +ncclResult_t ncclCommInitRank2(ncclComm_t* newcomm, + int nranks, + ncclUniqueId commId, + int myrank, + int param); +#ifdef __cplusplus +} +#endif + namespace phi { namespace dynload { @@ -28,15 +40,21 @@ extern void* rccl_dso_handle; #define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \ struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using nccl_func = decltype(&::__name); \ + static auto GetRCCLFunc() { \ + using rccl_func = decltype(&::__name); \ std::call_once(rccl_dso_flag, []() { \ rccl_dso_handle = phi::dynload::GetNCCLDsoHandle(); \ }); \ static void* p_##__name = dlsym(rccl_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ + return reinterpret_cast(p_##__name); \ + } \ + \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + return GetRCCLFunc()(args...); \ } \ + \ + static bool IsValid() { return GetRCCLFunc() != nullptr; } \ }; \ extern DynLoad__##__name __name @@ -44,6 +62,7 @@ extern void* rccl_dso_handle; __macro(ncclCommInitAll); \ __macro(ncclGetUniqueId); \ __macro(ncclCommInitRank); \ + __macro(ncclCommInitRank2); \ __macro(ncclCommAbort); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ diff --git a/paddle/phi/backends/dynload/rocm_driver.h b/paddle/phi/backends/dynload/rocm_driver.h index 4e456db44c904..bd221c3f1e32e 100644 --- a/paddle/phi/backends/dynload/rocm_driver.h +++ b/paddle/phi/backends/dynload/rocm_driver.h @@ -51,13 +51,33 @@ extern bool HasCUDADriver(); __macro(hipModuleLoadData); \ __macro(hipModuleGetFunction); \ __macro(hipModuleUnload); \ - /*rocm3.5 not support the function*/ \ + /* DTK not support the function*/ \ /* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \ __macro(hipModuleLaunchKernel); \ __macro(hipLaunchKernel); \ __macro(hipGetDevice); \ __macro(hipGetDeviceCount); \ - __macro(hipDevicePrimaryCtxGetState) + __macro(hipDevicePrimaryCtxGetState); \ + __macro(hipDeviceGetAttribute); \ + __macro(hipDeviceGet) + +#define ROCM_ROUTINE_EACH_VVM(__macro) \ + __macro(hipMemGetAllocationGranularity); \ + __macro(hipMemAddressReserve); \ + __macro(hipMemCreate); \ + __macro(hipMemMap); \ + __macro(hipMemSetAccess); \ + __macro(hipMemUnmap); \ + __macro(hipMemRelease); \ + __macro(hipMemAddressFree) + +#define ROCM_ROUTINE_EACH_GPU_GRAPH(__macro) \ + __macro(hipGraphNodeGetType); \ + __macro(hipGraphKernelNodeGetParams); \ + __macro(hipGraphExecKernelNodeSetParams) + +ROCM_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); +ROCM_ROUTINE_EACH_GPU_GRAPH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph.cc b/paddle/phi/backends/gpu/cuda/cuda_graph.cc index 
728451f9bde40..43ec0a0c89c08 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_graph.cc +++ b/paddle/phi/backends/gpu/cuda/cuda_graph.cc @@ -301,8 +301,7 @@ void CUDAGraph::PrintToDotFiles(const std::string &dirname, #if CUDA_VERSION >= 11000 void CUDAGraphNodeLauncher::KernelNodeLaunch( - parameterSetter_t parameterSetter, - cudaKernelCallback_t cudakernelCallback) { + parameterSetter_t parameterSetter, gpuKernelCallback_t cudakernelCallback) { if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) { unsigned int id = GenerateIdentifier(); auto cudaFunc = cudakernelCallback(id); @@ -333,7 +332,7 @@ CUDAGraphNodeLauncher::GetParameterSettersForExecGraph(cudaGraph_t graph) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cuGraphKernelNodeGetParams(cuNode, &cuParams)); - CUDAKernelParams kernel_params(cuParams.kernelParams); + gpuKernelParams kernel_params(cuParams.kernelParams); auto kernel = parameterSetters.find(static_cast(cuParams.func)); VLOG(10) << "[GetParameterSettersForExecGraph] cuParams.func = " @@ -350,7 +349,7 @@ CUDAGraphNodeLauncher::GetParameterSettersForExecGraph(cudaGraph_t graph) { auto setter = parameterSetter->second; hooks.emplace_back([setter, cuNode, cuParams]( cudaGraphExec_t exec_graph) { - CUDAKernelParams kernel_params(cuParams.kernelParams); + gpuKernelParams kernel_params(cuParams.kernelParams); setter(kernel_params); PADDLE_ENFORCE_GPU_SUCCESS(dynload::cuGraphExecKernelNodeSetParams( static_cast(exec_graph), cuNode, &cuParams)); @@ -369,7 +368,7 @@ CUDAGraphNodeLauncher::GetParameterSettersForExecGraph(cudaGraph_t graph) { void CUDAGraphNodeLauncher::KernelNodeLaunch( cudaFunction_t cudaFunc, parameterSetter_t parameterSetter, - cudaKernelCallback_t cudakernelCallback) { + gpuKernelCallback_t cudakernelCallback) { cudakernelCallback(0); } diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph.h b/paddle/phi/backends/gpu/cuda/cuda_graph.h index db5e4fcbe2da6..dfc981850ca13 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_graph.h +++ b/paddle/phi/backends/gpu/cuda/cuda_graph.h @@ -95,9 +95,9 @@ class CUDAGraphContextManager { std::set capturing_ctxs_; }; -class CUDAKernelParams { +class gpuKernelParams { public: - explicit CUDAKernelParams(void **params) : kernelParams(params) {} + explicit gpuKernelParams(void **params) : kernelParams(params) {} template T &As(size_t idx) const { @@ -132,20 +132,20 @@ class CUDAGraphNodeLauncher { // Sets the kernel's parameters BEFORE activating the CUDA graph. It enables // dynamic determination and setup of kernel arguments. // - // parameterSetter_t parameterSetter = [saved_state](CUDAKernelParams + // parameterSetter_t parameterSetter = [saved_state](gpuKernelParams // ¶m){ // // Code to compute and the parameter values from the saved_state // // ... // param.As(idx) = calculated_value; // }; - using parameterSetter_t = std::function; + using parameterSetter_t = std::function; // [CUDA Kernel Callback] // Acts as the launcher for the kernel. It accepts an `unsigned int` // identifier and uses it for the kernel launch. // The `cudaGetFuncBySymbol` method can be used to fetch the `cudaFunction_t` // reference of the kernel from the kernel pointer. 
- // cudaKernelCallback_t cudaKernelCallback = [=](unsigned int id) { + // gpuKernelCallback_t cudaKernelCallback = [=](unsigned int id) { // // cudaFunction_t is REQUIRED to get here // cudaFunction_t cudaFunc; // PADDLE_ENFORCE_GPU_SUCCESS(cudaGetFuncBySymbol(&cudaFunc, &kernel)); @@ -153,18 +153,18 @@ class CUDAGraphNodeLauncher { // kernel<<<>>>(id, ...); // Launching the kernel with id // return cudaFunc; // }; - using cudaKernelCallback_t = std::function; + using gpuKernelCallback_t = std::function; // [Kernel Launch] // With the callbacks defined and the CUDA function obtained, the kernel can // be launched using the `KernelNodeLaunch` method. void KernelNodeLaunch(parameterSetter_t parameterSetter, - cudaKernelCallback_t cudakernelCallback); + gpuKernelCallback_t cudakernelCallback); std::vector GetParameterSettersForExecGraph( cudaGraph_t graph); - parameterSetter_t GetParameterSetter(const CUDAKernelParams ¶ms); + parameterSetter_t GetParameterSetter(const gpuKernelParams ¶ms); static CUDAGraphNodeLauncher &Instance() { static CUDAGraphNodeLauncher *launcher = new CUDAGraphNodeLauncher; @@ -185,7 +185,7 @@ class CUDAGraphNodeLauncher { #if CUDA_VERSION >= 10010 static void ThrowErrorIfNotSupportCUDAGraph() {} #else -enum cudaStreamCaptureMode { +enum gpuStreamCaptureMode { cudaStreamCaptureModeGlobal = 0, cudaStreamCaptureModeThreadLocal = 1, cudaStreamCaptureModeRelaxed = 2 @@ -262,7 +262,7 @@ class CUDAGraph { static void BeginCapture(phi::GPUPlace place, cudaStream_t stream, - cudaStreamCaptureMode mode); + gpuStreamCaptureMode mode); static std::unique_ptr EndCapture(); static void BeginSegmentCapture(); @@ -309,7 +309,7 @@ class CUDAGraph { } } - using SetSeedFunc = std::function; + using SetSeedFunc = std::function; static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) { std::lock_guard guard(capturing_graph_->func_mtx_); capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func)); @@ -324,7 +324,7 @@ class CUDAGraph { #if CUDA_VERSION >= 10010 std::vector graphs_; std::vector exec_graphs_; - cudaStreamCaptureMode capture_mode_; + gpuStreamCaptureMode capture_mode_; #endif cudaStream_t stream_{nullptr}; phi::GPUPlace place_; @@ -368,7 +368,7 @@ class CUDAGraphCaptureModeGuard { public: explicit CUDAGraphCaptureModeGuard( - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { + gpuStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { if (UNLIKELY(CUDAGraph::IsCapturing())) { PADDLE_ENFORCE_GPU_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); // After cudaThreadExchangeStreamCaptureMode is called, @@ -385,7 +385,7 @@ class CUDAGraphCaptureModeGuard { } private: - cudaStreamCaptureMode old_mode_; + gpuStreamCaptureMode old_mode_; }; #else class CUDAGraphCaptureModeGuard { @@ -393,7 +393,7 @@ class CUDAGraphCaptureModeGuard { public: explicit CUDAGraphCaptureModeGuard( - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {} + gpuStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {} }; #endif diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h b/paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h index 952dd355882e5..2d5810fbe1c9b 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h +++ b/paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h @@ -17,9 +17,13 @@ #include #include -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/backends/context_pool.h" +#if defined(PADDLE_WITH_CUDA) #include 
"paddle/phi/backends/gpu/cuda/cuda_graph.h" +#else +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" +#endif #include "paddle/phi/kernels/funcs/dropout_impl_util.h" #endif @@ -28,7 +32,7 @@ namespace backends { namespace gpu { inline bool IsCUDAGraphCapturing() { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return CUDAGraph::IsCapturing(); #else return false; @@ -39,7 +43,7 @@ inline bool IsCUDAGraphCapturing() { // Otherwise, invoke callback directly. template inline void AddPostResetCallbackIfCapturingCUDAGraph(Callback &&callback) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (UNLIKELY(IsCUDAGraphCapturing())) { return CUDAGraph::AddPostResetCallbackDuringCapturing( std::forward(callback)); @@ -52,7 +56,7 @@ template inline T *RestoreHostMemIfCapturingCUDAGraph(T *host_mem, size_t size) { static_assert(std::is_trivial::value, "T must be trivial type"); static_assert(!std::is_same::value, "T cannot be void"); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (UNLIKELY(IsCUDAGraphCapturing())) { size_t nbytes = size * sizeof(T); void *new_host_mem = new uint8_t[nbytes]; diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index fe4d6a6623a96..97f34de9a55a6 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -29,6 +29,9 @@ namespace phi { +// Note(qili93): CUDA Runtime API supported by HIP +// https://github.com/ROCm/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md + #ifdef PADDLE_WITH_HIP #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; @@ -50,6 +53,20 @@ DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, miopenActivationMode_t); +DECLARE_TYPE_FOR_GPU(gpuGraph_t, cudaGraph_t, hipGraph_t); +DECLARE_TYPE_FOR_GPU(gpuFunction_t, cudaFunction_t, hipFunction_t); +DECLARE_TYPE_FOR_GPU(gpuGraphExec_t, cudaGraphExec_t, hipGraphExec_t); +DECLARE_TYPE_FOR_GPU(gpuGraphNode_t, cudaGraphNode_t, hipGraphNode_t); +DECLARE_TYPE_FOR_GPU(gpuGraphNodeType, cudaGraphNodeType, hipGraphNodeType); +DECLARE_TYPE_FOR_GPU(gpuKernelNodeParams, + cudaKernelNodeParams, + hipKernelNodeParams); +DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, + cudaStreamCaptureMode, + hipStreamCaptureMode); +DECLARE_TYPE_FOR_GPU(gpuStreamCaptureStatus, + cudaStreamCaptureStatus, + hipStreamCaptureStatus); #undef DECLARE_TYPE_FOR_GPU @@ -76,8 +93,75 @@ DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToHost, DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, cudaMemcpyKind::cudaMemcpyDeviceToDevice, hipMemcpyKind::hipMemcpyDeviceToDevice); +DECLARE_CONSTANT_FOR_GPU(gpuEventDisableTiming, + cudaEventDisableTiming, + hipEventDisableTiming); +DECLARE_CONSTANT_FOR_GPU(gpuStreamNonBlocking, + cudaStreamNonBlocking, + hipStreamNonBlocking); +DECLARE_CONSTANT_FOR_GPU(gpuStreamCaptureModeThreadLocal, + cudaStreamCaptureModeThreadLocal, + hipStreamCaptureModeThreadLocal); +DECLARE_CONSTANT_FOR_GPU(gpuStreamCaptureModeRelaxed, + cudaStreamCaptureModeRelaxed, + hipStreamCaptureModeRelaxed); +DECLARE_CONSTANT_FOR_GPU(gpuStreamCaptureStatusActive, + cudaStreamCaptureStatusActive, + hipStreamCaptureStatusActive); +DECLARE_CONSTANT_FOR_GPU(gpuGraphNodeTypeKernel, + cudaGraphNodeTypeKernel, + hipGraphNodeTypeKernel); #undef DECLARE_CONSTANT_FOR_GPU + +#ifdef PADDLE_WITH_HIP +#define DECLARE_FUNCTION_FOR_GPU(GPU_FUNC, CUDA_FUNC, ROCM_FUNC) \ + const 
auto GPU_FUNC = ROCM_FUNC; +#else // PADDLE_WITH_CUDA +#define DECLARE_FUNCTION_FOR_GPU(GPU_FUNC, CUDA_FUNC, ROCM_FUNC) \ + const auto GPU_FUNC = CUDA_FUNC; +#endif + +DECLARE_FUNCTION_FOR_GPU(gpuGraphGetNodes, cudaGraphGetNodes, hipGraphGetNodes); +DECLARE_FUNCTION_FOR_GPU(gpuGraphGetEdges, cudaGraphGetEdges, hipGraphGetEdges); +DECLARE_FUNCTION_FOR_GPU(gpuGraphLaunch, cudaGraphLaunch, hipGraphLaunch); +DECLARE_FUNCTION_FOR_GPU(gpuGraphDestroy, cudaGraphDestroy, hipGraphDestroy); +DECLARE_FUNCTION_FOR_GPU(gpuGraphExecDestroy, + cudaGraphExecDestroy, + hipGraphExecDestroy); +DECLARE_FUNCTION_FOR_GPU(gpuGraphNodeGetType, + cudaGraphNodeGetType, + hipGraphNodeGetType); +DECLARE_FUNCTION_FOR_GPU(gpuGraphExecKernelNodeSetParams, + cudaGraphExecKernelNodeSetParams, + hipGraphExecKernelNodeSetParams); +DECLARE_FUNCTION_FOR_GPU(gpuGraphKernelNodeGetParams, + cudaGraphKernelNodeGetParams, + hipGraphKernelNodeGetParams); +DECLARE_FUNCTION_FOR_GPU(gpuStreamCreateWithPriority, + cudaStreamCreateWithPriority, + hipStreamCreateWithPriority); +DECLARE_FUNCTION_FOR_GPU(gpuStreamBeginCapture, + cudaStreamBeginCapture, + hipStreamBeginCapture); +DECLARE_FUNCTION_FOR_GPU(gpuStreamEndCapture, + cudaStreamEndCapture, + hipStreamEndCapture); +DECLARE_FUNCTION_FOR_GPU(gpuStreamGetCaptureInfo, + cudaStreamGetCaptureInfo, + hipStreamGetCaptureInfo); +DECLARE_FUNCTION_FOR_GPU(gpuEventCreateWithFlags, + cudaEventCreateWithFlags, + hipEventCreateWithFlags); +DECLARE_FUNCTION_FOR_GPU(gpuEventRecord, cudaEventRecord, hipEventRecord); +DECLARE_FUNCTION_FOR_GPU(gpuEventDestroy, cudaEventDestroy, hipEventDestroy); +DECLARE_FUNCTION_FOR_GPU(gpuEventQuery, cudaEventQuery, hipEventQuery); +DECLARE_FUNCTION_FOR_GPU(gpuEventSynchronize, + cudaEventSynchronize, + hipEventSynchronize); + +#undef DECLARE_FUNCTION_FOR_GPU + } // namespace phi #endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/backends/gpu/rocm/hip_graph.cc b/paddle/phi/backends/gpu/rocm/hip_graph.cc new file mode 100644 index 0000000000000..781cb41ae6983 --- /dev/null +++ b/paddle/phi/backends/gpu/rocm/hip_graph.cc @@ -0,0 +1,365 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" +#include "glog/logging.h" +#include "paddle/common/flags.h" + +COMMON_DECLARE_bool(use_cuda_malloc_async_allocator); +COMMON_DECLARE_bool(auto_free_cudagraph_allocations_on_launch); + +namespace phi { +namespace backends { +namespace gpu { + +std::unique_ptr CUDAGraph::capturing_graph_{nullptr}; +paddle::optional CUDAGraph::capturing_thread_id_{paddle::none}; + +static std::vector ToposortCUDAGraph(hipGraph_t graph) { + size_t num_nodes; + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphGetNodes(graph, nullptr, &num_nodes)); + std::vector nodes(num_nodes); + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphGetNodes(graph, nodes.data(), &num_nodes)); + + size_t num_edges; + PADDLE_ENFORCE_GPU_SUCCESS( + hipGraphGetEdges(graph, nullptr, nullptr, &num_edges)); + std::vector from(num_edges), to(num_edges); + PADDLE_ENFORCE_GPU_SUCCESS( + hipGraphGetEdges(graph, from.data(), to.data(), &num_edges)); + + std::unordered_map> + in_edges, out_edges; + for (auto node : nodes) { + in_edges[node]; + out_edges[node]; + } + + for (size_t i = 0; i < num_edges; ++i) { + in_edges[to[i]].insert(from[i]); + out_edges[from[i]].insert(to[i]); + } + + std::queue q; + for (const auto &pair : in_edges) { + if (pair.second.empty()) { + q.push(pair.first); + } + } + + nodes.clear(); + while (!q.empty()) { + auto cur = q.front(); + q.pop(); + nodes.push_back(cur); + + for (auto out_node : out_edges.at(cur)) { + auto &in_nodes = in_edges.at(out_node); + in_nodes.erase(cur); + if (in_nodes.empty()) { + q.push(out_node); + } + } + } + PADDLE_ENFORCE_EQ( + nodes.size(), + num_nodes, + phi::errors::InvalidArgument("Toposort error, this may be a bug.")); + return nodes; +} + +CUDAGraphID CUDAGraph::UniqueID() { + static std::atomic id; + return id.fetch_add(1); +} + +int64_t CUDAGraph::UniqueMemoryPoolID() { + static std::atomic id(CUDAGraph::kDefaultPoolID + 1); + return id.fetch_add(1); +} + +void CUDAGraph::Reset() { + if (is_reset_) return; +#if defined(PADDLE_WITH_HIP) + for (auto graph : graphs_) { + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphDestroy(graph)); + } + graphs_.clear(); + for (auto exec_graph : exec_graphs_) { + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphExecDestroy(exec_graph)); + } + exec_graphs_.clear(); +#endif + // callback should be called in reverse order because the latter added + // callback may rely on the former added callback. 
+ for (auto iter = cudagraph_post_reset_callbacks_.rbegin(); + iter != cudagraph_post_reset_callbacks_.rend(); + ++iter) { + (*iter)(); + } + cudagraph_post_reset_callbacks_.clear(); + is_reset_ = true; +} + +void CUDAGraph::Replay() { +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ(is_reset_, + false, + phi::errors::PermissionDenied( + "Cannot replay the CUDA Graph after reset is called.")); + size_t n = exec_graphs_.size(); + for (size_t i = 0; i < n; ++i) { + if (!is_first_run_) { + for (auto &hook : cudagraph_pre_replay_callbacks_[i]) { + hook(exec_graphs_[i]); + } + } + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphLaunch(exec_graphs_[i], stream_)); + } + is_first_run_ = false; +#endif +} + +void CUDAGraph::BeginSegmentCapture() { + ThrowErrorIfNotSupportCUDAGraph(); +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ(IsCapturing(), + true, + phi::errors::PermissionDenied( + "BeginSegmentCapture should be called when CUDA " + "Graph is capturing.")); + if (IsThreadLocalCapturing()) { + PADDLE_ENFORCE_EQ(IsThisThreadCapturing(), + true, + phi::errors::PermissionDenied( + "When capturing CUDA Graph in the thread local mode, " + "you cannot begin segmented capturing in the thread " + "which is not the one that starts the capturing.")); + } + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamBeginCapture( + capturing_graph_->stream_, capturing_graph_->capture_mode_)); + PADDLE_ENFORCE_EQ( + IsValidCapturing(), + true, + phi::errors::PermissionDenied("CUDA Graph should not be invalidated.")); + VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_ + << ", segment id " << capturing_graph_->graphs_.size() + << ", memory pool id " << capturing_graph_->pool_id_; +#endif +} + +void CUDAGraph::BeginCapture(phi::GPUPlace place, + gpuStream_t stream, + hipStreamCaptureMode mode) { + ThrowErrorIfNotSupportCUDAGraph(); +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ(IsCapturing(), + false, + phi::errors::PermissionDenied( + "CUDA Graph can only captured one by one.")); + PADDLE_ENFORCE_NOT_NULL( + stream, + phi::errors::PermissionDenied( + "CUDA Graph cannot be captured in default CUDA stream 0.")); + capturing_graph_.reset(new CUDAGraph()); + capturing_graph_->place_ = place; + capturing_graph_->stream_ = stream; + capturing_graph_->capture_mode_ = mode; + if (mode == hipStreamCaptureModeThreadLocal) { + capturing_thread_id_ = std::this_thread::get_id(); + VLOG(10) << "Capturing CUDA Graph in thread local mode, thread id: " + << capturing_thread_id_; + } + BeginSegmentCapture(); +#endif +} + +void CUDAGraph::EndSegmentCapture() { + ThrowErrorIfNotSupportCUDAGraph(); +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ( + IsCapturing(), + true, + phi::errors::PermissionDenied("No CUDA Graph is capturing.")); + hipGraph_t graph; + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamEndCapture(capturing_graph_->stream_, &graph)); + auto num_nodes = static_cast(-1); + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphGetNodes(graph, nullptr, &num_nodes)); + if (num_nodes == 0) { + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphDestroy(graph)); + VLOG(10) << "Skip empty CUDA Graph with ID " << capturing_graph_->id_ + << ", segment id " << capturing_graph_->graphs_.size() + << ", memory pool id " << capturing_graph_->pool_id_; + return; + } + + for (auto &cudagraph_post_capture_callback : + capturing_graph_->cudagraph_post_capture_callbacks_) { + cudagraph_post_capture_callback(); + } + capturing_graph_->cudagraph_post_capture_callbacks_.clear(); + + capturing_graph_->cudagraph_pre_replay_callbacks_.emplace_back( + 
CUDAGraphNodeLauncher::Instance().GetParameterSettersForExecGraph(graph)); + + // if forward graph is registered, this graph is a backward graph + // we check whether there is remain blocks that is unreleased by this + hipGraphExec_t exec_graph; + if (FLAGS_use_cuda_malloc_async_allocator && + FLAGS_auto_free_cudagraph_allocations_on_launch) { +#if defined(PADDLE_WITH_HIP) + VLOG(1) << "hipGraphInstantiateFlagAutoFreeOnLaunch is enabled!"; + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphInstantiateWithFlags( + &exec_graph, graph, hipGraphInstantiateFlagAutoFreeOnLaunch)); +#else + PADDLE_THROW(phi::errors::Unimplemented( + "The cudaGraphInstantiateFlagAutoFreeOnLaunch is only supported when " + "CUDA version >= 11.4.0")); +#endif + } else { +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS( + hipGraphInstantiate(&exec_graph, graph, nullptr, nullptr, 0)); +#endif + } + VLOG(10) << "End to capture CUDA Graph with ID " << capturing_graph_->id_ + << ", segment id " << capturing_graph_->graphs_.size() + << ", memory pool id " << capturing_graph_->pool_id_; + capturing_graph_->graphs_.emplace_back(graph); + capturing_graph_->exec_graphs_.emplace_back(exec_graph); +#endif +} + +std::unique_ptr CUDAGraph::EndCapture() { + EndSegmentCapture(); + capturing_thread_id_ = paddle::none; + return std::move(capturing_graph_); +} + +bool CUDAGraph::IsValidCapturing() { +#if defined(PADDLE_WITH_HIP) + if (!IsCapturing()) return false; + hipStreamCaptureStatus status; + CUDAGraphID id; + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamGetCaptureInfo(capturing_graph_->stream_, &status, &id)); + return status == hipStreamCaptureStatusActive; +#else + return false; +#endif +} + +static std::string ConcatPath(const std::string &dirname, + const std::string &filename) { +#ifdef _WIN32 + const std::array kFileSep = {"\\"}; +#else + const std::array kFileSep = {"/"}; +#endif + if (!dirname.empty() && dirname.back() == kFileSep[0]) { + return dirname + filename; + } else { + return dirname + kFileSep.data() + filename; + } +} + +void CUDAGraph::PrintToDotFiles(const std::string &dirname, + unsigned int flags) { + ThrowErrorIfNotSupportCUDAGraph(); + PADDLE_THROW(phi::errors::Unimplemented( + "The print_to_dot_files() method is not supported on ROCm/HIP")); +} + +#if defined(PADDLE_WITH_HIP) +void CUDAGraphNodeLauncher::KernelNodeLaunch( + parameterSetter_t parameterSetter, gpuKernelCallback_t cudakernelCallback) { + if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) { + unsigned int id = GenerateIdentifier(); + auto cudaFunc = cudakernelCallback(id); + + parameterSetters[cudaFunc][id] = parameterSetter; + VLOG(10) << "[KernelNodeLaunch] Launch kernel with cudaFunc = " << cudaFunc + << " id = " << id; + } else { + cudakernelCallback(0); + } +} + +std::vector +CUDAGraphNodeLauncher::GetParameterSettersForExecGraph(hipGraph_t graph) { + size_t num_nodes; + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphGetNodes(graph, nullptr, &num_nodes)); + std::vector nodes(num_nodes); + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphGetNodes(graph, nodes.data(), &num_nodes)); + + std::vector> hooks; + for (auto node : nodes) { + hipGraphNode_t gpuNode = node; + hipGraphNodeType pType; + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphNodeGetType(gpuNode, &pType)); + if (pType == hipGraphNodeTypeKernel) { + hipKernelNodeParams gpuParams; + PADDLE_ENFORCE_GPU_SUCCESS( + gpuGraphKernelNodeGetParams(gpuNode, &gpuParams)); + gpuKernelParams kernel_params(gpuParams.kernelParams); + auto kernel = + parameterSetters.find(static_cast(gpuParams.func)); + VLOG(10) << 
"[GetParameterSettersForExecGraph] gpuParams.func = " + << gpuParams.func; + // There exists a parameter setter + if (kernel != parameterSetters.end()) { + auto launchSequence = kernel->second; + unsigned int id = kernel_params.As(0); + + VLOG(10) << "[GetParameterSettersForExecGraph] Find launch kernel id = " + << id; + auto parameterSetter = launchSequence.find(id); + if (parameterSetter != launchSequence.end()) { + auto setter = parameterSetter->second; + hooks.emplace_back( + [setter, gpuNode, gpuParams](hipGraphExec_t exec_graph) { + gpuKernelParams kernel_params(gpuParams.kernelParams); + setter(kernel_params); + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphExecKernelNodeSetParams( + exec_graph, gpuNode, &gpuParams)); + }); + } else { + PADDLE_THROW( + phi::errors::InvalidArgument("Error: does not find launch id")); + } + } + } + } + + return hooks; +} +#else +void CUDAGraphNodeLauncher::KernelNodeLaunch( + hipFunction_t cudaFunc, + parameterSetter_t parameterSetter, + gpuKernelCallback_t cudakernelCallback) { + cudakernelCallback(0); +} + +std::vector +CUDAGraphNodeLauncher::GetParameterSettersForExecGraph(hipGraph_t graph) { + PADDLE_THROW(phi::errors::Unimplemented( + "CUDAGraphNodeLauncher is only supported when CUDA version >= 11.0")); +} +#endif + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/rocm/hip_graph.h b/paddle/phi/backends/gpu/rocm/hip_graph.h new file mode 100644 index 0000000000000..cb92275227254 --- /dev/null +++ b/paddle/phi/backends/gpu/rocm/hip_graph.h @@ -0,0 +1,393 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <atomic>
+#include <functional>
+#include <future>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/common/errors.h"
+#include "paddle/common/macros.h"
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/backends/device_code.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+namespace backends {
+namespace gpu {
+
+class CUDAGraphContextManager {
+ public:
+  using DeviceContextMap =
+      std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>;
+
+  static CUDAGraphContextManager &Instance() {
+    static CUDAGraphContextManager *cuda_graph_ctx_manager =
+        new CUDAGraphContextManager;
+    return *cuda_graph_ctx_manager;
+  }
+
+  DeviceContext *Get(int64_t pool_id,
+                     const Place &place,
+                     int stream_priority) {
+    std::lock_guard<std::mutex> lk(ctx_mtx_);
+    VLOG(6) << "Get cuda graph device context for " << place;
+
+    DeviceContextMap &ctxs = cuda_graph_ctx_pool_[pool_id];
+    if (ctxs.find(place) == ctxs.end()) {
+      phi::memory_utils::EmplaceDeviceContexts(
+          &ctxs,
+          {place},
+          /*disable_setting_default_stream_for_allocator=*/true,
+          stream_priority);
+    }
+    return ctxs[place].get().get();
+  }
+
+  void RecordCapturingDeviceContext(DeviceContext *dev_ctx) {
+    capturing_ctxs_.insert(dev_ctx);
+  }
+
+  std::set<DeviceContext *> GetAllCapturingDeviceContexts() const {
+    return capturing_ctxs_;
+  }
+
+  void ClearDeviceContextsRecords() { capturing_ctxs_.clear(); }
+
+ private:
+  CUDAGraphContextManager() {}
+  DISABLE_COPY_AND_ASSIGN(CUDAGraphContextManager);
+
+  std::mutex ctx_mtx_;
+  std::unordered_map<int64_t, DeviceContextMap> cuda_graph_ctx_pool_;
+  std::set<DeviceContext *> capturing_ctxs_;
+};
+
+class gpuKernelParams {
+ public:
+  explicit gpuKernelParams(void **params) : kernelParams(params) {}
+
+  template <typename T>
+  T &As(size_t idx) const {
+    return *reinterpret_cast<T *>(kernelParams[idx]);
+  }
+
+  void **getParams() const { return kernelParams; }
+
+ private:
+  void **kernelParams;
+};
+
+using cudaGraphExecuterSetter_t = std::function<void(hipGraphExec_t)>;
+
+// ** class CUDAGraphNodeLauncher
+//
+// This class offers an interface for launching CUDA kernels inside a CUDA
+// Graph; it relies on `cudaGraphExecKernelNodeSetParams`
+// (hipGraphExecKernelNodeSetParams on ROCm) for parameter setup.
+// Launching kernels via this class ensures proper management.
+//
+// NOTE: It's essential that the first parameter for any kernel launched
+// through this class is an `unsigned int` identifier. This identifier plays a
+// crucial role in linking the CUDA kernel to its corresponding CUDA graph
+// node. We tag each kernel launch with a unique identifier to maintain
+// structured linkage with its CUDA graph node.
+//
+// NOTE: This class uses a singleton design pattern to ensure there is only a
+// single global instance, accessible via the `Instance()` method.
+class CUDAGraphNodeLauncher {
+ public:
+  // [Parameter Setter Callback]
+  // Sets the kernel's parameters BEFORE activating the CUDA graph. It enables
+  // dynamic determination and setup of kernel arguments.
+  //
+  //   parameterSetter_t parameterSetter = [saved_state](gpuKernelParams
+  //                                                         &param) {
+  //     // Code to compute and set the parameter values from the saved_state
+  //     // ...
+  //     param.As<T>(idx) = calculated_value;
+  //   };
+  using parameterSetter_t = std::function<void(gpuKernelParams &)>;
+
+  // [CUDA Kernel Callback]
+  // Acts as the launcher for the kernel. It accepts an `unsigned int`
+  // identifier and uses it for the kernel launch.
+  // The `cudaGetFuncBySymbol` method can be used to fetch the `cudaFunction_t`
+  // reference of the kernel from the kernel pointer.
+  //   gpuKernelCallback_t cudaKernelCallback = [=](unsigned int id) {
+  //     // cudaFunction_t is REQUIRED to get here
+  //     cudaFunction_t cudaFunc;
+  //     PADDLE_ENFORCE_GPU_SUCCESS(cudaGetFuncBySymbol(&cudaFunc, &kernel));
+  //
+  //     kernel<<<>>>(id, ...);  // Launching the kernel with id
+  //     return cudaFunc;
+  //   };
+  using gpuKernelCallback_t = std::function<hipFunction_t(unsigned int)>;
+
+  // [Kernel Launch]
+  // With the callbacks defined and the CUDA function obtained, the kernel can
+  // be launched using the `KernelNodeLaunch` method.
+  void KernelNodeLaunch(parameterSetter_t parameterSetter,
+                        gpuKernelCallback_t cudakernelCallback);
+
+  std::vector<cudaGraphExecuterSetter_t> GetParameterSettersForExecGraph(
+      hipGraph_t graph);
+
+  parameterSetter_t GetParameterSetter(const gpuKernelParams &params);
+
+  static CUDAGraphNodeLauncher &Instance() {
+    static CUDAGraphNodeLauncher *launcher = new CUDAGraphNodeLauncher;
+    return *launcher;
+  }
+
+ private:
+  CUDAGraphNodeLauncher() : id(0) {}
+  DISABLE_COPY_AND_ASSIGN(CUDAGraphNodeLauncher);
+
+  unsigned int GenerateIdentifier() { return id++; }
+
+  unsigned int id;
+  std::unordered_map<hipFunction_t, std::map<unsigned int, parameterSetter_t>>
+      parameterSetters;
+};
+
+#if defined(PADDLE_WITH_HIP)
+static void ThrowErrorIfNotSupportCUDAGraph() {}
+#else
+enum gpuStreamCaptureMode {
+  hipStreamCaptureModeGlobal = 0,
+  hipStreamCaptureModeThreadLocal = 1,
+  hipStreamCaptureModeRelaxed = 2
+};
+static void ThrowErrorIfNotSupportCUDAGraph() {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "CUDA Graph is only supported when CUDA version >= 10.1"));
+}
+#endif
+
+using CUDAGraphID = unsigned long long;  // NOLINT
+
+// NOTE: Currently, we do not support capturing CUDA Graphs in parallel.
+// NOTE: Do not use this class directly because it should be used with
+//       the memory pool.
+class CUDAGraph {
+  DISABLE_COPY_AND_ASSIGN(CUDAGraph);
+
+  // Since the constructor would throw an error if CUDA_VERSION < 10010,
+  // the non-static methods of CUDAGraph need not check CUDA_VERSION
+  // again.
+ CUDAGraph() { + ThrowErrorIfNotSupportCUDAGraph(); + id_ = UniqueID(); + } + + public: + static constexpr int64_t kDefaultPoolID = 0; + static constexpr int64_t kInvalidPoolID = -1; + + ~CUDAGraph() { Reset(); } + + CUDAGraphID ID() const { return id_; } + + static int64_t SetMemoryPoolID(int64_t pool_id) { + auto &pool_id_ = capturing_graph_->pool_id_; + PADDLE_ENFORCE_EQ( + pool_id_, + kInvalidPoolID, + phi::errors::InvalidArgument("Cannot reset memory pool id twice, the " + "former memory pool id is %d.", + pool_id_)); + if (pool_id <= kInvalidPoolID) { + pool_id_ = UniqueMemoryPoolID(); + } else { + PADDLE_ENFORCE_GE( + pool_id, + kDefaultPoolID, + phi::errors::InvalidArgument("Invalid memory pool id %d.", pool_id)); + pool_id_ = pool_id; + } + return pool_id_; + } + + int64_t PoolID() const { return pool_id_; } + + static int64_t CapturingPoolID() { return capturing_graph_->pool_id_; } + + void Replay(); + + void Reset(); + + void AddPostResetCallback(std::function callback) { + std::lock_guard guard(mtx_); + cudagraph_post_reset_callbacks_.push_back(std::move(callback)); + } + + void AddPostCaptureCallback(std::function callback) { + std::lock_guard guard(mtx_); + cudagraph_post_capture_callbacks_.push_back(std::move(callback)); + } + + void PrintToDotFiles(const std::string &dirname, unsigned int flags); + + static void BeginCapture(phi::GPUPlace place, + gpuStream_t stream, + gpuStreamCaptureMode mode); + static std::unique_ptr EndCapture(); + + static void BeginSegmentCapture(); + static void EndSegmentCapture(); + + static void AddPostResetCallbackDuringCapturing( + std::function callback) { + capturing_graph_->AddPostResetCallback(std::move(callback)); + } + + static void AddPostCaptureCallbackDuringCapturing( + std::function callback) { + capturing_graph_->AddPostCaptureCallback(std::move(callback)); + } + + // No need to add CUDA_VERSION macro because capturing_graph_ would + // always be nullptr (constructor throws error) + static bool IsCapturing() { return capturing_graph_ != nullptr; } + + static CUDAGraphID CapturingID() { return capturing_graph_->id_; } + + static phi::GPUPlace CapturingPlace() { return capturing_graph_->place_; } + + // This API can be used to debug which GPU operation is not + // supported during capturing CUDA Graph. + static bool IsValidCapturing(); + + static bool IsThreadLocalCapturing() { +#if defined(PADDLE_WITH_HIP) + return IsCapturing() && + capturing_graph_->capture_mode_ == hipStreamCaptureModeThreadLocal; +#else + return false; +#endif + } + + static bool IsThisThreadCapturing() { + if (UNLIKELY(IsCapturing())) { + return IsThreadLocalCapturing() + ? capturing_thread_id_.get() == std::this_thread::get_id() + : true; + } else { + return false; + } + } + + using SetSeedFunc = std::function; + static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) { + std::lock_guard guard(capturing_graph_->func_mtx_); + capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func)); + } + + static int64_t UniqueMemoryPoolID(); + + private: + static CUDAGraphID UniqueID(); + + private: +#if defined(PADDLE_WITH_HIP) + std::vector graphs_; + std::vector exec_graphs_; + gpuStreamCaptureMode capture_mode_; +#endif + gpuStream_t stream_{nullptr}; + phi::GPUPlace place_; + CUDAGraphID id_; + int64_t pool_id_{kInvalidPoolID}; + bool is_reset_{false}; + std::mutex mtx_; + + std::vector set_seed_funcs_; + + // Holds callbacks that are triggered after the CUDA graph is reset. 
These + // callbacks are used for operations that need to be performed following the + // reset of a CUDA graph. + std::vector> cudagraph_post_reset_callbacks_; + + // Contains callbacks that are invoked after the CUDA graph has been captured. + // These callbacks are crucial for managing memory allocations related to the + // CUDA graph. They ensure that memory blocks not associated with a graph (as + // detailed in cuda_malloc_async_allocator) are not erroneously released + // during the graph's lifecycle. + std::vector> cudagraph_post_capture_callbacks_; + + // Maintains a collection of 'pre-hooks' - functions that are executed before + // the CUDA graph is replayed. These pre-hooks are essential for setting up + // the necessary conditions or states required for the correct execution of + // the CUDA graph. + std::vector> + cudagraph_pre_replay_callbacks_; + + std::mutex func_mtx_; + + bool is_first_run_{true}; + + static paddle::optional capturing_thread_id_; + static std::unique_ptr capturing_graph_; +}; + +#if defined(PADDLE_WITH_HIP) +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard( + gpuStreamCaptureMode mode = hipStreamCaptureModeRelaxed) { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_GPU_SUCCESS(hipThreadExchangeStreamCaptureMode(&mode)); + // After cudaThreadExchangeStreamCaptureMode is called, + // the variable "mode" would be set to the old capturing mode. + old_mode_ = mode; + } + } + + ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_GPU_SUCCESS( + hipThreadExchangeStreamCaptureMode(&old_mode_)); + } + } + + private: + gpuStreamCaptureMode old_mode_; +}; +#else +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard( + gpuStreamCaptureMode mode = hipStreamCaptureModeRelaxed) {} +}; +#endif + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index edc23479c9238..b8ddea98b5c9e 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -173,7 +173,7 @@ int GetCurrentDeviceId() { return device_id; } -std::array GetGpuMaxGridDimSize(int id) { +std::array GetGpuMaxGridDimSize(int id) { PADDLE_ENFORCE_LT( id, GetGPUDeviceCount(), @@ -181,7 +181,7 @@ std::array GetGpuMaxGridDimSize(int id) { "but received id is: %d. 
GPU count is: %d.", id, GetGPUDeviceCount())); - std::array ret; + std::array ret; int size; auto error_code_x = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index 6169681885b7b..6cf80c350cd04 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -14,8 +14,10 @@ #include "paddle/phi/core/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" #endif #include "paddle/phi/core/dense_tensor.h" @@ -70,7 +72,7 @@ struct DeviceContext::Impl { pinned_allocator_ = allocator; } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void SetCUDAGraphAllocator(const Allocator* allocator) { // NOTE (Yuang): cuda graph allocator can be set to nullptr, so don't check // validation of the allocator here @@ -163,7 +165,7 @@ struct DeviceContext::Impl { (fake_alloc || tensor->numel() == 0) && requested_size == 0 ? zero_allocator_ : (pinned ? pinned_allocator_ : device_allocator_); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool must_cuda_graph_allocator = (!fake_alloc && tensor->numel() != 0) && !pinned; if (must_cuda_graph_allocator && @@ -289,7 +291,7 @@ struct DeviceContext::Impl { const Allocator* zero_allocator_{nullptr}; const Allocator* host_zero_allocator_{nullptr}; const Allocator* pinned_allocator_{nullptr}; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const Allocator* cuda_graph_allocator_{nullptr}; #endif Generator* device_generator_{nullptr}; @@ -309,7 +311,7 @@ DeviceContext::DeviceContext(const DeviceContext& other) { impl_->SetPinnedAllocator(&other.GetPinnedAllocator()); impl_->SetHostGenerator(other.GetHostGenerator()); impl_->SetGenerator(other.GetGenerator()); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (other.IsCUDAGraphAllocatorValid()) { impl_->SetCUDAGraphAllocator(&other.GetCUDAGraphAllocator()); } @@ -340,7 +342,7 @@ const Allocator& DeviceContext::GetHostAllocator() const { return impl_->GetHostAllocator(); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void DeviceContext::SetCUDAGraphAllocator(const Allocator* allocator) { impl_->SetCUDAGraphAllocator(allocator); } diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index 25d748c915086..9ead0e2c32b23 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -115,7 +115,7 @@ class PADDLE_API DeviceContext { const Allocator& GetPinnedAllocator() const; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * @brief Set the CUDA graph Allocator object. 
* diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 80d61ebc9a9a6..304fd3cef793a 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -213,6 +213,7 @@ if(WITH_ROCM) "gpu/put_along_axis_grad_kernel.cu" "gpu/put_along_axis_kernel.cu" "gpu/qr_kernel.cu" + "gpu/rms_norm_grad_kernel.cu" "gpu/svd_kernel.cu" "gpudnn/mha_cudnn_frontend.cu" "fusion/gpu/block_multi_head_attention_kernel.cu" diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index 03bc6ca85efed..463272a37c00d 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -368,7 +368,7 @@ void DropoutFwGPUKernelDriver( phi::backends::gpu::CUDAGraphNodeLauncher::parameterSetter_t parameterSetter = [offset, dev_ctx_p, state_index, is_fix_seed]( - phi::backends::gpu::CUDAKernelParams& params) { + phi::backends::gpu::gpuKernelParams& params) { if (!is_fix_seed) { // we assume seed is null pointer // seed copy to cpu is meaningless here @@ -389,7 +389,7 @@ void DropoutFwGPUKernelDriver( } }; - phi::backends::gpu::CUDAGraphNodeLauncher::cudaKernelCallback_t + phi::backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t cudaKernelCallback = [=](unsigned int id) { void* functionPtr = reinterpret_cast(&(VectorizedRandomGenerator)); diff --git a/paddle/phi/kernels/funcs/segmented_array.h b/paddle/phi/kernels/funcs/segmented_array.h index e6ecb9819e505..4b4b1b59db66e 100644 --- a/paddle/phi/kernels/funcs/segmented_array.h +++ b/paddle/phi/kernels/funcs/segmented_array.h @@ -118,7 +118,7 @@ struct ArraySetterBase { phi::Stream(reinterpret_cast(ctx.stream()))); int8_t* restored = reinterpret_cast(src); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (use_cuda_graph) { restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( restored, num_bytes); diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu index ff6380ceeec0a..801f070251fb2 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu @@ -218,7 +218,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx, // seed_offset_data should preserved by cudaGraph pool const phi::GPUContext* dev_ctx_p = &dev_ctx; auto parameterSetter = [offset, dev_ctx_p, seed_offset]( - phi::backends::gpu::CUDAKernelParams& params) { + phi::backends::gpu::gpuKernelParams& params) { const auto* seed_offset_data = seed_offset.data(); const uint64_t seed_data = static_cast(seed_offset_data[0]); const uint64_t increment = static_cast(seed_offset_data[1]); @@ -229,7 +229,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx, << ", increment = " << increment; }; - phi::backends::gpu::CUDAGraphNodeLauncher::cudaKernelCallback_t + phi::backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t cudaKernelCallback = [=](unsigned int id) { void* functionPtr = reinterpret_cast( &(VectorizedDropoutBackward>)); diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu index 5ec23e777211b..c95c5fbf0ca3d 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu @@ -211,7 +211,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx, seed_offset_data, state_index, seed_tensor_ptr, - 
fix_seed](phi::backends::gpu::CUDAKernelParams& params) { + fix_seed](phi::backends::gpu::gpuKernelParams& params) { if (!fix_seed) { auto gen_cuda = dev_ctx_p->GetGenerator(); // ensure the generator use correct state index @@ -233,7 +233,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx, seed_offset_data[1] = static_cast(increment); } }; - phi::backends::gpu::CUDAGraphNodeLauncher::cudaKernelCallback_t + phi::backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t cudaKernelCallback = [=](unsigned int id) { void* functionPtr = reinterpret_cast( &(VectorizedDropoutForward>));
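For context on how these renamed types are used: the dropout call sites above follow the CUDAGraphNodeLauncher contract from hip_graph.h, where the launcher-issued id is the kernel's first argument, the gpuKernelCallback_t performs the actual launch and returns the kernel's function handle, and the parameterSetter_t patches arguments before every replay. The sketch below is illustrative only: `MyDropoutKernel`, `LookupKernelHandle`, the launch configuration, and the argument indices are assumptions, not part of the patch.

    #include <cstdint>

    #include "paddle/phi/backends/gpu/rocm/hip_graph.h"

    namespace gpu = phi::backends::gpu;

    // Hypothetical kernel; the launcher contract requires the launcher-issued
    // id to be the first parameter.
    __global__ void MyDropoutKernel(unsigned int id, uint64_t seed,
                                    uint64_t increment, float *out, int n);

    // Assumed helper mapping a kernel pointer to its hipFunction_t handle
    // (the CUDA path obtains this via cudaGetFuncBySymbol).
    hipFunction_t LookupKernelHandle(const void *kernel_ptr);

    // `seed_offset_data` points to host memory holding {seed, increment}; it
    // is refreshed elsewhere between replays, like in the dropout kernels.
    void LaunchDropoutForGraph(const phi::GPUContext &ctx,
                               const int64_t *seed_offset_data,
                               float *out,
                               int n) {
      auto parameter_setter = [seed_offset_data](gpu::gpuKernelParams &params) {
        // Patch kernel arguments 1 and 2 (seed, increment) before each replay.
        params.As<uint64_t>(1) = static_cast<uint64_t>(seed_offset_data[0]);
        params.As<uint64_t>(2) = static_cast<uint64_t>(seed_offset_data[1]);
      };

      auto kernel_callback = [=, &ctx](unsigned int id) -> hipFunction_t {
        MyDropoutKernel<<<1, 256, 0, ctx.stream()>>>(
            id,
            static_cast<uint64_t>(seed_offset_data[0]),
            static_cast<uint64_t>(seed_offset_data[1]),
            out,
            n);
        return LookupKernelHandle(
            reinterpret_cast<const void *>(&MyDropoutKernel));
      };

      gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameter_setter,
                                                              kernel_callback);
    }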