diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 9d275b0fd4c2e..355b179599ce9 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -70,7 +70,7 @@ static void RunProgramDescs(const ProgramDescs &programs, FetchResultType ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { strategy_.num_iteration_per_drop_scope_ = std::numeric_limits::max(); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 94ff108f7d61c..30df6f14e366d 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -145,7 +145,7 @@ PirInterpreter::PirInterpreter(const platform::Place& place, << std::chrono::high_resolution_clock::now().time_since_epoch().count(); BuildScope(*ir_block_, ss.str(), value_exe_info_.get()); -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) calculate_stream_timer_ = std::make_unique(place); #endif } @@ -299,7 +299,7 @@ void PirInterpreter::ShareBuildResultsFrom(const InterpreterBaseImpl& src) { std::tuple PirInterpreter::InterpreterRunTime() { double start_time = 0, end_time = 0; -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) start_time = calculate_stream_timer_->StartTime(); end_time = calculate_stream_timer_->EndTime(); #endif @@ -337,7 +337,7 @@ std::shared_ptr PirInterpreter::GetWorkQueue() { void PirInterpreter::PrepareForCUDAGraphCapture() { if (!FLAGS_new_executor_use_cuda_graph) return; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( platform::IsCUDAGraphCapturing(), false, @@ -362,7 +362,7 @@ void PirInterpreter::PrepareForCUDAGraphCapture() { void PirInterpreter::CheckCUDAGraphBeforeRun( const std::vector& feed_names) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ( feed_names.empty(), @@ -1724,7 +1724,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) { try { instr_node->WaitEvent(cur_place); -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (enable_job_schedule_profiler_) { std::string op_name = instr_node->Name(); ::pir::Operation* op = instr_node->Operation(); @@ -1772,7 +1772,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) { } VLOG(5) << "after run kernel"; instr_node->RecordEvent(cur_place); -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (enable_job_schedule_profiler_) { if (instr_node->Id() == last_calculate_instr_id_ && calculate_stream_timer_->IsStarted()) { diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.h b/paddle/fluid/framework/new_executor/pir_interpreter.h index daf6351bb6723..e28e418b9dd95 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.h +++ b/paddle/fluid/framework/new_executor/pir_interpreter.h @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" #include "paddle/pir/include/core/value.h" -#if defined(PADDLE_WITH_CUDA) 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/kernels/autotune/gpu_timer.h" #endif @@ -274,7 +274,7 @@ class PirInterpreter : public InterpreterBaseImpl { // belongs to a parameter and cannot GC. std::unordered_set parameter_var_names_; -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::unique_ptr calculate_stream_timer_; #endif size_t last_calculate_instr_id_; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 67a5c8c9d0b5b..136b8980dee90 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -191,7 +191,7 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, if (fetch_var) { auto fetch_list = std::move(*fetch_var->GetMutable()); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ(fetch_list.empty(), true, @@ -269,7 +269,7 @@ FetchList ProgramInterpreter::Run( if (fetch_var) { auto fetch_list = std::move(*fetch_var->GetMutable()); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ(fetch_list.empty(), true, @@ -533,7 +533,7 @@ void ProgramInterpreter::BuildInplace() { void ProgramInterpreter::PrepareForCUDAGraphCapture() { if (!FLAGS_new_executor_use_cuda_graph) return; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( platform::IsCUDAGraphCapturing(), false, @@ -579,7 +579,7 @@ void ProgramInterpreter::PrepareForCUDAGraphCapture() { void ProgramInterpreter::CheckCUDAGraphBeforeRun( const std::vector& feed_names) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ( feed_names.empty(), @@ -862,7 +862,7 @@ void ProgramInterpreter::BuildOpFuncNode( auto& op_func_node = nodes[op_idx]; stream_analyzer_.SetForceEventsToWaitInfo(force_events_to_wait_); auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_new_executor_use_cuda_graph) { auto& op = op_func_node.operator_base_; auto& op_type = op->Type(); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index c2b6c37e7dd6e..ccf2b718e535e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -1416,7 +1416,7 @@ void ParallelExecutor::PreludeToRun( platform::RecordEvent record_run( "ParallelExecutor::Run", platform::TracerEventType::UserDefined, 1); VLOG(3) << "enter ParallelExecutor Run"; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ(fetch_tensors.empty(), true, @@ -1804,7 +1804,7 @@ const ir::Graph &ParallelExecutor::Graph() const { void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) { const auto &build_strategy = member_->build_strategy_; if (!build_strategy.allow_cuda_graph_capture_) return; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( build_strategy.async_mode_, false, diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 
d09ec702c813c..2ea19823c5f4a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2691,7 +2691,7 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { int32_tensor.data(), int32_tensor.numel() * sizeof(int)); } else if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto *dev_ctx = pool.Get(tensor->place()); auto &int32_tensor = *tensor; if (tensor->dtype() == phi::DataType::INT64) { @@ -2914,7 +2914,7 @@ bool AnalysisPredictor::LoadParameters() { } uint64_t AnalysisPredictor::TryShrinkMemory() { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (config_.use_gpu()) { paddle::platform::EmptyCache(); } @@ -3607,39 +3607,39 @@ bool InternalUtils::RunWithRuntimeConfig(paddle_infer::Predictor *p, void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c, bool with_interleaved) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) c->trt_with_interleaved_ = with_interleaved; #endif } void InternalUtils::SetTransformerPosid( paddle_infer::Config *c, const std::string &tensorrt_transformer_posid) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) c->tensorrt_transformer_posid_ = tensorrt_transformer_posid; #endif } void InternalUtils::SetTransformerMaskid( paddle_infer::Config *c, const std::string &tensorrt_transformer_maskid) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) c->tensorrt_transformer_maskid_ = tensorrt_transformer_maskid; #endif } void InternalUtils::DisableTensorRtHalfOps( paddle_infer::Config *c, const std::unordered_set &ops) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) c->trt_ops_run_float_ = ops; #endif } void InternalUtils::SyncStream(paddle_infer::Predictor *p) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto *pred = dynamic_cast(p->predictor_.get()); paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); auto *dev_ctx = reinterpret_cast(pool.Get(pred->place_)); - cudaStreamSynchronize(dev_ctx->stream()); + paddle::gpuStreamSynchronize(dev_ctx->stream()); #endif } void InternalUtils::SyncStream(cudaStream_t stream) { @@ -3648,5 +3648,11 @@ void InternalUtils::SyncStream(cudaStream_t stream) { #endif } +void InternalUtils::SyncStream(hipStream_t stream) { +#ifdef PADDLE_WITH_HIP + hipStreamSynchronize(stream); +#endif +} + } // namespace experimental } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 8c66b66363603..b6931814ab9e7 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -523,6 +523,7 @@ class PD_INFER_DECL InternalUtils { static void SyncStream(paddle_infer::Predictor* pred); static void SyncStream(cudaStream_t stream); + static void SyncStream(hipStream_t stream); template static void CopyFromCpuWithIoStream(paddle_infer::Tensor* t, const T* data, diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9b30ca8308022..9df64154402e5 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -39,8 +39,10 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include 
"paddle/phi/backends/gpu/gpu_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" #endif #if CUDA_VERSION >= 10020 @@ -49,6 +51,10 @@ #include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" #include "paddle/fluid/platform/dynload/cuda_driver.h" #endif + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/memory/allocation/cuda_malloc_async_allocator.h" // NOLINT +#endif #endif #ifdef PADDLE_WITH_XPU @@ -107,7 +113,7 @@ namespace paddle { namespace memory { namespace allocation { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class CUDAGraphAllocator : public Allocator, public std::enable_shared_from_this { @@ -158,7 +164,7 @@ class CUDAGraphAllocator #endif static bool IsCUDAGraphCapturing() { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing()); #else return false; @@ -329,7 +335,7 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator if (!is_stream_safe_cuda_allocator_used_ && UNLIKELY(IsCUDAGraphCapturing())) { @@ -1120,7 +1126,7 @@ class AllocatorFacadePrivate { allocator = std::make_shared(allocator); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void WrapCUDAGraphAllocator() { for (auto& item : allocators_) { auto& allocator = item.second; @@ -1511,7 +1517,7 @@ AllocatorFacade& AllocatorFacade::Instance() { } AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // if we use cuda_malloc_async_allocator, we don't need to open a private pool // for each graph if (UNLIKELY(IsCUDAGraphCapturing()) && @@ -1702,7 +1708,7 @@ void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place, } } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(int64_t id) { PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index f0f321b887b59..de26eae6eb4ba 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -95,7 +95,7 @@ class AllocatorFacade { void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream); #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void PrepareMemoryPoolForCUDAGraph(int64_t id); void RemoveMemoryPoolOfCUDAGraph(int64_t id); #endif @@ -116,7 +116,7 @@ class AllocatorFacade { private: AllocatorFacade(); AllocatorFacadePrivate* m_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::unordered_map> cuda_graph_map_; std::unordered_map cuda_graph_ref_cnt_; diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc index df62c112681b1..be3f578f4942f 100644 --- a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc @@ -47,17 +47,16 @@ std::shared_ptr GetIpcBasePtr(std::string handle) { // 
The IpcMemHandle can only open once for the same handle, // so here we cache it here. void *baseptr = nullptr; - auto ipc_handle = - reinterpret_cast(handle.c_str()); - PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle( - &baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + auto ipc_handle = reinterpret_cast(handle.c_str()); + PADDLE_ENFORCE_GPU_SUCCESS(gpuIpcOpenMemHandle( + &baseptr, *ipc_handle, gpuIpcMemLazyEnablePeerAccess)); // Close ipc handle on the same device. int device_id = platform::GetCurrentDeviceId(); // Add deleter to close ipc handle. auto sp = std::shared_ptr(baseptr, [handle, device_id](void *ptr) { platform::CUDADeviceGuard guard(device_id); std::lock_guard lock(ipc_mutex_); - PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuIpcCloseMemHandle(ptr)); ipc_handle_to_baseptr_.erase(handle); VLOG(6) << "cudaIpcCloseMemHandle for ptr:" << "\t" << ptr; diff --git a/paddle/fluid/memory/allocation/cuda_malloc_async_allocator.cc b/paddle/fluid/memory/allocation/cuda_malloc_async_allocator.cc index cdc3f60da7c7e..7e0c513f5c81c 100644 --- a/paddle/fluid/memory/allocation/cuda_malloc_async_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_malloc_async_allocator.cc @@ -27,7 +27,11 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" +#if defined(PADDLE_WITH_CUDA) #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" +#endif namespace paddle { namespace memory { @@ -47,11 +51,11 @@ void CUDAMallocAsyncAllocation::RecordStreamWithNoGraphCapturing( if (event_map_.find(stream) == event_map_.end()) { gpuEvent_t event; PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); + gpuEventCreateWithFlags(&event, gpuEventDisableTiming)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuEventRecord(event, stream)); event_map_[stream] = event; } else { - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_map_[stream], stream)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuEventRecord(event_map_[stream], stream)); } } @@ -93,16 +97,16 @@ bool CUDAMallocAsyncAllocation::CanBeFreed(bool synchronize) { for (auto it = event_map_.begin(); it != event_map_.end();) { gpuEvent_t& event = it->second; if (synchronize) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuEventSynchronize(event)); } else { - gpuError_t err = cudaEventQuery(event); - if (err == cudaErrorNotReady) { + gpuError_t err = gpuEventQuery(event); + if (err == gpuErrorNotReady) { VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; return false; } PADDLE_ENFORCE_GPU_SUCCESS(err); } - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuEventDestroy(event)); VLOG(8) << "Destroy event " << event; it = event_map_.erase(it); } @@ -117,7 +121,7 @@ CUDAMallocAsyncAllocator::CUDAMallocAsyncAllocator( place_(place), default_stream_(default_stream) { PADDLE_ENFORCE_GPU_SUCCESS( - cudaStreamCreateWithPriority(&memory_stream_, cudaStreamNonBlocking, 0)); + gpuStreamCreateWithPriority(&memory_stream_, gpuStreamNonBlocking, 0)); } bool CUDAMallocAsyncAllocator::IsAllocThreadSafe() const { return true; } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 
9d82ca6ed1826..dfcb90dffecb1 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -18,8 +18,10 @@ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/backends/gpu/gpu_info.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" #endif namespace paddle { @@ -48,7 +50,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) { [this] { phi::backends::gpu::SetDeviceId(place_.device); }); std::lock_guard lock_guard(outstanding_event_map_lock_); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) { graph_capturing_stream_set_.insert(stream); return; @@ -66,7 +68,7 @@ void StreamSafeCUDAAllocation::EraseStream(gpuStream_t stream) { } bool StreamSafeCUDAAllocation::CanBeFreed() { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) { return graph_capturing_stream_set_.empty() && outstanding_event_map_.empty(); diff --git a/paddle/fluid/operators/cuda_graph_with_in_out.h b/paddle/fluid/operators/cuda_graph_with_in_out.h index 3f65450d30c0e..7547bdd436395 100644 --- a/paddle/fluid/operators/cuda_graph_with_in_out.h +++ b/paddle/fluid/operators/cuda_graph_with_in_out.h @@ -16,21 +16,21 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #endif namespace paddle { namespace operators { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class CUDAGraphWithInOuts { public: template CUDAGraphWithInOuts(Callable &&callable, platform::CUDAPlace place, const std::vector &in_ptrs, - cudaStreamCaptureMode mode, + gpuStreamCaptureMode mode, int64_t pool_id) { in_indices_.resize(in_ptrs.size()); ins_.reserve(in_ptrs.size()); @@ -102,7 +102,7 @@ static std::unique_ptr CaptureCUDAGraph( const framework::ExecutionContext &ctx, const std::vector &input_names, const std::vector &output_names, - cudaStreamCaptureMode mode, + gpuStreamCaptureMode mode, int64_t pool_id) { std::vector inputs; for (const auto &name : input_names) { diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 9e2d1fc4c97fb..6006d7556423c 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -34,7 +34,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_DNNL #include "paddle/fluid/platform/mkldnn_helper.h" #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/operators/cuda_graph_with_in_out.h" #endif #include "paddle/common/flags.h" @@ -196,6 +196,20 @@ static cudaStreamCaptureMode StringToCUDAGraphCaptureMode( "Unsupported CUDA Graph capture mode %s", mode)); } } +#elif defined(PADDLE_WITH_HIP) +static hipStreamCaptureMode StringToCUDAGraphCaptureMode( + const std::string &mode) { + if (mode == "global") { + return hipStreamCaptureModeGlobal; + } else if (mode == "thread_local") { + return hipStreamCaptureModeThreadLocal; + } else if (mode == "relaxed") { + return hipStreamCaptureModeRelaxed; + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Unsupported CUDA Graph capture mode %s", mode)); + } +} #endif } // namespace details @@ -211,7 +225,7 @@ class RunProgramOpKernel : public framework::OpKernel { return; } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto mode = details::StringToCUDAGraphCaptureMode(capture_mode); PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), @@ -408,7 +422,7 @@ class RunProgramGradOpKernel : public framework::OpKernel { return; } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto mode = details::StringToCUDAGraphCaptureMode(capture_mode); PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 5b5efb43f9096..9d522d8b2f0fe 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -25,7 +25,7 @@ COMMON_DECLARE_bool(new_executor_use_cuda_graph); namespace paddle { namespace platform { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void InitCUDNNRelatedHandle(phi::GPUContext* dev_ctx) { dev_ctx->cudnn_workspace_handle().ResetWorkspace(); @@ -82,7 +82,7 @@ phi::DeviceContext* SelectCUDAGraphDeviceContext(phi::GPUPlace place, } void BeginCUDAGraphCapture(phi::GPUPlace place, - cudaStreamCaptureMode mode, + gpuStreamCaptureMode mode, int64_t pool_id) { auto* mutable_dev_ctx = SelectCUDAGraphDeviceContext(place, &pool_id); auto* dev_ctx = reinterpret_cast(mutable_dev_ctx); diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h index c076d33c88682..a1eca67a9ee87 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.h +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/common/macros.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" @@ -23,17 +24,17 @@ namespace paddle { namespace platform { // NOTE: These APIs are not thread-safe. 
-#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using CUDAGraph = phi::backends::gpu::CUDAGraph; void BeginCUDAGraphCapture(phi::GPUPlace place, - cudaStreamCaptureMode mode, + gpuStreamCaptureMode mode, int64_t pool_id = CUDAGraph::kInvalidPoolID); std::unique_ptr EndCUDAGraphCapture(); #endif inline phi::GPUPlace CUDAGraphCapturingPlace() { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return CUDAGraph::CapturingPlace(); #else PADDLE_THROW(phi::errors::Unimplemented( @@ -52,8 +53,8 @@ class SkipCUDAGraphCaptureGuard { public: SkipCUDAGraphCaptureGuard() { -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10010 +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 10010 if (UNLIKELY(CUDAGraph::IsCapturing())) { CUDAGraph::EndSegmentCapture(); } @@ -62,8 +63,8 @@ class SkipCUDAGraphCaptureGuard { } ~SkipCUDAGraphCaptureGuard() { -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10010 +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 10010 if (UNLIKELY(CUDAGraph::IsCapturing())) { CUDAGraph::BeginSegmentCapture(); } diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 8fca9708b4b5d..36189cc7e4c90 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -35,6 +35,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" #else #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" @@ -44,6 +45,8 @@ limitations under the License. */ #if CUDA_VERSION >= 10020 #include "paddle/fluid/platform/dynload/cuda_driver.h" #endif +#else // PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/rocm_driver.h" #endif COMMON_DECLARE_double(fraction_of_gpu_memory_to_use); @@ -256,7 +259,8 @@ class RecordedGpuMallocHelper { * would be clear. 
*/ gpuError_t MallocAsync(void **ptr, size_t size, gpuStream_t stream) { -#if defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) +#if defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) LockGuardPtr lock(mtx_); if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { return gpuErrorOutOfMemory; @@ -264,19 +268,35 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); std::call_once(set_cudamempoolattr_once_flag_, [&]() { +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetDefaultMemPool(&memPool_, dev_id_)); +#else // PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipDeviceGetDefaultMemPool(&memPool_, dev_id_)); +#endif uint64_t thresholdVal = FLAGS_cuda_memory_async_pool_realease_threshold; VLOG(10) << "[cudaMallocAsync] set cudaMemPoolAttrReleaseThreshold to " << thresholdVal; +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaMemPoolSetAttribute(memPool_, cudaMemPoolAttrReleaseThreshold, reinterpret_cast(&thresholdVal))); +#else // PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemPoolSetAttribute(memPool_, + hipMemPoolAttrReleaseThreshold, + reinterpret_cast(&thresholdVal))); +#endif }); gpuError_t result; +#ifdef PADDLE_WITH_CUDA result = cudaMallocAsync(ptr, size, stream); +#else // PADDLE_WITH_HIP + result = hipMallocAsync(ptr, size, stream); +#endif VLOG(10) << "[cudaMallocAsync] ptr = " << (*ptr) << " size = " << static_cast(size) / (1 << 20) << " MB result = " << result << " stream = " << stream; @@ -343,18 +363,23 @@ class RecordedGpuMallocHelper { } void FreeAsync(void *ptr, size_t size, gpuStream_t stream) { -#if defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) +#if defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if // cudaFree succeeds. 
CUDADeviceGuard guard(dev_id_); +#ifdef PADDLE_WITH_CUDA auto err = cudaFreeAsync(ptr, stream); +#else // PADDLE_WITH_HIP + auto err = hipFreeAsync(ptr, stream); +#endif VLOG(10) << "[cudaFreeAsync] ptr = " << ptr << " size =" << static_cast(size) / (1 << 20) << " MB result = " << err << " stream = " << stream; - if (err != cudaErrorCudartUnloading) { + if (err != gpuErrorCudartUnloading) { PADDLE_ENFORCE_GPU_SUCCESS(err); cur_size_.fetch_sub(size); DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); @@ -449,6 +474,27 @@ class RecordedGpuMallocHelper { } #endif +#else // PADDLE_WITH_HIP + hipError_t MemCreate(hipMemGenericAllocationHandle_t *handle, + size_t size, + const hipMemAllocationProp *prop, + unsigned long long flags) { // NOLINT + auto result = + paddle::platform::dynload::hipMemCreate(handle, size, prop, flags); + if (result == hipSuccess) { + cur_size_.fetch_add(size); + } + return result; + } + + hipError_t MemRelease(hipMemGenericAllocationHandle_t handle, size_t size) { + auto result = paddle::platform::dynload::hipMemRelease(handle); + if (result == hipSuccess) { + cur_size_.fetch_sub(size); + } + return result; + } + #endif private: @@ -460,6 +506,10 @@ class RecordedGpuMallocHelper { cudaMemPool_t memPool_; static std::once_flag set_cudamempoolattr_once_flag_; #endif +#if defined(PADDLE_WITH_HIP) + hipMemPool_t memPool_; + static std::once_flag set_cudamempoolattr_once_flag_; +#endif mutable std::unique_ptr mtx_; static std::once_flag once_flag_; @@ -468,7 +518,8 @@ class RecordedGpuMallocHelper { std::once_flag RecordedGpuMallocHelper::once_flag_; -#if defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) +#if defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) std::once_flag RecordedGpuMallocHelper::set_cudamempoolattr_once_flag_; #endif @@ -516,6 +567,21 @@ CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); } #endif +#else // PADDLE_WITH_HIP +hipError_t RecordedGpuMemCreate(hipMemGenericAllocationHandle_t *handle, + size_t size, + const hipMemAllocationProp *prop, + unsigned long long flags, // NOLINT + int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate( + handle, size, prop, flags); +} + +hipError_t RecordedGpuMemRelease(hipMemGenericAllocationHandle_t handle, + size_t size, + int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); +} #endif bool RecordedGpuMemGetInfo(size_t *avail, diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index c9afafdef7166..8a192ba919cad 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -1,5 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -33,11 +32,13 @@ namespace paddle { +// Note(qili93): CUDA Runtime API supported by HIP +// https://github.com/ROCm/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md + #ifdef PADDLE_WITH_HIP #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#else // CDUA - +#else // PADDLE_WITH_CUDA #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif @@ -81,22 +82,22 @@ DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, miopenDropoutDescriptor_t); DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); - +DECLARE_TYPE_FOR_GPU(gpuIpcMemHandle_t, cudaIpcMemHandle_t, hipIpcMemHandle_t); DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); +DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, + cudaStreamCaptureMode, + hipStreamCaptureMode); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); -using CUDAGraphID = unsigned long long; // NOLINT - #undef DECLARE_TYPE_FOR_GPU #ifdef PADDLE_WITH_HIP #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = ROCM_CV; -#else // CDUA - +#else // PADDLE_WITH_CUDA #define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif @@ -106,8 +107,64 @@ DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, hipErrorOutOfMemory); DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); +DECLARE_CONSTANT_FOR_GPU(gpuErrorCudartUnloading, + cudaErrorCudartUnloading, + hipErrorDeinitialized); +DECLARE_CONSTANT_FOR_GPU(gpuEventDisableTiming, + cudaEventDisableTiming, + hipEventDisableTiming); +DECLARE_CONSTANT_FOR_GPU(gpuStreamNonBlocking, + cudaStreamNonBlocking, + hipStreamNonBlocking); +DECLARE_CONSTANT_FOR_GPU(gpuIpcMemLazyEnablePeerAccess, + cudaIpcMemLazyEnablePeerAccess, + hipIpcMemLazyEnablePeerAccess); #undef DECLARE_CONSTANT_FOR_GPU -} // namespace paddle +#ifdef PADDLE_WITH_HIP +#define DECLARE_FUNCTION_FOR_GPU(GPU_FUNC, CUDA_FUNC, ROCM_FUNC) \ + const auto GPU_FUNC = ROCM_FUNC; +#else // PADDLE_WITH_CUDA +#define DECLARE_FUNCTION_FOR_GPU(GPU_FUNC, CUDA_FUNC, ROCM_FUNC) \ + const auto GPU_FUNC = CUDA_FUNC; #endif + +DECLARE_FUNCTION_FOR_GPU(gpuStreamCreateWithPriority, + cudaStreamCreateWithPriority, + hipStreamCreateWithPriority); +DECLARE_FUNCTION_FOR_GPU(gpuStreamBeginCapture, + cudaStreamBeginCapture, + hipStreamBeginCapture); +DECLARE_FUNCTION_FOR_GPU(gpuStreamEndCapture, + cudaStreamEndCapture, + hipStreamEndCapture); +DECLARE_FUNCTION_FOR_GPU(gpuStreamGetCaptureInfo, + cudaStreamGetCaptureInfo, + hipStreamGetCaptureInfo); +DECLARE_FUNCTION_FOR_GPU(gpuEventCreateWithFlags, + cudaEventCreateWithFlags, + hipEventCreateWithFlags); +DECLARE_FUNCTION_FOR_GPU(gpuEventRecord, cudaEventRecord, hipEventRecord); +DECLARE_FUNCTION_FOR_GPU(gpuEventDestroy, cudaEventDestroy, hipEventDestroy); +DECLARE_FUNCTION_FOR_GPU(gpuEventQuery, cudaEventQuery, hipEventQuery); +DECLARE_FUNCTION_FOR_GPU(gpuEventSynchronize, + cudaEventSynchronize, + hipEventSynchronize); +DECLARE_FUNCTION_FOR_GPU(gpuStreamSynchronize, + cudaStreamSynchronize, + hipStreamSynchronize); +DECLARE_FUNCTION_FOR_GPU(gpuIpcOpenMemHandle, + cudaIpcOpenMemHandle, + hipIpcOpenMemHandle); +DECLARE_FUNCTION_FOR_GPU(gpuIpcCloseMemHandle, + cudaIpcCloseMemHandle, + hipIpcCloseMemHandle); + +#undef DECLARE_FUNCTION_FOR_GPU 
+ +using CUDAGraphID = unsigned long long; // NOLINT + +} // namespace paddle + +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h index 5c8e18611c40a..5295ffb07c1d1 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.h +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -39,13 +39,33 @@ extern bool HasCUDADriver(); __macro(hipModuleLoadData); \ __macro(hipModuleGetFunction); \ __macro(hipModuleUnload); \ - /*rocm3.5 not support the function*/ \ + /* DTK not support the function*/ \ /* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \ __macro(hipModuleLaunchKernel); \ __macro(hipLaunchKernel); \ __macro(hipGetDevice); \ __macro(hipGetDeviceCount); \ - __macro(hipDevicePrimaryCtxGetState) + __macro(hipDevicePrimaryCtxGetState); \ + __macro(hipDeviceGetAttribute); \ + __macro(hipDeviceGet) + +#define ROCM_ROUTINE_EACH_VVM(__macro) \ + __macro(hipMemGetAllocationGranularity); \ + __macro(hipMemAddressReserve); \ + __macro(hipMemCreate); \ + __macro(hipMemMap); \ + __macro(hipMemSetAccess); \ + __macro(hipMemUnmap); \ + __macro(hipMemRelease); \ + __macro(hipMemAddressFree) + +#define ROCM_ROUTINE_EACH_GPU_GRAPH(__macro) \ + __macro(hipGraphNodeGetType); \ + __macro(hipGraphKernelNodeGetParams); \ + __macro(hipGraphExecKernelNodeSetParams) + +ROCM_ROUTINE_EACH_VVM(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP); +ROCM_ROUTINE_EACH_GPU_GRAPH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP); ROCM_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCM_WRAP); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 86841a177d92e..8747b70414ddc 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -78,7 +78,7 @@ limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/prim/utils/utils.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" #endif #include "paddle/common/macros.h" @@ -978,12 +978,12 @@ PYBIND11_MODULE(libpaddle, m) { #endif m.def("is_cuda_graph_capturing", &platform::IsCUDAGraphCapturing); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) py::class_(m, "CUDAGraph") .def_static("begin_capture", [](platform::CUDAPlace place, int mode) { platform::BeginCUDAGraphCapture( - place, static_cast(mode)); + place, static_cast(mode)); }) .def_static("end_capture", &platform::EndCUDAGraphCapture) .def_static("gen_new_memory_pool_id", diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 50da99217b153..80d5f14e627a3 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -14,7 +14,7 @@ if(WITH_GPU OR WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc gpu/cuda/cuda_graph.cc) endif() if(WITH_ROCM) - list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) + list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc gpu/rocm/hip_graph.cc) endif() endif() diff --git a/paddle/phi/backends/dynload/rccl.cc b/paddle/phi/backends/dynload/rccl.cc index 95e171842527b..ee347af62fb79 100644 --- a/paddle/phi/backends/dynload/rccl.cc +++ b/paddle/phi/backends/dynload/rccl.cc @@ -14,11 +14,20 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/rccl.h" +ncclResult_t ncclCommInitRank2(ncclComm_t* newcomm, + int nranks, + ncclUniqueId commId, + int myrank, + int param) { + // fake impl for compilation + return ncclInvalidUsage; +} + namespace phi { namespace dynload { std::once_flag rccl_dso_flag; -void *rccl_dso_handle; +void* rccl_dso_handle; #define DEFINE_WRAP(__name) DynLoad__##__name __name diff --git a/paddle/phi/backends/dynload/rccl.h b/paddle/phi/backends/dynload/rccl.h index e1018a3f253fa..0123107cd230e 100644 --- a/paddle/phi/backends/dynload/rccl.h +++ b/paddle/phi/backends/dynload/rccl.h @@ -20,6 +20,18 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/backends/dynload/port.h" +#ifdef __cplusplus +extern "C" { +#endif +ncclResult_t ncclCommInitRank2(ncclComm_t* newcomm, + int nranks, + ncclUniqueId commId, + int myrank, + int param); +#ifdef __cplusplus +} +#endif + namespace phi { namespace dynload { @@ -28,15 +40,21 @@ extern void* rccl_dso_handle; #define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \ struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using nccl_func = decltype(&::__name); \ + static auto GetRCCLFunc() { \ + using rccl_func = decltype(&::__name); \ std::call_once(rccl_dso_flag, []() { \ rccl_dso_handle = phi::dynload::GetNCCLDsoHandle(); \ }); \ static void* p_##__name = dlsym(rccl_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ + return reinterpret_cast(p_##__name); \ + } \ + \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + return GetRCCLFunc()(args...); \ } \ + \ + static bool IsValid() { return GetRCCLFunc() != nullptr; } \ }; \ extern DynLoad__##__name __name @@ -44,6 +62,7 @@ extern void* rccl_dso_handle; __macro(ncclCommInitAll); \ __macro(ncclGetUniqueId); \ __macro(ncclCommInitRank); \ + __macro(ncclCommInitRank2); \ __macro(ncclCommAbort); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ diff --git a/paddle/phi/backends/dynload/rocm_driver.h b/paddle/phi/backends/dynload/rocm_driver.h index 4e456db44c904..bd221c3f1e32e 100644 --- a/paddle/phi/backends/dynload/rocm_driver.h +++ b/paddle/phi/backends/dynload/rocm_driver.h @@ -51,13 +51,33 @@ extern bool HasCUDADriver(); __macro(hipModuleLoadData); \ __macro(hipModuleGetFunction); \ __macro(hipModuleUnload); \ - /*rocm3.5 not support the function*/ \ + /* DTK not support the function*/ \ /* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \ __macro(hipModuleLaunchKernel); \ __macro(hipLaunchKernel); \ __macro(hipGetDevice); \ __macro(hipGetDeviceCount); \ - __macro(hipDevicePrimaryCtxGetState) + __macro(hipDevicePrimaryCtxGetState); \ + __macro(hipDeviceGetAttribute); \ + __macro(hipDeviceGet) + +#define ROCM_ROUTINE_EACH_VVM(__macro) \ + __macro(hipMemGetAllocationGranularity); \ + __macro(hipMemAddressReserve); \ + __macro(hipMemCreate); \ + __macro(hipMemMap); \ + __macro(hipMemSetAccess); \ + __macro(hipMemUnmap); \ + __macro(hipMemRelease); \ + __macro(hipMemAddressFree) + +#define ROCM_ROUTINE_EACH_GPU_GRAPH(__macro) \ + __macro(hipGraphNodeGetType); \ + __macro(hipGraphKernelNodeGetParams); \ + __macro(hipGraphExecKernelNodeSetParams) + +ROCM_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); +ROCM_ROUTINE_EACH_GPU_GRAPH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph.cc b/paddle/phi/backends/gpu/cuda/cuda_graph.cc index 
728451f9bde40..43ec0a0c89c08 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_graph.cc +++ b/paddle/phi/backends/gpu/cuda/cuda_graph.cc @@ -301,8 +301,7 @@ void CUDAGraph::PrintToDotFiles(const std::string &dirname, #if CUDA_VERSION >= 11000 void CUDAGraphNodeLauncher::KernelNodeLaunch( - parameterSetter_t parameterSetter, - cudaKernelCallback_t cudakernelCallback) { + parameterSetter_t parameterSetter, gpuKernelCallback_t cudakernelCallback) { if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) { unsigned int id = GenerateIdentifier(); auto cudaFunc = cudakernelCallback(id); @@ -333,7 +332,7 @@ CUDAGraphNodeLauncher::GetParameterSettersForExecGraph(cudaGraph_t graph) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cuGraphKernelNodeGetParams(cuNode, &cuParams)); - CUDAKernelParams kernel_params(cuParams.kernelParams); + gpuKernelParams kernel_params(cuParams.kernelParams); auto kernel = parameterSetters.find(static_cast(cuParams.func)); VLOG(10) << "[GetParameterSettersForExecGraph] cuParams.func = " @@ -350,7 +349,7 @@ CUDAGraphNodeLauncher::GetParameterSettersForExecGraph(cudaGraph_t graph) { auto setter = parameterSetter->second; hooks.emplace_back([setter, cuNode, cuParams]( cudaGraphExec_t exec_graph) { - CUDAKernelParams kernel_params(cuParams.kernelParams); + gpuKernelParams kernel_params(cuParams.kernelParams); setter(kernel_params); PADDLE_ENFORCE_GPU_SUCCESS(dynload::cuGraphExecKernelNodeSetParams( static_cast(exec_graph), cuNode, &cuParams)); @@ -369,7 +368,7 @@ CUDAGraphNodeLauncher::GetParameterSettersForExecGraph(cudaGraph_t graph) { void CUDAGraphNodeLauncher::KernelNodeLaunch( cudaFunction_t cudaFunc, parameterSetter_t parameterSetter, - cudaKernelCallback_t cudakernelCallback) { + gpuKernelCallback_t cudakernelCallback) { cudakernelCallback(0); } diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph.h b/paddle/phi/backends/gpu/cuda/cuda_graph.h index db5e4fcbe2da6..dfc981850ca13 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_graph.h +++ b/paddle/phi/backends/gpu/cuda/cuda_graph.h @@ -95,9 +95,9 @@ class CUDAGraphContextManager { std::set capturing_ctxs_; }; -class CUDAKernelParams { +class gpuKernelParams { public: - explicit CUDAKernelParams(void **params) : kernelParams(params) {} + explicit gpuKernelParams(void **params) : kernelParams(params) {} template T &As(size_t idx) const { @@ -132,20 +132,20 @@ class CUDAGraphNodeLauncher { // Sets the kernel's parameters BEFORE activating the CUDA graph. It enables // dynamic determination and setup of kernel arguments. // - // parameterSetter_t parameterSetter = [saved_state](CUDAKernelParams + // parameterSetter_t parameterSetter = [saved_state](gpuKernelParams // ¶m){ // // Code to compute and the parameter values from the saved_state // // ... // param.As(idx) = calculated_value; // }; - using parameterSetter_t = std::function; + using parameterSetter_t = std::function; // [CUDA Kernel Callback] // Acts as the launcher for the kernel. It accepts an `unsigned int` // identifier and uses it for the kernel launch. // The `cudaGetFuncBySymbol` method can be used to fetch the `cudaFunction_t` // reference of the kernel from the kernel pointer. 
- // cudaKernelCallback_t cudaKernelCallback = [=](unsigned int id) { + // gpuKernelCallback_t cudaKernelCallback = [=](unsigned int id) { // // cudaFunction_t is REQUIRED to get here // cudaFunction_t cudaFunc; // PADDLE_ENFORCE_GPU_SUCCESS(cudaGetFuncBySymbol(&cudaFunc, &kernel)); @@ -153,18 +153,18 @@ class CUDAGraphNodeLauncher { // kernel<<<>>>(id, ...); // Launching the kernel with id // return cudaFunc; // }; - using cudaKernelCallback_t = std::function; + using gpuKernelCallback_t = std::function; // [Kernel Launch] // With the callbacks defined and the CUDA function obtained, the kernel can // be launched using the `KernelNodeLaunch` method. void KernelNodeLaunch(parameterSetter_t parameterSetter, - cudaKernelCallback_t cudakernelCallback); + gpuKernelCallback_t cudakernelCallback); std::vector GetParameterSettersForExecGraph( cudaGraph_t graph); - parameterSetter_t GetParameterSetter(const CUDAKernelParams ¶ms); + parameterSetter_t GetParameterSetter(const gpuKernelParams ¶ms); static CUDAGraphNodeLauncher &Instance() { static CUDAGraphNodeLauncher *launcher = new CUDAGraphNodeLauncher; @@ -185,7 +185,7 @@ class CUDAGraphNodeLauncher { #if CUDA_VERSION >= 10010 static void ThrowErrorIfNotSupportCUDAGraph() {} #else -enum cudaStreamCaptureMode { +enum gpuStreamCaptureMode { cudaStreamCaptureModeGlobal = 0, cudaStreamCaptureModeThreadLocal = 1, cudaStreamCaptureModeRelaxed = 2 @@ -262,7 +262,7 @@ class CUDAGraph { static void BeginCapture(phi::GPUPlace place, cudaStream_t stream, - cudaStreamCaptureMode mode); + gpuStreamCaptureMode mode); static std::unique_ptr EndCapture(); static void BeginSegmentCapture(); @@ -309,7 +309,7 @@ class CUDAGraph { } } - using SetSeedFunc = std::function; + using SetSeedFunc = std::function; static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) { std::lock_guard guard(capturing_graph_->func_mtx_); capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func)); @@ -324,7 +324,7 @@ class CUDAGraph { #if CUDA_VERSION >= 10010 std::vector graphs_; std::vector exec_graphs_; - cudaStreamCaptureMode capture_mode_; + gpuStreamCaptureMode capture_mode_; #endif cudaStream_t stream_{nullptr}; phi::GPUPlace place_; @@ -368,7 +368,7 @@ class CUDAGraphCaptureModeGuard { public: explicit CUDAGraphCaptureModeGuard( - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { + gpuStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { if (UNLIKELY(CUDAGraph::IsCapturing())) { PADDLE_ENFORCE_GPU_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); // After cudaThreadExchangeStreamCaptureMode is called, @@ -385,7 +385,7 @@ class CUDAGraphCaptureModeGuard { } private: - cudaStreamCaptureMode old_mode_; + gpuStreamCaptureMode old_mode_; }; #else class CUDAGraphCaptureModeGuard { @@ -393,7 +393,7 @@ class CUDAGraphCaptureModeGuard { public: explicit CUDAGraphCaptureModeGuard( - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {} + gpuStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {} }; #endif diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h b/paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h index 952dd355882e5..2d5810fbe1c9b 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h +++ b/paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h @@ -17,9 +17,13 @@ #include #include -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/backends/context_pool.h" +#if defined(PADDLE_WITH_CUDA) #include 
"paddle/phi/backends/gpu/cuda/cuda_graph.h" +#else +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" +#endif #include "paddle/phi/kernels/funcs/dropout_impl_util.h" #endif @@ -28,7 +32,7 @@ namespace backends { namespace gpu { inline bool IsCUDAGraphCapturing() { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return CUDAGraph::IsCapturing(); #else return false; @@ -39,7 +43,7 @@ inline bool IsCUDAGraphCapturing() { // Otherwise, invoke callback directly. template inline void AddPostResetCallbackIfCapturingCUDAGraph(Callback &&callback) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (UNLIKELY(IsCUDAGraphCapturing())) { return CUDAGraph::AddPostResetCallbackDuringCapturing( std::forward(callback)); @@ -52,7 +56,7 @@ template inline T *RestoreHostMemIfCapturingCUDAGraph(T *host_mem, size_t size) { static_assert(std::is_trivial::value, "T must be trivial type"); static_assert(!std::is_same::value, "T cannot be void"); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (UNLIKELY(IsCUDAGraphCapturing())) { size_t nbytes = size * sizeof(T); void *new_host_mem = new uint8_t[nbytes]; diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index fe4d6a6623a96..97f34de9a55a6 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -29,6 +29,9 @@ namespace phi { +// Note(qili93): CUDA Runtime API supported by HIP +// https://github.com/ROCm/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md + #ifdef PADDLE_WITH_HIP #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; @@ -50,6 +53,20 @@ DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, miopenActivationMode_t); +DECLARE_TYPE_FOR_GPU(gpuGraph_t, cudaGraph_t, hipGraph_t); +DECLARE_TYPE_FOR_GPU(gpuFunction_t, cudaFunction_t, hipFunction_t); +DECLARE_TYPE_FOR_GPU(gpuGraphExec_t, cudaGraphExec_t, hipGraphExec_t); +DECLARE_TYPE_FOR_GPU(gpuGraphNode_t, cudaGraphNode_t, hipGraphNode_t); +DECLARE_TYPE_FOR_GPU(gpuGraphNodeType, cudaGraphNodeType, hipGraphNodeType); +DECLARE_TYPE_FOR_GPU(gpuKernelNodeParams, + cudaKernelNodeParams, + hipKernelNodeParams); +DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, + cudaStreamCaptureMode, + hipStreamCaptureMode); +DECLARE_TYPE_FOR_GPU(gpuStreamCaptureStatus, + cudaStreamCaptureStatus, + hipStreamCaptureStatus); #undef DECLARE_TYPE_FOR_GPU @@ -76,8 +93,75 @@ DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToHost, DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, cudaMemcpyKind::cudaMemcpyDeviceToDevice, hipMemcpyKind::hipMemcpyDeviceToDevice); +DECLARE_CONSTANT_FOR_GPU(gpuEventDisableTiming, + cudaEventDisableTiming, + hipEventDisableTiming); +DECLARE_CONSTANT_FOR_GPU(gpuStreamNonBlocking, + cudaStreamNonBlocking, + hipStreamNonBlocking); +DECLARE_CONSTANT_FOR_GPU(gpuStreamCaptureModeThreadLocal, + cudaStreamCaptureModeThreadLocal, + hipStreamCaptureModeThreadLocal); +DECLARE_CONSTANT_FOR_GPU(gpuStreamCaptureModeRelaxed, + cudaStreamCaptureModeRelaxed, + hipStreamCaptureModeRelaxed); +DECLARE_CONSTANT_FOR_GPU(gpuStreamCaptureStatusActive, + cudaStreamCaptureStatusActive, + hipStreamCaptureStatusActive); +DECLARE_CONSTANT_FOR_GPU(gpuGraphNodeTypeKernel, + cudaGraphNodeTypeKernel, + hipGraphNodeTypeKernel); #undef DECLARE_CONSTANT_FOR_GPU + +#ifdef PADDLE_WITH_HIP +#define DECLARE_FUNCTION_FOR_GPU(GPU_FUNC, CUDA_FUNC, ROCM_FUNC) \ + const 
auto GPU_FUNC = ROCM_FUNC; +#else // PADDLE_WITH_CUDA +#define DECLARE_FUNCTION_FOR_GPU(GPU_FUNC, CUDA_FUNC, ROCM_FUNC) \ + const auto GPU_FUNC = CUDA_FUNC; +#endif + +DECLARE_FUNCTION_FOR_GPU(gpuGraphGetNodes, cudaGraphGetNodes, hipGraphGetNodes); +DECLARE_FUNCTION_FOR_GPU(gpuGraphGetEdges, cudaGraphGetEdges, hipGraphGetEdges); +DECLARE_FUNCTION_FOR_GPU(gpuGraphLaunch, cudaGraphLaunch, hipGraphLaunch); +DECLARE_FUNCTION_FOR_GPU(gpuGraphDestroy, cudaGraphDestroy, hipGraphDestroy); +DECLARE_FUNCTION_FOR_GPU(gpuGraphExecDestroy, + cudaGraphExecDestroy, + hipGraphExecDestroy); +DECLARE_FUNCTION_FOR_GPU(gpuGraphNodeGetType, + cudaGraphNodeGetType, + hipGraphNodeGetType); +DECLARE_FUNCTION_FOR_GPU(gpuGraphExecKernelNodeSetParams, + cudaGraphExecKernelNodeSetParams, + hipGraphExecKernelNodeSetParams); +DECLARE_FUNCTION_FOR_GPU(gpuGraphKernelNodeGetParams, + cudaGraphKernelNodeGetParams, + hipGraphKernelNodeGetParams); +DECLARE_FUNCTION_FOR_GPU(gpuStreamCreateWithPriority, + cudaStreamCreateWithPriority, + hipStreamCreateWithPriority); +DECLARE_FUNCTION_FOR_GPU(gpuStreamBeginCapture, + cudaStreamBeginCapture, + hipStreamBeginCapture); +DECLARE_FUNCTION_FOR_GPU(gpuStreamEndCapture, + cudaStreamEndCapture, + hipStreamEndCapture); +DECLARE_FUNCTION_FOR_GPU(gpuStreamGetCaptureInfo, + cudaStreamGetCaptureInfo, + hipStreamGetCaptureInfo); +DECLARE_FUNCTION_FOR_GPU(gpuEventCreateWithFlags, + cudaEventCreateWithFlags, + hipEventCreateWithFlags); +DECLARE_FUNCTION_FOR_GPU(gpuEventRecord, cudaEventRecord, hipEventRecord); +DECLARE_FUNCTION_FOR_GPU(gpuEventDestroy, cudaEventDestroy, hipEventDestroy); +DECLARE_FUNCTION_FOR_GPU(gpuEventQuery, cudaEventQuery, hipEventQuery); +DECLARE_FUNCTION_FOR_GPU(gpuEventSynchronize, + cudaEventSynchronize, + hipEventSynchronize); + +#undef DECLARE_FUNCTION_FOR_GPU + } // namespace phi #endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/backends/gpu/rocm/hip_graph.cc b/paddle/phi/backends/gpu/rocm/hip_graph.cc new file mode 100644 index 0000000000000..781cb41ae6983 --- /dev/null +++ b/paddle/phi/backends/gpu/rocm/hip_graph.cc @@ -0,0 +1,365 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" +#include "glog/logging.h" +#include "paddle/common/flags.h" + +COMMON_DECLARE_bool(use_cuda_malloc_async_allocator); +COMMON_DECLARE_bool(auto_free_cudagraph_allocations_on_launch); + +namespace phi { +namespace backends { +namespace gpu { + +std::unique_ptr CUDAGraph::capturing_graph_{nullptr}; +paddle::optional CUDAGraph::capturing_thread_id_{paddle::none}; + +static std::vector ToposortCUDAGraph(hipGraph_t graph) { + size_t num_nodes; + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphGetNodes(graph, nullptr, &num_nodes)); + std::vector nodes(num_nodes); + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphGetNodes(graph, nodes.data(), &num_nodes)); + + size_t num_edges; + PADDLE_ENFORCE_GPU_SUCCESS( + hipGraphGetEdges(graph, nullptr, nullptr, &num_edges)); + std::vector from(num_edges), to(num_edges); + PADDLE_ENFORCE_GPU_SUCCESS( + hipGraphGetEdges(graph, from.data(), to.data(), &num_edges)); + + std::unordered_map> + in_edges, out_edges; + for (auto node : nodes) { + in_edges[node]; + out_edges[node]; + } + + for (size_t i = 0; i < num_edges; ++i) { + in_edges[to[i]].insert(from[i]); + out_edges[from[i]].insert(to[i]); + } + + std::queue q; + for (const auto &pair : in_edges) { + if (pair.second.empty()) { + q.push(pair.first); + } + } + + nodes.clear(); + while (!q.empty()) { + auto cur = q.front(); + q.pop(); + nodes.push_back(cur); + + for (auto out_node : out_edges.at(cur)) { + auto &in_nodes = in_edges.at(out_node); + in_nodes.erase(cur); + if (in_nodes.empty()) { + q.push(out_node); + } + } + } + PADDLE_ENFORCE_EQ( + nodes.size(), + num_nodes, + phi::errors::InvalidArgument("Toposort error, this may be a bug.")); + return nodes; +} + +CUDAGraphID CUDAGraph::UniqueID() { + static std::atomic id; + return id.fetch_add(1); +} + +int64_t CUDAGraph::UniqueMemoryPoolID() { + static std::atomic id(CUDAGraph::kDefaultPoolID + 1); + return id.fetch_add(1); +} + +void CUDAGraph::Reset() { + if (is_reset_) return; +#if defined(PADDLE_WITH_HIP) + for (auto graph : graphs_) { + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphDestroy(graph)); + } + graphs_.clear(); + for (auto exec_graph : exec_graphs_) { + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphExecDestroy(exec_graph)); + } + exec_graphs_.clear(); +#endif + // callback should be called in reverse order because the latter added + // callback may rely on the former added callback. 
+ for (auto iter = cudagraph_post_reset_callbacks_.rbegin(); + iter != cudagraph_post_reset_callbacks_.rend(); + ++iter) { + (*iter)(); + } + cudagraph_post_reset_callbacks_.clear(); + is_reset_ = true; +} + +void CUDAGraph::Replay() { +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ(is_reset_, + false, + phi::errors::PermissionDenied( + "Cannot replay the CUDA Graph after reset is called.")); + size_t n = exec_graphs_.size(); + for (size_t i = 0; i < n; ++i) { + if (!is_first_run_) { + for (auto &hook : cudagraph_pre_replay_callbacks_[i]) { + hook(exec_graphs_[i]); + } + } + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphLaunch(exec_graphs_[i], stream_)); + } + is_first_run_ = false; +#endif +} + +void CUDAGraph::BeginSegmentCapture() { + ThrowErrorIfNotSupportCUDAGraph(); +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ(IsCapturing(), + true, + phi::errors::PermissionDenied( + "BeginSegmentCapture should be called when CUDA " + "Graph is capturing.")); + if (IsThreadLocalCapturing()) { + PADDLE_ENFORCE_EQ(IsThisThreadCapturing(), + true, + phi::errors::PermissionDenied( + "When capturing CUDA Graph in the thread local mode, " + "you cannot begin segmented capturing in the thread " + "which is not the one that starts the capturing.")); + } + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamBeginCapture( + capturing_graph_->stream_, capturing_graph_->capture_mode_)); + PADDLE_ENFORCE_EQ( + IsValidCapturing(), + true, + phi::errors::PermissionDenied("CUDA Graph should not be invalidated.")); + VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_ + << ", segment id " << capturing_graph_->graphs_.size() + << ", memory pool id " << capturing_graph_->pool_id_; +#endif +} + +void CUDAGraph::BeginCapture(phi::GPUPlace place, + gpuStream_t stream, + hipStreamCaptureMode mode) { + ThrowErrorIfNotSupportCUDAGraph(); +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ(IsCapturing(), + false, + phi::errors::PermissionDenied( + "CUDA Graph can only captured one by one.")); + PADDLE_ENFORCE_NOT_NULL( + stream, + phi::errors::PermissionDenied( + "CUDA Graph cannot be captured in default CUDA stream 0.")); + capturing_graph_.reset(new CUDAGraph()); + capturing_graph_->place_ = place; + capturing_graph_->stream_ = stream; + capturing_graph_->capture_mode_ = mode; + if (mode == hipStreamCaptureModeThreadLocal) { + capturing_thread_id_ = std::this_thread::get_id(); + VLOG(10) << "Capturing CUDA Graph in thread local mode, thread id: " + << capturing_thread_id_; + } + BeginSegmentCapture(); +#endif +} + +void CUDAGraph::EndSegmentCapture() { + ThrowErrorIfNotSupportCUDAGraph(); +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ( + IsCapturing(), + true, + phi::errors::PermissionDenied("No CUDA Graph is capturing.")); + hipGraph_t graph; + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamEndCapture(capturing_graph_->stream_, &graph)); + auto num_nodes = static_cast(-1); + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphGetNodes(graph, nullptr, &num_nodes)); + if (num_nodes == 0) { + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphDestroy(graph)); + VLOG(10) << "Skip empty CUDA Graph with ID " << capturing_graph_->id_ + << ", segment id " << capturing_graph_->graphs_.size() + << ", memory pool id " << capturing_graph_->pool_id_; + return; + } + + for (auto &cudagraph_post_capture_callback : + capturing_graph_->cudagraph_post_capture_callbacks_) { + cudagraph_post_capture_callback(); + } + capturing_graph_->cudagraph_post_capture_callbacks_.clear(); + + capturing_graph_->cudagraph_pre_replay_callbacks_.emplace_back( + 
CUDAGraphNodeLauncher::Instance().GetParameterSettersForExecGraph(graph)); + + // if forward graph is registered, this graph is a backward graph + // we check whether there is remain blocks that is unreleased by this + hipGraphExec_t exec_graph; + if (FLAGS_use_cuda_malloc_async_allocator && + FLAGS_auto_free_cudagraph_allocations_on_launch) { +#if defined(PADDLE_WITH_HIP) + VLOG(1) << "hipGraphInstantiateFlagAutoFreeOnLaunch is enabled!"; + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphInstantiateWithFlags( + &exec_graph, graph, hipGraphInstantiateFlagAutoFreeOnLaunch)); +#else + PADDLE_THROW(phi::errors::Unimplemented( + "The cudaGraphInstantiateFlagAutoFreeOnLaunch is only supported when " + "CUDA version >= 11.4.0")); +#endif + } else { +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS( + hipGraphInstantiate(&exec_graph, graph, nullptr, nullptr, 0)); +#endif + } + VLOG(10) << "End to capture CUDA Graph with ID " << capturing_graph_->id_ + << ", segment id " << capturing_graph_->graphs_.size() + << ", memory pool id " << capturing_graph_->pool_id_; + capturing_graph_->graphs_.emplace_back(graph); + capturing_graph_->exec_graphs_.emplace_back(exec_graph); +#endif +} + +std::unique_ptr CUDAGraph::EndCapture() { + EndSegmentCapture(); + capturing_thread_id_ = paddle::none; + return std::move(capturing_graph_); +} + +bool CUDAGraph::IsValidCapturing() { +#if defined(PADDLE_WITH_HIP) + if (!IsCapturing()) return false; + hipStreamCaptureStatus status; + CUDAGraphID id; + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamGetCaptureInfo(capturing_graph_->stream_, &status, &id)); + return status == hipStreamCaptureStatusActive; +#else + return false; +#endif +} + +static std::string ConcatPath(const std::string &dirname, + const std::string &filename) { +#ifdef _WIN32 + const std::array kFileSep = {"\\"}; +#else + const std::array kFileSep = {"/"}; +#endif + if (!dirname.empty() && dirname.back() == kFileSep[0]) { + return dirname + filename; + } else { + return dirname + kFileSep.data() + filename; + } +} + +void CUDAGraph::PrintToDotFiles(const std::string &dirname, + unsigned int flags) { + ThrowErrorIfNotSupportCUDAGraph(); + PADDLE_THROW(phi::errors::Unimplemented( + "The print_to_dot_files() method is not supported on ROCm/HIP")); +} + +#if defined(PADDLE_WITH_HIP) +void CUDAGraphNodeLauncher::KernelNodeLaunch( + parameterSetter_t parameterSetter, gpuKernelCallback_t cudakernelCallback) { + if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) { + unsigned int id = GenerateIdentifier(); + auto cudaFunc = cudakernelCallback(id); + + parameterSetters[cudaFunc][id] = parameterSetter; + VLOG(10) << "[KernelNodeLaunch] Launch kernel with cudaFunc = " << cudaFunc + << " id = " << id; + } else { + cudakernelCallback(0); + } +} + +std::vector +CUDAGraphNodeLauncher::GetParameterSettersForExecGraph(hipGraph_t graph) { + size_t num_nodes; + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphGetNodes(graph, nullptr, &num_nodes)); + std::vector nodes(num_nodes); + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphGetNodes(graph, nodes.data(), &num_nodes)); + + std::vector> hooks; + for (auto node : nodes) { + hipGraphNode_t gpuNode = node; + hipGraphNodeType pType; + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphNodeGetType(gpuNode, &pType)); + if (pType == hipGraphNodeTypeKernel) { + hipKernelNodeParams gpuParams; + PADDLE_ENFORCE_GPU_SUCCESS( + gpuGraphKernelNodeGetParams(gpuNode, &gpuParams)); + gpuKernelParams kernel_params(gpuParams.kernelParams); + auto kernel = + parameterSetters.find(static_cast(gpuParams.func)); + VLOG(10) << 
"[GetParameterSettersForExecGraph] gpuParams.func = " + << gpuParams.func; + // There exists a parameter setter + if (kernel != parameterSetters.end()) { + auto launchSequence = kernel->second; + unsigned int id = kernel_params.As(0); + + VLOG(10) << "[GetParameterSettersForExecGraph] Find launch kernel id = " + << id; + auto parameterSetter = launchSequence.find(id); + if (parameterSetter != launchSequence.end()) { + auto setter = parameterSetter->second; + hooks.emplace_back( + [setter, gpuNode, gpuParams](hipGraphExec_t exec_graph) { + gpuKernelParams kernel_params(gpuParams.kernelParams); + setter(kernel_params); + PADDLE_ENFORCE_GPU_SUCCESS(hipGraphExecKernelNodeSetParams( + exec_graph, gpuNode, &gpuParams)); + }); + } else { + PADDLE_THROW( + phi::errors::InvalidArgument("Error: does not find launch id")); + } + } + } + } + + return hooks; +} +#else +void CUDAGraphNodeLauncher::KernelNodeLaunch( + hipFunction_t cudaFunc, + parameterSetter_t parameterSetter, + gpuKernelCallback_t cudakernelCallback) { + cudakernelCallback(0); +} + +std::vector +CUDAGraphNodeLauncher::GetParameterSettersForExecGraph(hipGraph_t graph) { + PADDLE_THROW(phi::errors::Unimplemented( + "CUDAGraphNodeLauncher is only supported when CUDA version >= 11.0")); +} +#endif + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/rocm/hip_graph.h b/paddle/phi/backends/gpu/rocm/hip_graph.h new file mode 100644 index 0000000000000..cb92275227254 --- /dev/null +++ b/paddle/phi/backends/gpu/rocm/hip_graph.h @@ -0,0 +1,393 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <atomic>
+#include <functional>
+#include <future>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/common/errors.h"
+#include "paddle/common/macros.h"
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/backends/device_code.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+namespace backends {
+namespace gpu {
+
+class CUDAGraphContextManager {
+ public:
+  using DeviceContextMap =
+      std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>;
+
+  static CUDAGraphContextManager &Instance() {
+    static CUDAGraphContextManager *cuda_graph_ctx_manager =
+        new CUDAGraphContextManager;
+    return *cuda_graph_ctx_manager;
+  }
+
+  DeviceContext *Get(int64_t pool_id,
+                     const Place &place,
+                     int stream_priority) {
+    std::lock_guard<std::mutex> lk(ctx_mtx_);
+    VLOG(6) << "Get cuda graph device context for " << place;
+
+    DeviceContextMap &ctxs = cuda_graph_ctx_pool_[pool_id];
+    if (ctxs.find(place) == ctxs.end()) {
+      phi::memory_utils::EmplaceDeviceContexts(
+          &ctxs,
+          {place},
+          /*disable_setting_default_stream_for_allocator=*/true,
+          stream_priority);
+    }
+    return ctxs[place].get().get();
+  }
+
+  void RecordCapturingDeviceContext(DeviceContext *dev_ctx) {
+    capturing_ctxs_.insert(dev_ctx);
+  }
+
+  std::set<DeviceContext *> GetAllCapturingDeviceContexts() const {
+    return capturing_ctxs_;
+  }
+
+  void ClearDeviceContextsRecords() { capturing_ctxs_.clear(); }
+
+ private:
+  CUDAGraphContextManager() {}
+  DISABLE_COPY_AND_ASSIGN(CUDAGraphContextManager);
+
+  std::mutex ctx_mtx_;
+  std::unordered_map<int64_t, DeviceContextMap> cuda_graph_ctx_pool_;
+  std::set<DeviceContext *> capturing_ctxs_;
+};
+
+class gpuKernelParams {
+ public:
+  explicit gpuKernelParams(void **params) : kernelParams(params) {}
+
+  template <typename T>
+  T &As(size_t idx) const {
+    return *reinterpret_cast<T *>(kernelParams[idx]);
+  }
+
+  void **getParams() const { return kernelParams; }
+
+ private:
+  void **kernelParams;
+};
+
+using cudaGraphExecuterSetter_t = std::function<void(hipGraphExec_t)>;
+
+// ** class CUDAGraphNodeLauncher
+//
+// This class offers an interface for launching CUDA kernels inside a CUDA
+// Graph; it relies on `cudaGraphExecKernelNodeSetParams`
+// (hipGraphExecKernelNodeSetParams on ROCm) for parameter setup.
+// Launching kernels via this class ensures proper management.
+//
+// NOTE: It's essential that the first parameter for any kernel launched
+// through this class is an `unsigned int` identifier. This identifier plays a
+// crucial role in linking the CUDA kernel to its corresponding CUDA graph
+// node. We tag each kernel launch with a unique identifier to maintain
+// structured linkage with its CUDA graph node.
+//
+// NOTE: This class uses a singleton design pattern to ensure there is only a
+// single global instance, accessible via the `Instance()` method.
+class CUDAGraphNodeLauncher {
+ public:
+  // [Parameter Setter Callback]
+  // Sets the kernel's parameters BEFORE activating the CUDA graph. It enables
+  // dynamic determination and setup of kernel arguments.
+  //
+  //   parameterSetter_t parameterSetter = [saved_state](gpuKernelParams
+  //                                                         &param) {
+  //     // Code to compute and set the parameter values from the saved_state
+  //     // ...
+  //     param.As<T>(idx) = calculated_value;
+  //   };
+  using parameterSetter_t = std::function<void(gpuKernelParams &)>;
+
+  // [CUDA Kernel Callback]
+  // Acts as the launcher for the kernel. It accepts an `unsigned int`
+  // identifier and uses it for the kernel launch.
+  // The `cudaGetFuncBySymbol` method can be used to fetch the `cudaFunction_t`
+  // reference of the kernel from the kernel pointer.
+  //   gpuKernelCallback_t cudaKernelCallback = [=](unsigned int id) {
+  //     // cudaFunction_t is REQUIRED to get here
+  //     cudaFunction_t cudaFunc;
+  //     PADDLE_ENFORCE_GPU_SUCCESS(cudaGetFuncBySymbol(&cudaFunc, &kernel));
+  //
+  //     kernel<<<>>>(id, ...);  // Launching the kernel with id
+  //     return cudaFunc;
+  //   };
+  using gpuKernelCallback_t = std::function<hipFunction_t(unsigned int)>;
+
+  // [Kernel Launch]
+  // With the callbacks defined and the CUDA function obtained, the kernel can
+  // be launched using the `KernelNodeLaunch` method.
+  void KernelNodeLaunch(parameterSetter_t parameterSetter,
+                        gpuKernelCallback_t cudakernelCallback);
+
+  std::vector<cudaGraphExecuterSetter_t> GetParameterSettersForExecGraph(
+      hipGraph_t graph);
+
+  parameterSetter_t GetParameterSetter(const gpuKernelParams &params);
+
+  static CUDAGraphNodeLauncher &Instance() {
+    static CUDAGraphNodeLauncher *launcher = new CUDAGraphNodeLauncher;
+    return *launcher;
+  }
+
+ private:
+  CUDAGraphNodeLauncher() : id(0) {}
+  DISABLE_COPY_AND_ASSIGN(CUDAGraphNodeLauncher);
+
+  unsigned int GenerateIdentifier() { return id++; }
+
+  unsigned int id;
+  std::unordered_map<hipFunction_t, std::map<unsigned int, parameterSetter_t>>
+      parameterSetters;
+};
+
+#if defined(PADDLE_WITH_HIP)
+static void ThrowErrorIfNotSupportCUDAGraph() {}
+#else
+enum gpuStreamCaptureMode {
+  hipStreamCaptureModeGlobal = 0,
+  hipStreamCaptureModeThreadLocal = 1,
+  hipStreamCaptureModeRelaxed = 2
+};
+static void ThrowErrorIfNotSupportCUDAGraph() {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "CUDA Graph is only supported when CUDA version >= 10.1"));
+}
+#endif
+
+using CUDAGraphID = unsigned long long;  // NOLINT
+
+// NOTE: Currently, we do not support capturing CUDA Graphs in parallel.
+// NOTE: Do not use this class directly because it should be used with
+//       the memory pool.
+class CUDAGraph {
+  DISABLE_COPY_AND_ASSIGN(CUDAGraph);
+
+  // Since the constructor would throw an error if CUDA_VERSION < 10010,
+  // the non-static methods of CUDAGraph need not check CUDA_VERSION
+  // again.
+ CUDAGraph() { + ThrowErrorIfNotSupportCUDAGraph(); + id_ = UniqueID(); + } + + public: + static constexpr int64_t kDefaultPoolID = 0; + static constexpr int64_t kInvalidPoolID = -1; + + ~CUDAGraph() { Reset(); } + + CUDAGraphID ID() const { return id_; } + + static int64_t SetMemoryPoolID(int64_t pool_id) { + auto &pool_id_ = capturing_graph_->pool_id_; + PADDLE_ENFORCE_EQ( + pool_id_, + kInvalidPoolID, + phi::errors::InvalidArgument("Cannot reset memory pool id twice, the " + "former memory pool id is %d.", + pool_id_)); + if (pool_id <= kInvalidPoolID) { + pool_id_ = UniqueMemoryPoolID(); + } else { + PADDLE_ENFORCE_GE( + pool_id, + kDefaultPoolID, + phi::errors::InvalidArgument("Invalid memory pool id %d.", pool_id)); + pool_id_ = pool_id; + } + return pool_id_; + } + + int64_t PoolID() const { return pool_id_; } + + static int64_t CapturingPoolID() { return capturing_graph_->pool_id_; } + + void Replay(); + + void Reset(); + + void AddPostResetCallback(std::function callback) { + std::lock_guard guard(mtx_); + cudagraph_post_reset_callbacks_.push_back(std::move(callback)); + } + + void AddPostCaptureCallback(std::function callback) { + std::lock_guard guard(mtx_); + cudagraph_post_capture_callbacks_.push_back(std::move(callback)); + } + + void PrintToDotFiles(const std::string &dirname, unsigned int flags); + + static void BeginCapture(phi::GPUPlace place, + gpuStream_t stream, + gpuStreamCaptureMode mode); + static std::unique_ptr EndCapture(); + + static void BeginSegmentCapture(); + static void EndSegmentCapture(); + + static void AddPostResetCallbackDuringCapturing( + std::function callback) { + capturing_graph_->AddPostResetCallback(std::move(callback)); + } + + static void AddPostCaptureCallbackDuringCapturing( + std::function callback) { + capturing_graph_->AddPostCaptureCallback(std::move(callback)); + } + + // No need to add CUDA_VERSION macro because capturing_graph_ would + // always be nullptr (constructor throws error) + static bool IsCapturing() { return capturing_graph_ != nullptr; } + + static CUDAGraphID CapturingID() { return capturing_graph_->id_; } + + static phi::GPUPlace CapturingPlace() { return capturing_graph_->place_; } + + // This API can be used to debug which GPU operation is not + // supported during capturing CUDA Graph. + static bool IsValidCapturing(); + + static bool IsThreadLocalCapturing() { +#if defined(PADDLE_WITH_HIP) + return IsCapturing() && + capturing_graph_->capture_mode_ == hipStreamCaptureModeThreadLocal; +#else + return false; +#endif + } + + static bool IsThisThreadCapturing() { + if (UNLIKELY(IsCapturing())) { + return IsThreadLocalCapturing() + ? capturing_thread_id_.get() == std::this_thread::get_id() + : true; + } else { + return false; + } + } + + using SetSeedFunc = std::function; + static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) { + std::lock_guard guard(capturing_graph_->func_mtx_); + capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func)); + } + + static int64_t UniqueMemoryPoolID(); + + private: + static CUDAGraphID UniqueID(); + + private: +#if defined(PADDLE_WITH_HIP) + std::vector graphs_; + std::vector exec_graphs_; + gpuStreamCaptureMode capture_mode_; +#endif + gpuStream_t stream_{nullptr}; + phi::GPUPlace place_; + CUDAGraphID id_; + int64_t pool_id_{kInvalidPoolID}; + bool is_reset_{false}; + std::mutex mtx_; + + std::vector set_seed_funcs_; + + // Holds callbacks that are triggered after the CUDA graph is reset. 
These + // callbacks are used for operations that need to be performed following the + // reset of a CUDA graph. + std::vector> cudagraph_post_reset_callbacks_; + + // Contains callbacks that are invoked after the CUDA graph has been captured. + // These callbacks are crucial for managing memory allocations related to the + // CUDA graph. They ensure that memory blocks not associated with a graph (as + // detailed in cuda_malloc_async_allocator) are not erroneously released + // during the graph's lifecycle. + std::vector> cudagraph_post_capture_callbacks_; + + // Maintains a collection of 'pre-hooks' - functions that are executed before + // the CUDA graph is replayed. These pre-hooks are essential for setting up + // the necessary conditions or states required for the correct execution of + // the CUDA graph. + std::vector> + cudagraph_pre_replay_callbacks_; + + std::mutex func_mtx_; + + bool is_first_run_{true}; + + static paddle::optional capturing_thread_id_; + static std::unique_ptr capturing_graph_; +}; + +#if defined(PADDLE_WITH_HIP) +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard( + gpuStreamCaptureMode mode = hipStreamCaptureModeRelaxed) { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_GPU_SUCCESS(hipThreadExchangeStreamCaptureMode(&mode)); + // After cudaThreadExchangeStreamCaptureMode is called, + // the variable "mode" would be set to the old capturing mode. + old_mode_ = mode; + } + } + + ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_GPU_SUCCESS( + hipThreadExchangeStreamCaptureMode(&old_mode_)); + } + } + + private: + gpuStreamCaptureMode old_mode_; +}; +#else +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard( + gpuStreamCaptureMode mode = hipStreamCaptureModeRelaxed) {} +}; +#endif + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index edc23479c9238..b8ddea98b5c9e 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -173,7 +173,7 @@ int GetCurrentDeviceId() { return device_id; } -std::array GetGpuMaxGridDimSize(int id) { +std::array GetGpuMaxGridDimSize(int id) { PADDLE_ENFORCE_LT( id, GetGPUDeviceCount(), @@ -181,7 +181,7 @@ std::array GetGpuMaxGridDimSize(int id) { "but received id is: %d. 
GPU count is: %d.", id, GetGPUDeviceCount())); - std::array ret; + std::array ret; int size; auto error_code_x = hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index 6169681885b7b..6cf80c350cd04 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -14,8 +14,10 @@ #include "paddle/phi/core/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/rocm/hip_graph.h" #endif #include "paddle/phi/core/dense_tensor.h" @@ -70,7 +72,7 @@ struct DeviceContext::Impl { pinned_allocator_ = allocator; } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void SetCUDAGraphAllocator(const Allocator* allocator) { // NOTE (Yuang): cuda graph allocator can be set to nullptr, so don't check // validation of the allocator here @@ -163,7 +165,7 @@ struct DeviceContext::Impl { (fake_alloc || tensor->numel() == 0) && requested_size == 0 ? zero_allocator_ : (pinned ? pinned_allocator_ : device_allocator_); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool must_cuda_graph_allocator = (!fake_alloc && tensor->numel() != 0) && !pinned; if (must_cuda_graph_allocator && @@ -289,7 +291,7 @@ struct DeviceContext::Impl { const Allocator* zero_allocator_{nullptr}; const Allocator* host_zero_allocator_{nullptr}; const Allocator* pinned_allocator_{nullptr}; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const Allocator* cuda_graph_allocator_{nullptr}; #endif Generator* device_generator_{nullptr}; @@ -309,7 +311,7 @@ DeviceContext::DeviceContext(const DeviceContext& other) { impl_->SetPinnedAllocator(&other.GetPinnedAllocator()); impl_->SetHostGenerator(other.GetHostGenerator()); impl_->SetGenerator(other.GetGenerator()); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (other.IsCUDAGraphAllocatorValid()) { impl_->SetCUDAGraphAllocator(&other.GetCUDAGraphAllocator()); } @@ -340,7 +342,7 @@ const Allocator& DeviceContext::GetHostAllocator() const { return impl_->GetHostAllocator(); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void DeviceContext::SetCUDAGraphAllocator(const Allocator* allocator) { impl_->SetCUDAGraphAllocator(allocator); } diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index 25d748c915086..9ead0e2c32b23 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -115,7 +115,7 @@ class PADDLE_API DeviceContext { const Allocator& GetPinnedAllocator() const; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * @brief Set the CUDA graph Allocator object. 
* diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 80d61ebc9a9a6..304fd3cef793a 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -213,6 +213,7 @@ if(WITH_ROCM) "gpu/put_along_axis_grad_kernel.cu" "gpu/put_along_axis_kernel.cu" "gpu/qr_kernel.cu" + "gpu/rms_norm_grad_kernel.cu" "gpu/svd_kernel.cu" "gpudnn/mha_cudnn_frontend.cu" "fusion/gpu/block_multi_head_attention_kernel.cu" diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index 03bc6ca85efed..463272a37c00d 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -368,7 +368,7 @@ void DropoutFwGPUKernelDriver( phi::backends::gpu::CUDAGraphNodeLauncher::parameterSetter_t parameterSetter = [offset, dev_ctx_p, state_index, is_fix_seed]( - phi::backends::gpu::CUDAKernelParams& params) { + phi::backends::gpu::gpuKernelParams& params) { if (!is_fix_seed) { // we assume seed is null pointer // seed copy to cpu is meaningless here @@ -389,7 +389,7 @@ void DropoutFwGPUKernelDriver( } }; - phi::backends::gpu::CUDAGraphNodeLauncher::cudaKernelCallback_t + phi::backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t cudaKernelCallback = [=](unsigned int id) { void* functionPtr = reinterpret_cast(&(VectorizedRandomGenerator)); diff --git a/paddle/phi/kernels/funcs/segmented_array.h b/paddle/phi/kernels/funcs/segmented_array.h index e6ecb9819e505..4b4b1b59db66e 100644 --- a/paddle/phi/kernels/funcs/segmented_array.h +++ b/paddle/phi/kernels/funcs/segmented_array.h @@ -118,7 +118,7 @@ struct ArraySetterBase { phi::Stream(reinterpret_cast(ctx.stream()))); int8_t* restored = reinterpret_cast(src); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (use_cuda_graph) { restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( restored, num_bytes); diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu index ff6380ceeec0a..801f070251fb2 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu @@ -218,7 +218,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx, // seed_offset_data should preserved by cudaGraph pool const phi::GPUContext* dev_ctx_p = &dev_ctx; auto parameterSetter = [offset, dev_ctx_p, seed_offset]( - phi::backends::gpu::CUDAKernelParams& params) { + phi::backends::gpu::gpuKernelParams& params) { const auto* seed_offset_data = seed_offset.data(); const uint64_t seed_data = static_cast(seed_offset_data[0]); const uint64_t increment = static_cast(seed_offset_data[1]); @@ -229,7 +229,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx, << ", increment = " << increment; }; - phi::backends::gpu::CUDAGraphNodeLauncher::cudaKernelCallback_t + phi::backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t cudaKernelCallback = [=](unsigned int id) { void* functionPtr = reinterpret_cast( &(VectorizedDropoutBackward>)); diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu index 5ec23e777211b..c95c5fbf0ca3d 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu @@ -211,7 +211,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx, seed_offset_data, state_index, seed_tensor_ptr, - 
fix_seed](phi::backends::gpu::CUDAKernelParams& params) { + fix_seed](phi::backends::gpu::gpuKernelParams& params) { if (!fix_seed) { auto gen_cuda = dev_ctx_p->GetGenerator(); // ensure the generator use correct state index @@ -233,7 +233,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx, seed_offset_data[1] = static_cast(increment); } }; - phi::backends::gpu::CUDAGraphNodeLauncher::cudaKernelCallback_t + phi::backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t cudaKernelCallback = [=](unsigned int id) { void* functionPtr = reinterpret_cast( &(VectorizedDropoutForward>));
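For context on how these renamed types are used: the dropout call sites above follow the CUDAGraphNodeLauncher contract from hip_graph.h, where the launcher-issued id is the kernel's first argument, the gpuKernelCallback_t performs the actual launch and returns the kernel's function handle, and the parameterSetter_t patches arguments before every replay. The sketch below is illustrative only: `MyDropoutKernel`, `LookupKernelHandle`, the launch configuration, and the argument indices are assumptions, not part of the patch.

    #include <cstdint>

    #include "paddle/phi/backends/gpu/rocm/hip_graph.h"

    namespace gpu = phi::backends::gpu;

    // Hypothetical kernel; the launcher contract requires the launcher-issued
    // id to be the first parameter.
    __global__ void MyDropoutKernel(unsigned int id, uint64_t seed,
                                    uint64_t increment, float *out, int n);

    // Assumed helper mapping a kernel pointer to its hipFunction_t handle
    // (the CUDA path obtains this via cudaGetFuncBySymbol).
    hipFunction_t LookupKernelHandle(const void *kernel_ptr);

    // `seed_offset_data` points to host memory holding {seed, increment}; it
    // is refreshed elsewhere between replays, like in the dropout kernels.
    void LaunchDropoutForGraph(const phi::GPUContext &ctx,
                               const int64_t *seed_offset_data,
                               float *out,
                               int n) {
      auto parameter_setter = [seed_offset_data](gpu::gpuKernelParams &params) {
        // Patch kernel arguments 1 and 2 (seed, increment) before each replay.
        params.As<uint64_t>(1) = static_cast<uint64_t>(seed_offset_data[0]);
        params.As<uint64_t>(2) = static_cast<uint64_t>(seed_offset_data[1]);
      };

      auto kernel_callback = [=, &ctx](unsigned int id) -> hipFunction_t {
        MyDropoutKernel<<<1, 256, 0, ctx.stream()>>>(
            id,
            static_cast<uint64_t>(seed_offset_data[0]),
            static_cast<uint64_t>(seed_offset_data[1]),
            out,
            n);
        return LookupKernelHandle(
            reinterpret_cast<const void *>(&MyDropoutKernel));
      };

      gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(parameter_setter,
                                                              kernel_callback);
    }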