[DCU] fix compile error on develop (#62832)
* [DCU] fix build error, test=develop

* fix py3 cpu ci build error
qili93 authored Mar 20, 2024
1 parent 4f06a9c commit 6925c9d
Showing 38 changed files with 1,204 additions and 132 deletions.
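Every hunk below makes the same change: guards spelled `#ifdef PADDLE_WITH_CUDA` are widened to `#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)`, so code shared by the CUDA and ROCm/HIP backends (Paddle's DCU builds define `PADDLE_WITH_HIP`) is compiled for both. A minimal, self-contained sketch of the pattern; the names below are illustrative, not Paddle's:

```cpp
#include <iostream>

// Shared GPU code must be visible whenever EITHER backend macro is
// defined; a CUDA-only #ifdef hides it from HIP builds, and the DCU
// compile then fails on the symbols it expected to find.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
constexpr const char* kBackend = "gpu";  // NVIDIA CUDA or DCU/ROCm HIP
#else
constexpr const char* kBackend = "cpu";  // CPU-only build (e.g. the py3 CPU CI)
#endif

int main() {
  // Build with -DPADDLE_WITH_HIP (or -DPADDLE_WITH_CUDA) to take the GPU path.
  std::cout << "built for: " << kBackend << "\n";
  return 0;
}
```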
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -70,7 +70,7 @@ static void RunProgramDescs(const ProgramDescs &programs,

FetchResultType ScopeBufferedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors, bool return_merged) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
strategy_.num_iteration_per_drop_scope_ =
std::numeric_limits<size_t>::max();
12 changes: 6 additions & 6 deletions paddle/fluid/framework/new_executor/pir_interpreter.cc
@@ -145,7 +145,7 @@ PirInterpreter::PirInterpreter(const platform::Place& place,
<< std::chrono::high_resolution_clock::now().time_since_epoch().count();
BuildScope(*ir_block_, ss.str(), value_exe_info_.get());

-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
calculate_stream_timer_ = std::make_unique<phi::CalculateStreamTimer>(place);
#endif
}
@@ -299,7 +299,7 @@ void PirInterpreter::ShareBuildResultsFrom(const InterpreterBaseImpl& src) {

std::tuple<double, double> PirInterpreter::InterpreterRunTime() {
double start_time = 0, end_time = 0;
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
start_time = calculate_stream_timer_->StartTime();
end_time = calculate_stream_timer_->EndTime();
#endif
@@ -337,7 +337,7 @@ std::shared_ptr<interpreter::AsyncWorkQueue> PirInterpreter::GetWorkQueue() {

void PirInterpreter::PrepareForCUDAGraphCapture() {
if (!FLAGS_new_executor_use_cuda_graph) return;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
platform::IsCUDAGraphCapturing(),
false,
@@ -362,7 +362,7 @@ void PirInterpreter::PrepareForCUDAGraphCapture() {

void PirInterpreter::CheckCUDAGraphBeforeRun(
const std::vector<std::string>& feed_names) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(
feed_names.empty(),
@@ -1724,7 +1724,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) {

try {
instr_node->WaitEvent(cur_place);
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (enable_job_schedule_profiler_) {
std::string op_name = instr_node->Name();
::pir::Operation* op = instr_node->Operation();
@@ -1772,7 +1772,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) {
}
VLOG(5) << "after run kernel";
instr_node->RecordEvent(cur_place);
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (enable_job_schedule_profiler_) {
if (instr_node->Id() == last_calculate_instr_id_ &&
calculate_stream_timer_->IsStarted()) {
4 changes: 2 additions & 2 deletions paddle/fluid/framework/new_executor/pir_interpreter.h
@@ -18,7 +18,7 @@
#include "paddle/fluid/framework/new_executor/interpreter_base_impl.h"
#include "paddle/pir/include/core/value.h"

-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/phi/kernels/autotune/gpu_timer.h"
#endif

@@ -274,7 +274,7 @@ class PirInterpreter : public InterpreterBaseImpl {
// belongs to a parameter and cannot GC.
std::unordered_set<std::string> parameter_var_names_;

-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::unique_ptr<phi::CalculateStreamTimer> calculate_stream_timer_;
#endif
size_t last_calculate_instr_id_;
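The header hunk above is the crux of this kind of compile error: `calculate_stream_timer_` is declared under a guard, and every use of it in pir_interpreter.cc must sit behind the same guard. A reduced sketch of that invariant; the class and timer here are stand-ins, not Paddle's actual types:

```cpp
#include <memory>

// Stand-in for phi::CalculateStreamTimer; illustrative only.
struct StreamTimer {
  double StartTime() const { return 0.0; }
  double EndTime() const { return 0.0; }
};

class Interpreter {
 public:
  double RunTime() const {
    double start = 0.0, end = 0.0;
    // This guard must match the member's guard below: if the uses are
    // widened to HIP but the declaration stays CUDA-only, HIP builds
    // fail with "no member named 'timer_'".
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    start = timer_->StartTime();
    end = timer_->EndTime();
#endif
    return end - start;
  }

 private:
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  std::unique_ptr<StreamTimer> timer_ = std::make_unique<StreamTimer>();
#endif
};
```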
10 changes: 5 additions & 5 deletions paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -191,7 +191,7 @@ FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names,
if (fetch_var) {
auto fetch_list =
std::move(*fetch_var->GetMutable<framework::FetchList>());
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_list.empty(),
true,
@@ -269,7 +269,7 @@ FetchList ProgramInterpreter::Run(
if (fetch_var) {
auto fetch_list =
std::move(*fetch_var->GetMutable<framework::FetchList>());
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_list.empty(),
true,
@@ -533,7 +533,7 @@ void ProgramInterpreter::BuildInplace() {

void ProgramInterpreter::PrepareForCUDAGraphCapture() {
if (!FLAGS_new_executor_use_cuda_graph) return;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
platform::IsCUDAGraphCapturing(),
false,
@@ -579,7 +579,7 @@ void ProgramInterpreter::PrepareForCUDAGraphCapture() {

void ProgramInterpreter::CheckCUDAGraphBeforeRun(
const std::vector<std::string>& feed_names) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(
feed_names.empty(),
@@ -862,7 +862,7 @@ void ProgramInterpreter::BuildOpFuncNode(
auto& op_func_node = nodes[op_idx];
stream_analyzer_.SetForceEventsToWaitInfo(force_events_to_wait_);
auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_new_executor_use_cuda_graph) {
auto& op = op_func_node.operator_base_;
auto& op_type = op->Type();
4 changes: 2 additions & 2 deletions paddle/fluid/framework/parallel_executor.cc
@@ -1416,7 +1416,7 @@ void ParallelExecutor::PreludeToRun(
platform::RecordEvent record_run(
"ParallelExecutor::Run", platform::TracerEventType::UserDefined, 1);
VLOG(3) << "enter ParallelExecutor Run";
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_tensors.empty(),
true,
@@ -1804,7 +1804,7 @@ const ir::Graph &ParallelExecutor::Graph() const {
void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) {
const auto &build_strategy = member_->build_strategy_;
if (!build_strategy.allow_cuda_graph_capture_) return;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
build_strategy.async_mode_,
false,
22 changes: 14 additions & 8 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -2691,7 +2691,7 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() {
int32_tensor.data<int>(),
int32_tensor.numel() * sizeof(int));
} else if (platform::is_gpu_place(tensor->place())) {
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto *dev_ctx = pool.Get(tensor->place());
auto &int32_tensor = *tensor;
if (tensor->dtype() == phi::DataType::INT64) {
@@ -2914,7 +2914,7 @@ bool AnalysisPredictor::LoadParameters() {
}

uint64_t AnalysisPredictor::TryShrinkMemory() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (config_.use_gpu()) {
paddle::platform::EmptyCache();
}
@@ -3607,39 +3607,39 @@ bool InternalUtils::RunWithRuntimeConfig(paddle_infer::Predictor *p,

void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c,
bool with_interleaved) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
c->trt_with_interleaved_ = with_interleaved;
#endif
}

void InternalUtils::SetTransformerPosid(
paddle_infer::Config *c, const std::string &tensorrt_transformer_posid) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
c->tensorrt_transformer_posid_ = tensorrt_transformer_posid;
#endif
}

void InternalUtils::SetTransformerMaskid(
paddle_infer::Config *c, const std::string &tensorrt_transformer_maskid) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
c->tensorrt_transformer_maskid_ = tensorrt_transformer_maskid;
#endif
}

void InternalUtils::DisableTensorRtHalfOps(
paddle_infer::Config *c, const std::unordered_set<std::string> &ops) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
c->trt_ops_run_float_ = ops;
#endif
}

void InternalUtils::SyncStream(paddle_infer::Predictor *p) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto *pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get());
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto *dev_ctx = reinterpret_cast<phi::GPUContext *>(pool.Get(pred->place_));
-  cudaStreamSynchronize(dev_ctx->stream());
+  paddle::gpuStreamSynchronize(dev_ctx->stream());
#endif
}
void InternalUtils::SyncStream(cudaStream_t stream) {
@@ -3648,5 +3648,11 @@ void InternalUtils::SyncStream(cudaStream_t stream) {
#endif
}

+void InternalUtils::SyncStream(hipStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+  hipStreamSynchronize(stream);
+#endif
+}

} // namespace experimental
} // namespace paddle_infer
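The `SyncStream` hunk compiles on both backends because it goes through a backend-neutral `paddle::gpuStreamSynchronize` alias instead of calling `cudaStreamSynchronize` directly. Paddle keeps its real aliases in its GPU compatibility headers; the mapping below is only a sketch of the idea, not Paddle's actual definitions:

```cpp
// Illustrative backend-neutral aliases; the real spellings and locations
// in Paddle's GPU type headers may differ.
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
inline hipError_t gpuStreamSynchronize(gpuStream_t s) {
  return hipStreamSynchronize(s);  // blocks until all work on s finishes
}
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
inline cudaError_t gpuStreamSynchronize(gpuStream_t s) {
  return cudaStreamSynchronize(s);  // blocks until all work on s finishes
}
#endif
```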
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/paddle_api.h
@@ -523,6 +523,7 @@ class PD_INFER_DECL InternalUtils {

static void SyncStream(paddle_infer::Predictor* pred);
static void SyncStream(cudaStream_t stream);
+  static void SyncStream(hipStream_t stream);
template <typename T>
static void CopyFromCpuWithIoStream(paddle_infer::Tensor* t,
const T* data,
20 changes: 13 additions & 7 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -39,8 +39,10 @@
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA)
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
+#elif defined(PADDLE_WITH_HIP)
+#include "paddle/phi/backends/gpu/rocm/hip_graph.h"
#endif

#if CUDA_VERSION >= 10020
@@ -49,6 +51,10 @@
#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#endif

+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/memory/allocation/cuda_malloc_async_allocator.h" // NOLINT
+#endif
#endif

#ifdef PADDLE_WITH_XPU
@@ -107,7 +113,7 @@ namespace paddle {
namespace memory {
namespace allocation {

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class CUDAGraphAllocator
: public Allocator,
public std::enable_shared_from_this<CUDAGraphAllocator> {
@@ -158,7 +164,7 @@ class CUDAGraphAllocator
#endif

static bool IsCUDAGraphCapturing() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing());
#else
return false;
@@ -329,7 +335,7 @@ class AllocatorFacadePrivate {

CheckAllocThreadSafe();

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator
if (!is_stream_safe_cuda_allocator_used_ &&
UNLIKELY(IsCUDAGraphCapturing())) {
@@ -1120,7 +1126,7 @@ class AllocatorFacadePrivate {
allocator = std::make_shared<StatAllocator>(allocator);
}

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void WrapCUDAGraphAllocator() {
for (auto& item : allocators_) {
auto& allocator = item.second;
@@ -1511,7 +1517,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
}

AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// if we use cuda_malloc_async_allocator, we don't need to open a private pool
// for each graph
if (UNLIKELY(IsCUDAGraphCapturing()) &&
@@ -1702,7 +1708,7 @@ void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
}
}

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(int64_t id) {
PADDLE_ENFORCE_EQ(GetAllocatorStrategy(),
AllocatorStrategy::kAutoGrowth,
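`WrapCUDAGraphAllocator()` above swaps each allocator for a graph-aware decorator while a capture is in flight, so allocations made during capture stay valid when the graph replays. A reduced sketch of the decorator idea, with a deliberately simplified interface rather than Paddle's actual `Allocator` API:

```cpp
#include <cstddef>
#include <memory>
#include <vector>

// Simplified allocator interface; Paddle's real Allocator differs.
struct Allocator {
  virtual ~Allocator() = default;
  virtual void* Allocate(std::size_t n) = 0;
  virtual void Free(void* p) = 0;
};

// Decorator that retains every allocation made during graph capture so
// the memory outlives the capture, mirroring a per-graph memory pool.
class GraphAllocator : public Allocator {
 public:
  explicit GraphAllocator(std::shared_ptr<Allocator> underlying)
      : underlying_(std::move(underlying)) {}

  ~GraphAllocator() override {
    // Blocks are released only when the graph (and this wrapper) die.
    for (void* p : retained_) underlying_->Free(p);
  }

  void* Allocate(std::size_t n) override {
    void* p = underlying_->Allocate(n);
    retained_.push_back(p);  // keep alive for the lifetime of the graph
    return p;
  }

  void Free(void* /*p*/) override {
    // Deliberately deferred: see the destructor.
  }

 private:
  std::shared_ptr<Allocator> underlying_;
  std::vector<void*> retained_;
};
```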
4 changes: 2 additions & 2 deletions paddle/fluid/memory/allocation/allocator_facade.h
@@ -95,7 +95,7 @@ class AllocatorFacade {
void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream);
#endif

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void PrepareMemoryPoolForCUDAGraph(int64_t id);
void RemoveMemoryPoolOfCUDAGraph(int64_t id);
#endif
@@ -116,7 +116,7 @@
private:
AllocatorFacade();
AllocatorFacadePrivate* m_;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::unordered_map<int64_t, std::unique_ptr<AllocatorFacadePrivate>>
cuda_graph_map_;
std::unordered_map<int64_t, int64_t> cuda_graph_ref_cnt_;
9 changes: 4 additions & 5 deletions paddle/fluid/memory/allocation/cuda_ipc_allocator.cc
@@ -47,17 +47,16 @@ std::shared_ptr<void> GetIpcBasePtr(std::string handle) {
// The IpcMemHandle can only open once for the same handle,
// so here we cache it here.
void *baseptr = nullptr;
-  auto ipc_handle =
-      reinterpret_cast<const cudaIpcMemHandle_t *>(handle.c_str());
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle(
-      &baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+  auto ipc_handle = reinterpret_cast<const gpuIpcMemHandle_t *>(handle.c_str());
+  PADDLE_ENFORCE_GPU_SUCCESS(gpuIpcOpenMemHandle(
+      &baseptr, *ipc_handle, gpuIpcMemLazyEnablePeerAccess));
// Close ipc handle on the same device.
int device_id = platform::GetCurrentDeviceId();
// Add deleter to close ipc handle.
auto sp = std::shared_ptr<void>(baseptr, [handle, device_id](void *ptr) {
platform::CUDADeviceGuard guard(device_id);
std::lock_guard<std::mutex> lock(ipc_mutex_);
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr));
+    PADDLE_ENFORCE_GPU_SUCCESS(gpuIpcCloseMemHandle(ptr));
ipc_handle_to_baseptr_.erase(handle);
VLOG(6) << "cudaIpcCloseMemHandle for ptr:"
<< "\t" << ptr;
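The IPC hunk builds on both stacks because the `gpuIpc*` names resolve to the matching `cudaIpc*` or `hipIpc*` runtime symbols per backend. An illustrative mapping follows; Paddle's real aliases live in its GPU compatibility headers, so the exact spellings here are assumptions:

```cpp
// Hypothetical backend mapping for the gpuIpc* names used in the diff.
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuIpcMemHandle_t = hipIpcMemHandle_t;
#define gpuIpcOpenMemHandle hipIpcOpenMemHandle
#define gpuIpcCloseMemHandle hipIpcCloseMemHandle
#define gpuIpcMemLazyEnablePeerAccess hipIpcMemLazyEnablePeerAccess
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuIpcMemHandle_t = cudaIpcMemHandle_t;
#define gpuIpcOpenMemHandle cudaIpcOpenMemHandle
#define gpuIpcCloseMemHandle cudaIpcCloseMemHandle
#define gpuIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
#endif
```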
[Diff truncated: the remaining 28 changed files are not shown.]
