use dynload for cufft (PaddlePaddle#46)
* use std::ptrdiff_t as the data type for strides (instead of int64_t) to avoid argument-type mismatches on some platforms (see the sketch after this list)

* add complex support for fill_zeros_like

* use dynload for cufft
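
A quick illustration of the mismatch behind the first bullet. This is a minimal sketch, assuming a pocketfft-style backend that takes byte strides as std::vector<std::ptrdiff_t>; `fft_backend` is a hypothetical stand-in, not an actual Paddle or pocketfft symbol:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for an FFT backend entry point that takes byte
// strides as std::vector<std::ptrdiff_t> (as pocketfft-style APIs do).
void fft_backend(const std::vector<std::ptrdiff_t>& /*strides*/) {}

int main() {
  // On LP64 Linux both int64_t and std::ptrdiff_t are long, so
  // std::vector<int64_t> and std::vector<std::ptrdiff_t> are the same type
  // and either compiles. On platforms where int64_t is long long, the two
  // vector types are distinct and the commented call fails to compile:
  //
  //   std::vector<int64_t> strides = {8, 64};
  //   fft_backend(strides);  // error: no matching function for call
  //
  // Building the vector as std::ptrdiff_t matches the API on every platform.
  std::vector<std::ptrdiff_t> strides = {8, 64};
  fft_backend(strides);
}
```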
Feiyu Chan authored Sep 15, 2021
1 parent 2cb21c0 commit 5ccdf98
Showing 11 changed files with 270 additions and 62 deletions.
47 changes: 24 additions & 23 deletions paddle/fluid/operators/CMakeLists.txt
@@ -98,29 +98,30 @@ else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()

op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS ${OP_HEADER_DEPS})
if (WITH_GPU)
find_library(CUFFT_LIB libcufft.so
PATHS
${CUDA_TOOLKIT_ROOT_DIR}/lib64/
NO_DEFAULT_PATH
)
target_link_libraries(spectral_op ${CUFFT_LIB})
endif()
if(WITH_ONEMKL)
find_library(ONEMKL_CORE libmkl_core.so
PATHS
${MKL_ROOT}/lib/${MKL_ARCH}
NO_DEFAULT_PATH
)
find_library(ONEMKL_THREAD libmkl_intel_thread.so
PATHS
${MKL_ROOT}/lib/${MKL_ARCH}
NO_DEFAULT_PATH
)
target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE})
target_link_libraries(spectral_op MKL::mkl_core MKL::mkl_intel_thread)
endif()
op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS})
# op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS ${OP_HEADER_DEPS})
# if (WITH_GPU)
# find_library(CUFFT_LIB libcufft.so
# PATHS
# ${CUDA_TOOLKIT_ROOT_DIR}/lib64/
# NO_DEFAULT_PATH
# )
# target_link_libraries(spectral_op ${CUFFT_LIB})
# endif()
# if(WITH_ONEMKL)
# find_library(ONEMKL_CORE libmkl_core.so
# PATHS
# ${MKL_ROOT}/lib/${MKL_ARCH}
# NO_DEFAULT_PATH
# )
# find_library(ONEMKL_THREAD libmkl_intel_thread.so
# PATHS
# ${MKL_ROOT}/lib/${MKL_ARCH}
# NO_DEFAULT_PATH
# )
# target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE})
# target_link_libraries(spectral_op MKL::mkl_core MKL::mkl_intel_thread)
# endif()

op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
op_library(eye_op DEPS ${OP_HEADER_DEPS})
13 changes: 11 additions & 2 deletions paddle/fluid/operators/fill_zeros_like_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/fill_zeros_like_op.h"
#include "paddle/fluid/platform/complex.h"

namespace paddle {
namespace operators {
@@ -93,12 +94,20 @@ REGISTER_OP_CPU_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, float>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>);
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);

REGISTER_OP_CPU_KERNEL(
fill_zeros_like2,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, float>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>);
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
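
The new instantiations work because the kernel only needs to write a zero of the element type; a minimal standalone sketch of that idea (using std::complex as a stand-in for paddle::platform::complex, and std::vector for the tensor buffer):

```cpp
#include <algorithm>
#include <complex>
#include <vector>

// Filling with static_cast<T>(0) is all fill_zeros_like requires of T, so a
// complex element type qualifies: complex<T> is constructible from scalar 0.
template <typename T>
void FillZerosLike(std::vector<T>* out) {
  std::fill(out->begin(), out->end(), static_cast<T>(0));
}

int main() {
  std::vector<std::complex<float>> buf(4, {1.0f, 2.0f});
  FillZerosLike(&buf);  // buf is now four copies of (0, 0)
}
```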
13 changes: 11 additions & 2 deletions paddle/fluid/operators/fill_zeros_like_op.cu.cc
@@ -14,6 +14,7 @@ limitations under the License. */

#include "paddle/fluid/operators/fill_zeros_like_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"

namespace ops = paddle::operators;
@@ -25,7 +26,11 @@ REGISTER_OP_CUDA_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);

REGISTER_OP_CUDA_KERNEL(
fill_zeros_like2,
@@ -35,4 +40,8 @@ REGISTER_OP_CUDA_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
48 changes: 31 additions & 17 deletions paddle/fluid/operators/spectral_op.cc
@@ -240,15 +240,29 @@ class FFTC2ROp : public framework::OperatorWithKernel {
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_c2r");

const auto axes = ctx->Attrs().Get<std::vector<int64_t>>("axes");
const auto x_dim = ctx->GetInputDim("X");
for (size_t i = 0; i < axes.size() - 1L; i++) {
const auto fft_n_point = (x_dim[axes[i]] - 1) * 2;
PADDLE_ENFORCE_GT(fft_n_point, 0,
platform::errors::InvalidArgument(
"Invalid fft n-point (%d).", fft_n_point));
}

const int64_t last_dim_size = ctx->Attrs().Get<int64_t>("last_dim_size");
framework::DDim out_dim(ctx->GetInputDim("X"));
const int64_t last_fft_axis = axes.back();
if (last_dim_size == 0) {
const int64_t last_fft_dim_size = out_dim.at(last_fft_axis);
out_dim.at(last_fft_axis) = (last_fft_dim_size - 1) * 2;
const int64_t fft_n_point = (last_fft_dim_size - 1) * 2;
PADDLE_ENFORCE_GT(fft_n_point, 0,
platform::errors::InvalidArgument(
"Invalid fft n-point (%d).", fft_n_point));
out_dim.at(last_fft_axis) = fft_n_point;
} else {
out_dim.at(last_fft_axis) = ctx->Attrs().Get<int64_t>("last_dim_size");
PADDLE_ENFORCE_GT(last_dim_size, 0,
platform::errors::InvalidArgument(
"Invalid fft n-point (%d).", last_dim_size));
out_dim.at(last_fft_axis) = last_dim_size;
}
ctx->SetOutputDim("Out", out_dim);
}
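
As a worked example of the inference above: rfft of a length-n real signal yields m = n/2 + 1 complex bins, so the inverse recovers n = (m - 1) * 2 when last_dim_size is unset, while an explicit last_dim_size (needed to recover odd n) takes precedence. A standalone sketch of that rule, mirroring the InferShape logic:

```cpp
#include <cassert>
#include <cstdint>

// last_dim_size == 0 means "infer from the complex input size"; otherwise
// the attribute wins. Both branches must yield a positive length, which the
// op enforces with PADDLE_ENFORCE_GT.
int64_t InferC2RLastDim(int64_t complex_size, int64_t last_dim_size) {
  if (last_dim_size == 0) {
    const int64_t fft_n_point = (complex_size - 1) * 2;
    assert(fft_n_point > 0);
    return fft_n_point;
  }
  assert(last_dim_size > 0);
  return last_dim_size;
}

int main() {
  assert(InferC2RLastDim(5, 0) == 8);  // rfft of length 8 gives 5 bins
  assert(InferC2RLastDim(5, 9) == 9);  // explicit override recovers odd n
}
```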
@@ -681,11 +695,11 @@ struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes =
framework::vectorize<size_t>(input_dim);
std::vector<int64_t> in_strides =
framework::vectorize<int64_t>(framework::stride(input_dim));
std::vector<std::ptrdiff_t> in_strides =
framework::vectorize<std::ptrdiff_t>(framework::stride(input_dim));
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[](int64_t s) { return s * data_size; });
[](std::ptrdiff_t s) { return s * data_size; });

const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
auto* out_data = reinterpret_cast<C*>(out->data<To>());
@@ -714,24 +728,24 @@ struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes =
framework::vectorize<size_t>(input_dim);
std::vector<int64_t> in_strides =
framework::vectorize<int64_t>(framework::stride(input_dim));
std::vector<std::ptrdiff_t> in_strides =
framework::vectorize<std::ptrdiff_t>(framework::stride(input_dim));
{
const int64_t data_size = sizeof(R);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[](int64_t s) { return s * data_size; });
[](std::ptrdiff_t s) { return s * data_size; });
}

const auto& output_dim = out->dims();
const std::vector<size_t> out_sizes =
framework::vectorize<size_t>(output_dim);
std::vector<int64_t> out_strides =
framework::vectorize<int64_t>(framework::stride(output_dim));
std::vector<std::ptrdiff_t> out_strides =
framework::vectorize<std::ptrdiff_t>(framework::stride(output_dim));
{
const int64_t data_size = sizeof(C);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[](int64_t s) { return s * data_size; });
[](std::ptrdiff_t s) { return s * data_size; });
}

const auto* in_data = x->data<R>();
@@ -761,24 +775,24 @@ struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes =
framework::vectorize<size_t>(input_dim);
std::vector<int64_t> in_strides =
framework::vectorize<int64_t>(framework::stride(input_dim));
std::vector<std::ptrdiff_t> in_strides =
framework::vectorize<std::ptrdiff_t>(framework::stride(input_dim));
{
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[](int64_t s) { return s * data_size; });
[](std::ptrdiff_t s) { return s * data_size; });
}

const auto& output_dim = out->dims();
const std::vector<size_t> out_sizes =
framework::vectorize<size_t>(output_dim);
std::vector<int64_t> out_strides =
framework::vectorize<int64_t>(framework::stride(output_dim));
std::vector<std::ptrdiff_t> out_strides =
framework::vectorize<std::ptrdiff_t>(framework::stride(output_dim));
{
const int64_t data_size = sizeof(R);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[](int64_t s) { return s * data_size; });
[](std::ptrdiff_t s) { return s * data_size; });
}

const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
25 changes: 17 additions & 8 deletions paddle/fluid/operators/spectral_op.cu
@@ -26,6 +26,7 @@
#include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/spectral_op.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/dynload/cufft.h"

namespace paddle {
namespace operators {
@@ -141,15 +142,16 @@ class CuFFTHandle {
::cufftHandle handle_;

public:
CuFFTHandle() { CUFFT_CHECK(cufftCreate(&handle_)); }
CuFFTHandle() { CUFFT_CHECK(platform::dynload::cufftCreate(&handle_)); }

::cufftHandle& get() { return handle_; }
const ::cufftHandle& get() const { return handle_; }

~CuFFTHandle() {
// Not using fftDestroy() for rocFFT to work around double freeing of handles
#ifndef __HIPCC__
cufftDestroy(handle_);
std::cout << "Dtor of CuFFTHandle" << std::endl;
CUFFT_CHECK(platform::dynload::cufftDestroy(handle_));
#endif
}
};
@@ -245,7 +247,8 @@ class CuFFTConfig {
#endif

// disable auto allocation of workspace to use THC allocator
CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0));
CUFFT_CHECK(platform::dynload::cufftSetAutoAllocation(
plan(), /* autoAllocate */ 0));

size_t ws_size_t;

@@ -258,7 +261,7 @@
batch, &ws_size_t));
#else

CUFFT_CHECK(cufftXtMakePlanMany(
CUFFT_CHECK(platform::dynload::cufftXtMakePlanMany(
plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
/* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
@@ -364,6 +367,11 @@ class PlanLRUCache {
return *this;
}

~PlanLRUCache() {
std::cout << "DTor of PlanLRUCache" << std::endl;
clear();
}

// If key is in this cache, return the cached config. Otherwise, emplace the
// config in this cache and return it.
CuFFTConfig& lookup(PlanKey params) {
@@ -498,8 +506,8 @@ static void exec_cufft_plan(const CuFFTConfig& config, void* in_data,
PADDLE_THROW(platform::errors::InvalidArgument(
"hipFFT only support transforms of type float32 and float64"));
#else
CUFFT_CHECK(cufftXtExec(plan, in_data, out_data,
forward ? CUFFT_FORWARD : CUFFT_INVERSE));
CUFFT_CHECK(platform::dynload::cufftXtExec(
plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE));
#endif
}

@@ -641,10 +649,11 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
auto& plan = config->plan();

// prepare cufft for execution
CUFFT_CHECK(cufftSetStream(plan, ctx.stream()));
CUFFT_CHECK(platform::dynload::cufftSetStream(plan, ctx.stream()));
framework::Tensor workspace_tensor;
workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
CUFFT_CHECK(cufftSetWorkArea(plan, workspace_tensor.data<To>()));
CUFFT_CHECK(
platform::dynload::cufftSetWorkArea(plan, workspace_tensor.data<To>()));

// execute transform plan
if (fft_type == FFTTransformType::C2R && forward) {
2 changes: 1 addition & 1 deletion paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,6 +1,6 @@
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)

list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc cufft.cc)

if (NOT WITH_NV_JETSON)
list(APPEND CUDA_SRCS nvjpeg.cc)
44 changes: 44 additions & 0 deletions paddle/fluid/platform/dynload/cufft.cc
@@ -0,0 +1,44 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/dynload/cufft.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cufft_dso_flag;
void* cufft_dso_handle = nullptr;

#define DEFINE_WRAP(__name) DynLoad__##__name __name

CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);

bool HasCUFFT() {
std::call_once(cufft_dso_flag,
[]() { cufft_dso_handle = GetCUFFTDsoHandle(); });
return cufft_dso_handle != nullptr;
}

void EnforceCUFFTLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cufft_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cufft shared library. Cannot invoke method %s.",
fn_name));
}

} // namespace dynload
} // namespace platform
} // namespace paddle
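
For readers new to Paddle's dynload layer: the DynLoad__##__name objects defined by DEFINE_WRAP above are declared by a companion macro in cufft.h. Below is a minimal sketch of what such a wrapper looks like, modeled on the existing cublas/cudnn wrappers; the exact macro in cufft.h may differ in details:

```cpp
#include <dlfcn.h>
#include <mutex>

// Provided by the real cufft.cc / dynamic_loader.cc:
extern std::once_flag cufft_dso_flag;
extern void* cufft_dso_handle;
void* GetCUFFTDsoHandle();
void EnforceCUFFTLoaded(const char* fn_name);

// Sketch of the declaration-side macro: each wrapper lazily opens libcufft
// once, resolves the symbol by name with dlsym, caches the pointer, and
// forwards the arguments. Calls like platform::dynload::cufftCreate(...)
// then go through these wrappers instead of link-time cuFFT symbols.
#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name)                          \
  struct DynLoad__##__name {                                             \
    template <typename... Args>                                          \
    auto operator()(Args... args) {                                      \
      using cufft_func = decltype(&::__name);                            \
      std::call_once(cufft_dso_flag,                                     \
                     []() { cufft_dso_handle = GetCUFFTDsoHandle(); });  \
      EnforceCUFFTLoaded(#__name);                                       \
      static void* p_##__name = dlsym(cufft_dso_handle, #__name);        \
      return reinterpret_cast<cufft_func>(p_##__name)(args...);          \
    }                                                                    \
  };                                                                     \
  extern DynLoad__##__name __name
```

The upshot is that the binary is no longer hard-linked against libcufft.so: if the library is missing at runtime, EnforceCUFFTLoaded raises a clear error only when a cuFFT entry point is actually invoked.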