use dynload for cufft (PaddlePaddle#46)
* use std::ptrdiff_t as the data type for strides (instead of int64_t) to avoid argument-type mismatches on some platforms (see the sketch after this list)

* add complex support for fill_zeros_like

* use dynload for cufft
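
A quick illustration of the mismatch behind the first bullet. This is a minimal sketch, assuming a pocketfft-style backend that takes byte strides as std::vector<std::ptrdiff_t>; `fft_backend` is a hypothetical stand-in, not an actual Paddle or pocketfft symbol:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for an FFT backend entry point that takes byte
// strides as std::vector<std::ptrdiff_t> (as pocketfft-style APIs do).
void fft_backend(const std::vector<std::ptrdiff_t>& /*strides*/) {}

int main() {
  // On LP64 Linux both int64_t and std::ptrdiff_t are long, so
  // std::vector<int64_t> and std::vector<std::ptrdiff_t> are the same type
  // and either compiles. On platforms where int64_t is long long, the two
  // vector types are distinct and the commented call fails to compile:
  //
  //   std::vector<int64_t> strides = {8, 64};
  //   fft_backend(strides);  // error: no matching function for call
  //
  // Building the vector as std::ptrdiff_t matches the API on every platform.
  std::vector<std::ptrdiff_t> strides = {8, 64};
  fft_backend(strides);
}
```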
Feiyu Chan authored Sep 15, 2021
1 parent 2cb21c0 commit 5ccdf98
Showing 11 changed files with 270 additions and 62 deletions.
47 changes: 24 additions & 23 deletions paddle/fluid/operators/CMakeLists.txt
@@ -98,29 +98,30 @@ else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()

op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS ${OP_HEADER_DEPS})
if (WITH_GPU)
find_library(CUFFT_LIB libcufft.so
PATHS
${CUDA_TOOLKIT_ROOT_DIR}/lib64/
NO_DEFAULT_PATH
)
target_link_libraries(spectral_op ${CUFFT_LIB})
endif()
if(WITH_ONEMKL)
find_library(ONEMKL_CORE libmkl_core.so
PATHS
${MKL_ROOT}/lib/${MKL_ARCH}
NO_DEFAULT_PATH
)
find_library(ONEMKL_THREAD libmkl_intel_thread.so
PATHS
${MKL_ROOT}/lib/${MKL_ARCH}
NO_DEFAULT_PATH
)
target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE})
target_link_libraries(spectral_op MKL::mkl_core MKL::mkl_intel_thread)
endif()
op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS})
# op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS ${OP_HEADER_DEPS})
# if (WITH_GPU)
# find_library(CUFFT_LIB libcufft.so
# PATHS
# ${CUDA_TOOLKIT_ROOT_DIR}/lib64/
# NO_DEFAULT_PATH
# )
# target_link_libraries(spectral_op ${CUFFT_LIB})
# endif()
# if(WITH_ONEMKL)
# find_library(ONEMKL_CORE libmkl_core.so
# PATHS
# ${MKL_ROOT}/lib/${MKL_ARCH}
# NO_DEFAULT_PATH
# )
# find_library(ONEMKL_THREAD libmkl_intel_thread.so
# PATHS
# ${MKL_ROOT}/lib/${MKL_ARCH}
# NO_DEFAULT_PATH
# )
# target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE})
# target_link_libraries(spectral_op MKL::mkl_core MKL::mkl_intel_thread)
# endif()

op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
op_library(eye_op DEPS ${OP_HEADER_DEPS})
13 changes: 11 additions & 2 deletions paddle/fluid/operators/fill_zeros_like_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/fill_zeros_like_op.h"
#include "paddle/fluid/platform/complex.h"

namespace paddle {
namespace operators {
@@ -93,12 +94,20 @@ REGISTER_OP_CPU_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, float>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>);
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);

REGISTER_OP_CPU_KERNEL(
fill_zeros_like2,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, float>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>);
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
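
The new instantiations work because the kernel only needs to write a zero of the element type; a minimal standalone sketch of that idea (using std::complex as a stand-in for paddle::platform::complex, and std::vector for the tensor buffer):

```cpp
#include <algorithm>
#include <complex>
#include <vector>

// Filling with static_cast<T>(0) is all fill_zeros_like requires of T, so a
// complex element type qualifies: complex<T> is constructible from scalar 0.
template <typename T>
void FillZerosLike(std::vector<T>* out) {
  std::fill(out->begin(), out->end(), static_cast<T>(0));
}

int main() {
  std::vector<std::complex<float>> buf(4, {1.0f, 2.0f});
  FillZerosLike(&buf);  // buf is now four copies of (0, 0)
}
```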
13 changes: 11 additions & 2 deletions paddle/fluid/operators/fill_zeros_like_op.cu.cc
@@ -14,6 +14,7 @@ limitations under the License. */

#include "paddle/fluid/operators/fill_zeros_like_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"

namespace ops = paddle::operators;
@@ -25,7 +26,11 @@ REGISTER_OP_CUDA_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);

REGISTER_OP_CUDA_KERNEL(
fill_zeros_like2,
@@ -35,4 +40,8 @@ REGISTER_OP_CUDA_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
48 changes: 31 additions & 17 deletions paddle/fluid/operators/spectral_op.cc
@@ -240,15 +240,29 @@ class FFTC2ROp : public framework::OperatorWithKernel {
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_c2r");

const auto axes = ctx->Attrs().Get<std::vector<int64_t>>("axes");
const auto x_dim = ctx->GetInputDim("X");
for (size_t i = 0; i < axes.size() - 1L; i++) {
const auto fft_n_point = (x_dim[axes[i]] - 1) * 2;
PADDLE_ENFORCE_GT(fft_n_point, 0,
platform::errors::InvalidArgument(
"Invalid fft n-point (%d).", fft_n_point));
}

const int64_t last_dim_size = ctx->Attrs().Get<int64_t>("last_dim_size");
framework::DDim out_dim(ctx->GetInputDim("X"));
const int64_t last_fft_axis = axes.back();
if (last_dim_size == 0) {
const int64_t last_fft_dim_size = out_dim.at(last_fft_axis);
out_dim.at(last_fft_axis) = (last_fft_dim_size - 1) * 2;
const int64_t fft_n_point = (last_fft_dim_size - 1) * 2;
PADDLE_ENFORCE_GT(fft_n_point, 0,
platform::errors::InvalidArgument(
"Invalid fft n-point (%d).", fft_n_point));
out_dim.at(last_fft_axis) = fft_n_point;
} else {
out_dim.at(last_fft_axis) = ctx->Attrs().Get<int64_t>("last_dim_size");
PADDLE_ENFORCE_GT(last_dim_size, 0,
platform::errors::InvalidArgument(
"Invalid fft n-point (%d).", last_dim_size));
out_dim.at(last_fft_axis) = last_dim_size;
}
ctx->SetOutputDim("Out", out_dim);
}
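
As a worked example of the inference above: rfft of a length-n real signal yields m = n/2 + 1 complex bins, so the inverse recovers n = (m - 1) * 2 when last_dim_size is unset, while an explicit last_dim_size (needed to recover odd n) takes precedence. A standalone sketch of that rule, mirroring the InferShape logic:

```cpp
#include <cassert>
#include <cstdint>

// last_dim_size == 0 means "infer from the complex input size"; otherwise
// the attribute wins. Both branches must yield a positive length, which the
// op enforces with PADDLE_ENFORCE_GT.
int64_t InferC2RLastDim(int64_t complex_size, int64_t last_dim_size) {
  if (last_dim_size == 0) {
    const int64_t fft_n_point = (complex_size - 1) * 2;
    assert(fft_n_point > 0);
    return fft_n_point;
  }
  assert(last_dim_size > 0);
  return last_dim_size;
}

int main() {
  assert(InferC2RLastDim(5, 0) == 8);  // rfft of length 8 gives 5 bins
  assert(InferC2RLastDim(5, 9) == 9);  // explicit override recovers odd n
}
```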
@@ -681,11 +695,11 @@ struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes =
framework::vectorize<size_t>(input_dim);
std::vector<int64_t> in_strides =
framework::vectorize<int64_t>(framework::stride(input_dim));
std::vector<std::ptrdiff_t> in_strides =
framework::vectorize<std::ptrdiff_t>(framework::stride(input_dim));
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[](int64_t s) { return s * data_size; });
[](std::ptrdiff_t s) { return s * data_size; });

const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
auto* out_data = reinterpret_cast<C*>(out->data<To>());
@@ -714,24 +728,24 @@ struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes =
framework::vectorize<size_t>(input_dim);
std::vector<int64_t> in_strides =
framework::vectorize<int64_t>(framework::stride(input_dim));
std::vector<std::ptrdiff_t> in_strides =
framework::vectorize<std::ptrdiff_t>(framework::stride(input_dim));
{
const int64_t data_size = sizeof(R);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[](int64_t s) { return s * data_size; });
[](std::ptrdiff_t s) { return s * data_size; });
}

const auto& output_dim = out->dims();
const std::vector<size_t> out_sizes =
framework::vectorize<size_t>(output_dim);
std::vector<int64_t> out_strides =
framework::vectorize<int64_t>(framework::stride(output_dim));
std::vector<std::ptrdiff_t> out_strides =
framework::vectorize<std::ptrdiff_t>(framework::stride(output_dim));
{
const int64_t data_size = sizeof(C);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[](int64_t s) { return s * data_size; });
[](std::ptrdiff_t s) { return s * data_size; });
}

const auto* in_data = x->data<R>();
@@ -761,24 +775,24 @@ struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes =
framework::vectorize<size_t>(input_dim);
std::vector<int64_t> in_strides =
framework::vectorize<int64_t>(framework::stride(input_dim));
std::vector<std::ptrdiff_t> in_strides =
framework::vectorize<std::ptrdiff_t>(framework::stride(input_dim));
{
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[](int64_t s) { return s * data_size; });
[](std::ptrdiff_t s) { return s * data_size; });
}

const auto& output_dim = out->dims();
const std::vector<size_t> out_sizes =
framework::vectorize<size_t>(output_dim);
std::vector<int64_t> out_strides =
framework::vectorize<int64_t>(framework::stride(output_dim));
std::vector<std::ptrdiff_t> out_strides =
framework::vectorize<std::ptrdiff_t>(framework::stride(output_dim));
{
const int64_t data_size = sizeof(R);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[](int64_t s) { return s * data_size; });
[](std::ptrdiff_t s) { return s * data_size; });
}

const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
25 changes: 17 additions & 8 deletions paddle/fluid/operators/spectral_op.cu
@@ -26,6 +26,7 @@
#include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/spectral_op.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/dynload/cufft.h"

namespace paddle {
namespace operators {
@@ -141,15 +142,16 @@ class CuFFTHandle {
::cufftHandle handle_;

public:
CuFFTHandle() { CUFFT_CHECK(cufftCreate(&handle_)); }
CuFFTHandle() { CUFFT_CHECK(platform::dynload::cufftCreate(&handle_)); }

::cufftHandle& get() { return handle_; }
const ::cufftHandle& get() const { return handle_; }

~CuFFTHandle() {
// Not using fftDestroy() for rocFFT to work around double freeing of handles
#ifndef __HIPCC__
cufftDestroy(handle_);
std::cout << "Dtor of CuFFTHandle" << std::endl;
CUFFT_CHECK(platform::dynload::cufftDestroy(handle_));
#endif
}
};
@@ -245,7 +247,8 @@ class CuFFTConfig {
#endif

// disable auto allocation of workspace to use THC allocator
CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0));
CUFFT_CHECK(platform::dynload::cufftSetAutoAllocation(
plan(), /* autoAllocate */ 0));

size_t ws_size_t;

@@ -258,7 +261,7 @@
batch, &ws_size_t));
#else

CUFFT_CHECK(cufftXtMakePlanMany(
CUFFT_CHECK(platform::dynload::cufftXtMakePlanMany(
plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
/* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
@@ -364,6 +367,11 @@ class PlanLRUCache {
return *this;
}

~PlanLRUCache() {
std::cout << "DTor of PlanLRUCache" << std::endl;
clear();
}

// If key is in this cache, return the cached config. Otherwise, emplace the
// config in this cache and return it.
CuFFTConfig& lookup(PlanKey params) {
@@ -498,8 +506,8 @@ static void exec_cufft_plan(const CuFFTConfig& config, void* in_data,
PADDLE_THROW(platform::errors::InvalidArgument(
"hipFFT only support transforms of type float32 and float64"));
#else
CUFFT_CHECK(cufftXtExec(plan, in_data, out_data,
forward ? CUFFT_FORWARD : CUFFT_INVERSE));
CUFFT_CHECK(platform::dynload::cufftXtExec(
plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE));
#endif
}

@@ -641,10 +649,11 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
auto& plan = config->plan();

// prepare cufft for execution
CUFFT_CHECK(cufftSetStream(plan, ctx.stream()));
CUFFT_CHECK(platform::dynload::cufftSetStream(plan, ctx.stream()));
framework::Tensor workspace_tensor;
workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
CUFFT_CHECK(cufftSetWorkArea(plan, workspace_tensor.data<To>()));
CUFFT_CHECK(
platform::dynload::cufftSetWorkArea(plan, workspace_tensor.data<To>()));

// execute transform plan
if (fft_type == FFTTransformType::C2R && forward) {
2 changes: 1 addition & 1 deletion paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,6 +1,6 @@
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)

list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc cufft.cc)

if (NOT WITH_NV_JETSON)
list(APPEND CUDA_SRCS nvjpeg.cc)
44 changes: 44 additions & 0 deletions paddle/fluid/platform/dynload/cufft.cc
@@ -0,0 +1,44 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/dynload/cufft.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cufft_dso_flag;
void* cufft_dso_handle = nullptr;

#define DEFINE_WRAP(__name) DynLoad__##__name __name

CUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);

bool HasCUFFT() {
std::call_once(cufft_dso_flag,
[]() { cufft_dso_handle = GetCUFFTDsoHandle(); });
return cufft_dso_handle != nullptr;
}

void EnforceCUFFTLoaded(const char* fn_name) {
PADDLE_ENFORCE_NOT_NULL(
cufft_dso_handle,
platform::errors::PreconditionNotMet(
"Cannot load cufft shared library. Cannot invoke method %s.",
fn_name));
}

} // namespace dynload
} // namespace platform
} // namespace paddle
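
For readers new to Paddle's dynload layer: the DynLoad__##__name objects defined by DEFINE_WRAP above are declared by a companion macro in cufft.h. Below is a minimal sketch of what such a wrapper looks like, modeled on the existing cublas/cudnn wrappers; the exact macro in cufft.h may differ in details:

```cpp
#include <dlfcn.h>
#include <mutex>

// Provided by the real cufft.cc / dynamic_loader.cc:
extern std::once_flag cufft_dso_flag;
extern void* cufft_dso_handle;
void* GetCUFFTDsoHandle();
void EnforceCUFFTLoaded(const char* fn_name);

// Sketch of the declaration-side macro: each wrapper lazily opens libcufft
// once, resolves the symbol by name with dlsym, caches the pointer, and
// forwards the arguments. Calls like platform::dynload::cufftCreate(...)
// then go through these wrappers instead of link-time cuFFT symbols.
#define DECLARE_DYNAMIC_LOAD_CUFFT_WRAP(__name)                          \
  struct DynLoad__##__name {                                             \
    template <typename... Args>                                          \
    auto operator()(Args... args) {                                      \
      using cufft_func = decltype(&::__name);                            \
      std::call_once(cufft_dso_flag,                                     \
                     []() { cufft_dso_handle = GetCUFFTDsoHandle(); });  \
      EnforceCUFFTLoaded(#__name);                                       \
      static void* p_##__name = dlsym(cufft_dso_handle, #__name);        \
      return reinterpret_cast<cufft_func>(p_##__name)(args...);          \
    }                                                                    \
  };                                                                     \
  extern DynLoad__##__name __name
```

The upshot is that the binary is no longer hard-linked against libcufft.so: if the library is missing at runtime, EnforceCUFFTLoaded raises a clear error only when a cuFFT entry point is actually invoked.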