Skip to content

Commit

Permalink
[Runtime] Dynamically load cuTensorMapEncodeTiled (#4330) (#4339)
Browse files Browse the repository at this point in the history
Cherry-pick of:
#4330
#4335

to release/3.0.x branch

---------

Co-authored-by: Nikita Shulga <[email protected]>
Co-authored-by: Keren Zhou <[email protected]>
  • Loading branch information
3 people committed Jul 17, 2024
1 parent 5cbb516 commit 91f24d8
Showing 1 changed file with 30 additions and 3 deletions.
33 changes: 30 additions & 3 deletions third_party/nvidia/backend/driver.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,17 @@ static bool gpuAssert(CUresult code, const char *file, int line) {
} \
} while (0)

// Used to check if functions exist in old CUDA driver versions.
// Lazily resolves `funcPointer` on first use by calling
// `initializerFunction` (a dlopen/dlsym-style getter generated by
// defineGetFunctionHandle). If the symbol cannot be resolved, the
// ENCLOSING function returns NULL — so this macro may only be used
// inside functions returning a pointer (here: PyObject * handlers,
// where NULL signals a Python exception to the caller).
// NOTE(review): the getter is presumably expected to have set a Python
// error before returning NULL — confirm against defineGetFunctionHandle.
#define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction) \
  do {                                                                        \
    if ((funcPointer) == NULL) {                                              \
      (funcPointer) = (initializerFunction)();                                \
      if ((funcPointer) == NULL) {                                            \
        return NULL;                                                          \
      }                                                                       \
    }                                                                         \
  } while (0)

static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
int device_id;
if (!PyArg_ParseTuple(args, "i", &device_id))
Expand Down Expand Up @@ -143,6 +154,14 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) {
// Function-pointer type matching the driver's cuOccupancyMaxActiveClusters.
// Declared as a typedef so the symbol can be looked up at runtime instead
// of being linked directly — old CUDA drivers that lack it can still load
// this module (see INITIALIZE_FUNCTION_POINTER_IF_NULL above).
typedef CUresult (*cuOccupancyMaxActiveClusters_t)(
    int *numClusters, CUfunction func, const CUlaunchConfig *config);

// Function-pointer type matching the driver's cuTensorMapEncodeTiled
// (TMA descriptor encoding). Resolved dynamically for the same reason as
// cuOccupancyMaxActiveClusters_t: older driver libraries may not export
// the symbol, and a direct link reference would make the module fail to
// load on those systems.
typedef CUresult (*cuTensorMapEncodeTiled_t)(
    CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
    cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
    const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
    const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
    CUtensorMapFloatOOBfill oobFill);

#define defineGetFunctionHandle(name, symbolName) \
static symbolName##_t name() { \
/* Open the shared library */ \
Expand All @@ -168,6 +187,9 @@ typedef CUresult (*cuOccupancyMaxActiveClusters_t)(
// Instantiate the runtime symbol-lookup getters. Each expansion defines a
// static function `getXxxHandle()` that opens the CUDA driver shared
// library and resolves the named symbol, returning it as the matching
// `<symbolName>_t` pointer (NULL when the driver does not provide it —
// see INITIALIZE_FUNCTION_POINTER_IF_NULL for how callers handle that).
defineGetFunctionHandle(getCuOccupancyMaxActiveClustersHandle,
                        cuOccupancyMaxActiveClusters);

defineGetFunctionHandle(getCuTensorMapEncodeTiledHandle,
                        cuTensorMapEncodeTiled);

static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
int clusterDimX = -1, clusterDimY = -1, clusterDimZ = -1,
maxActiveClusters = -1;
Expand Down Expand Up @@ -204,9 +226,8 @@ static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
config.attrs = launchAttr;

static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL;
if (cuOccupancyMaxActiveClusters == NULL) {
cuOccupancyMaxActiveClusters = getCuOccupancyMaxActiveClustersHandle();
}
INITIALIZE_FUNCTION_POINTER_IF_NULL(cuOccupancyMaxActiveClusters,
getCuOccupancyMaxActiveClustersHandle);

Py_BEGIN_ALLOW_THREADS;
CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
Expand Down Expand Up @@ -289,6 +310,9 @@ static PyObject *fill1DTMADescriptor(PyObject *self, PyObject *args) {
}
assert((elementSize * tensorDim) >= 32 && "block size too small.");
int rank = 1;
static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
getCuTensorMapEncodeTiledHandle);
CUresult result = cuTensorMapEncodeTiled(
(CUtensorMap *)desc, type, rank, (void *)global_address, dims,
globalStrides, boxDim, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
Expand Down Expand Up @@ -350,6 +374,9 @@ static PyObject *fill2DTMADescriptor(PyObject *self, PyObject *args) {
if (contigDimSizeInByte > 128) {
tensorDims[0] = 128 / elementSize;
}
static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
getCuTensorMapEncodeTiledHandle);
CUresult result = cuTensorMapEncodeTiled(
(CUtensorMap *)desc, type, rank, (void *)global_address, dims,
globalStrides, tensorDims, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
Expand Down

0 comments on commit 91f24d8

Please sign in to comment.