Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUDA 12.4 Update 1: CUPTI does not trace kernels anymore #2328

Closed
maleadt opened this issue Apr 16, 2024 · 1 comment
Closed

CUDA 12.4 Update 1: CUPTI does not trace kernels anymore #2328

maleadt opened this issue Apr 16, 2024 · 1 comment
Labels
bug Something isn't working upstream Somebody else's problem.

Comments

@maleadt
Copy link
Member

maleadt commented Apr 16, 2024

MWE:

using CUDA

function callback_trace(f)
    cfg = CUPTI.CallbackConfig([CUPTI.CUPTI_CB_DOMAIN_RESOURCE]) do domain, id, data
        println(id)
    end

    CUPTI.enable!(cfg) do
        f()
    end
end

function activity_trace(f)
    activity_kinds = [
        CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
        CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API,
    ]
    cfg = CUPTI.ActivityConfig(activity_kinds)

    CUPTI.enable!(cfg) do
        f()
    end
end

# cublasScopy_v2
operation() = CUBLAS.copy!(1, CUDA.ones(1), CUDA.ones(1))

println("first callback trace")
callback_trace(operation)

println("second callback trace")
callback_trace(operation)

println("activity trace")
activity_trace(operation)

println("third callback trace")
callback_trace(operation)

println("fourth callback trace")
callback_trace(operation)

On CUDA 12.4 Update 1, this gives the following:

first callback trace
3
6
6
second callback trace
activity trace
third callback trace
8
fourth callback trace
8

i.e., callback traces do not contain CUPTI_CBID_RESOURCE_MODULE_PROFILED=8 entries anymore until CUPTI's activity API has been activated.

C++ MWE:

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cupti.h>
#include <iostream>

#define CUDA_API_CALL(call)                                                    \
  do {                                                                         \
    cudaError_t _status = call;                                                \
    if (_status != cudaSuccess) {                                              \
      fprintf(stderr, "%s:%d: Error: %s failed with error: %s.\n", __FILE__,   \
              __LINE__, #call, cudaGetErrorString(_status));                   \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

#define CUPTI_API_CALL(apiFunctionCall)                                        \
  do {                                                                         \
    CUptiResult _status = apiFunctionCall;                                     \
    if (_status != CUPTI_SUCCESS) {                                            \
      const char *pErrorString;                                                \
      cuptiGetResultString(_status, &pErrorString);                            \
                                                                               \
      fprintf(stderr, "%s:%d: Error: Function %s failed with error: %s.\n",    \
              __FILE__, __LINE__, #apiFunctionCall, pErrorString);             \
                                                                               \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

static void CUDAAPI cuptiCallback(void *userdata, CUpti_CallbackDomain domain,
                                  CUpti_CallbackId cbid,
                                  const CUpti_CallbackData *cbInfo) {
  // Print the callback ID
  std::cout << cbid << std::endl;
}

void callbackTrace(void (*operation)()) {
  CUpti_SubscriberHandle subscriber;

  // Subscribe to the callback
  CUPTI_API_CALL(
      cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)cuptiCallback, nullptr));
  CUPTI_API_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE));

  // Run the operation
  operation();

  // Unsubscribe
  CUPTI_API_CALL(cuptiEnableDomain(0, subscriber, CUPTI_CB_DOMAIN_RESOURCE));
  CUPTI_API_CALL(cuptiUnsubscribe(subscriber));
}

#define BUF_SIZE (8 * 1024 * 1024)
#define ALIGN_SIZE (8)
#define ALIGN_BUFFER(buffer, align)                                            \
  (((uintptr_t)(buffer) & ((align)-1))                                         \
       ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1)))            \
       : (buffer))
static void CUPTIAPI BufferRequested(uint8_t **ppBuffer, size_t *pSize,
                                     size_t *pMaxNumRecords) {
  uint8_t *pBuffer = (uint8_t *)malloc(BUF_SIZE + ALIGN_SIZE);

  *pSize = BUF_SIZE;
  *ppBuffer = ALIGN_BUFFER(pBuffer, ALIGN_SIZE);
  *pMaxNumRecords = 0;
}
static void CUPTIAPI BufferCompleted(CUcontext context, uint32_t streamId,
                                     uint8_t *pBuffer, size_t size,
                                     size_t validSize) {

  free(pBuffer);
}

void activityTrace(void (*operation)()) {
  CUpti_Activity activity;

  CUPTI_API_CALL(
      cuptiActivityRegisterCallbacks(BufferRequested, BufferCompleted));

  // Enable the activity kind
  CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
  CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API));

  // Run the operation
  operation();

  CUPTI_API_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
  CUPTI_API_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API));
  cuptiActivityFlushAll(0);
}

void cublasOperation() {
  cublasHandle_t handle;
  float *d_A, *d_B;
  const float alpha = 1.0f;
  CUDA_API_CALL(cudaMalloc(&d_A, sizeof(float)));
  CUDA_API_CALL(cudaMalloc(&d_B, sizeof(float)));

  cublasCreate(&handle);
  cublasScopy(handle, 1, d_A, 1, d_B, 1);
  cublasDestroy(handle);

  CUDA_API_CALL(cudaFree(d_A));
  CUDA_API_CALL(cudaFree(d_B));
}

int main() {
  std::cout << "First callback trace" << std::endl;
  callbackTrace(cublasOperation);

  std::cout << "Second callback trace" << std::endl;
  callbackTrace(cublasOperation);

  std::cout << "Activity trace" << std::endl;
  activityTrace(cublasOperation);

  std::cout << "Third callback trace" << std::endl;
  callbackTrace(cublasOperation);

  std::cout << "Fourth callback trace" << std::endl;
  callbackTrace(cublasOperation);

  return 0;
}
❯ g++ wip.cpp -isystem/opt/cuda/extras/CUPTI/include -isystem/opt/cuda/include -L/opt/cuda/extras/CUPTI/lib64 -L/opt/cuda/lib -o wip -lcupti -lcudart -lcublas

❯ LD_LIBRARY_PATH=/home/tim/Julia/depot/artifacts/7b52eb5d9f5c3debcf418c1ffa049960190f6e47/lib ./wip
First callback trace
5
1
6
6
8
Second callback trace
8
Activity trace
Third callback trace
8
Fourth callback trace
8

❯ LD_LIBRARY_PATH=/home/tim/Julia/depot/artifacts/10e364e8eb619d3f49a273c168dc1b8451dcf718/lib ./wip
First callback trace
5
1
6
6
Second callback trace
Activity trace
Third callback trace
8
Fourth callback trace
8
@maleadt maleadt added bug Something isn't working upstream Somebody else's problem. labels Apr 16, 2024
@maleadt maleadt changed the title CUDA 12.4 Update 1: CUPTI does not trace external kernels anymore CUDA 12.4 Update 1: CUPTI does not trace kernels anymore Apr 16, 2024
@maleadt
Copy link
Member Author

maleadt commented May 24, 2024

Confirmed by upstream, fixed in CUDA 12.5 Update 1.

@maleadt maleadt closed this as completed Sep 18, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working upstream Somebody else's problem.
Projects
None yet
Development

No branches or pull requests

1 participant