From 460c47fa90e16d28afd7c93015d36e92719b5737 Mon Sep 17 00:00:00 2001 From: Rafal Rudnicki Date: Mon, 29 Jul 2024 16:39:48 +0200 Subject: [PATCH] add CUDA provider --- .github/workflows/basic.yml | 20 +- .github/workflows/benchmarks.yml | 1 + .github/workflows/codeql.yml | 1 + .github/workflows/fast.yml | 1 + .github/workflows/gpu.yml | 69 +++- .github/workflows/nightly.yml | 1 + .github/workflows/pr_push.yml | 1 + .github/workflows/sanitizers.yml | 2 + .github/workflows/valgrind.yml | 1 + CMakeLists.txt | 6 + README.md | 26 +- benchmark/CMakeLists.txt | 7 +- benchmark/ubench.c | 5 +- cmake/FindCUDA.cmake | 35 ++ examples/CMakeLists.txt | 47 ++- examples/cuda_shared_memory/CMakeLists.txt | 74 ++++ .../cuda_shared_memory/cuda_shared_memory.c | 115 ++++++ .../CMakeLists.txt | 6 +- .../level_zero_shared_memory.c} | 0 include/umf/memory_provider_gpu.h | 31 ++ include/umf/providers/provider_cuda.h | 30 ++ include/umf/providers/provider_level_zero.h | 12 +- scripts/docs_config/examples.rst | 7 +- scripts/qemu/run-build.sh | 1 + src/CMakeLists.txt | 36 ++ src/provider/provider_cuda.c | 344 ++++++++++++++++++ test/CMakeLists.txt | 47 ++- test/providers/cuda_helpers.cpp | 288 +++++++++++++++ test/providers/cuda_helpers.h | 36 ++ test/providers/provider_cuda.cpp | 189 ++++++++++ 30 files changed, 1406 insertions(+), 33 deletions(-) create mode 100644 cmake/FindCUDA.cmake create mode 100644 examples/cuda_shared_memory/CMakeLists.txt create mode 100644 examples/cuda_shared_memory/cuda_shared_memory.c rename examples/{gpu_shared_memory => level_zero_shared_memory}/CMakeLists.txt (93%) rename examples/{gpu_shared_memory/gpu_shared_memory.c => level_zero_shared_memory/level_zero_shared_memory.c} (100%) create mode 100644 include/umf/memory_provider_gpu.h create mode 100644 include/umf/providers/provider_cuda.h create mode 100644 src/provider/provider_cuda.c create mode 100644 test/providers/cuda_helpers.cpp create mode 100644 test/providers/cuda_helpers.h create mode 100644 
test/providers/provider_cuda.cpp diff --git a/.github/workflows/basic.yml b/.github/workflows/basic.yml index 232f968695..8bca12f5a7 100644 --- a/.github/workflows/basic.yml +++ b/.github/workflows/basic.yml @@ -22,6 +22,7 @@ jobs: compiler: [{c: gcc, cxx: g++}] shared_library: ['OFF'] level_zero_provider: ['ON'] + cuda_provider: ['ON'] install_tbb: ['ON'] disable_hwloc: ['OFF'] link_hwloc_statically: ['OFF'] @@ -31,6 +32,7 @@ jobs: compiler: {c: gcc-7, cxx: g++-7} shared_library: 'OFF' level_zero_provider: 'ON' + cuda_provider: 'ON' install_tbb: 'ON' disable_hwloc: 'OFF' link_hwloc_statically: 'OFF' @@ -39,6 +41,7 @@ jobs: compiler: {c: clang, cxx: clang++} shared_library: 'OFF' level_zero_provider: 'ON' + cuda_provider: 'ON' install_tbb: 'ON' disable_hwloc: 'OFF' link_hwloc_statically: 'OFF' @@ -47,6 +50,7 @@ jobs: compiler: {c: gcc, cxx: g++} shared_library: 'ON' level_zero_provider: 'ON' + cuda_provider: 'ON' install_tbb: 'ON' disable_hwloc: 'OFF' link_hwloc_statically: 'OFF' @@ -55,15 +59,17 @@ jobs: compiler: {c: gcc, cxx: g++} shared_library: 'ON' level_zero_provider: 'ON' + cuda_provider: 'ON' install_tbb: 'ON' disable_hwloc: 'OFF' link_hwloc_statically: 'OFF' - # test level_zero_provider='OFF' + # test level_zero_provider='OFF' and cuda_provider='OFF' - os: 'ubuntu-22.04' build_type: Release compiler: {c: gcc, cxx: g++} shared_library: 'OFF' level_zero_provider: 'OFF' + cuda_provider: 'OFF' install_tbb: 'ON' disable_hwloc: 'OFF' link_hwloc_statically: 'OFF' @@ -73,6 +79,7 @@ jobs: compiler: {c: icx, cxx: icpx} shared_library: 'ON' level_zero_provider: 'ON' + cuda_provider: 'ON' install_tbb: 'ON' disable_hwloc: 'OFF' link_hwloc_statically: 'OFF' @@ -82,6 +89,7 @@ jobs: compiler: {c: gcc, cxx: g++} shared_library: 'ON' level_zero_provider: 'ON' + cuda_provider: 'ON' install_tbb: 'OFF' disable_hwloc: 'OFF' link_hwloc_statically: 'OFF' @@ -90,6 +98,7 @@ jobs: compiler: {c: gcc, cxx: g++} shared_library: 'ON' level_zero_provider: 'ON' + cuda_provider: 'ON' 
install_tbb: 'ON' disable_hwloc: 'ON' link_hwloc_statically: 'OFF' @@ -98,6 +107,7 @@ jobs: compiler: {c: gcc, cxx: g++} shared_library: 'ON' level_zero_provider: 'ON' + cuda_provider: 'ON' install_tbb: 'ON' disable_hwloc: 'OFF' link_hwloc_statically: 'ON' @@ -149,6 +159,7 @@ jobs: -DCMAKE_C_COMPILER=${{matrix.compiler.c}} -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} -DUMF_BUILD_LEVEL_ZERO_PROVIDER=${{matrix.level_zero_provider}} + -DUMF_BUILD_CUDA_PROVIDER=${{matrix.cuda_provider}} -DUMF_FORMAT_CODE_STYLE=OFF -DUMF_DEVELOPER_MODE=ON -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON @@ -195,23 +206,27 @@ jobs: compiler: [{c: cl, cxx: cl}] shared_library: ['ON', 'OFF'] level_zero_provider: ['ON'] + cuda_provider: ['ON'] include: - os: 'windows-2022' build_type: Release compiler: {c: clang-cl, cxx: clang-cl} shared_library: 'ON' level_zero_provider: 'ON' + cuda_provider: 'ON' toolset: "-T ClangCL" - os: 'windows-2022' build_type: Release compiler: {c: cl, cxx: cl} shared_library: 'ON' level_zero_provider: 'ON' + cuda_provider: 'ON' - os: 'windows-2022' build_type: Release compiler: {c: cl, cxx: cl} shared_library: 'ON' level_zero_provider: 'OFF' + cuda_provider: 'OFF' runs-on: ${{matrix.os}} @@ -247,6 +262,7 @@ jobs: -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON -DUMF_BUILD_LEVEL_ZERO_PROVIDER=${{matrix.level_zero_provider}} + -DUMF_BUILD_CUDA_PROVIDER=${{matrix.cuda_provider}} -DUMF_TESTS_FAIL_ON_SKIP=ON - name: Build UMF @@ -305,6 +321,7 @@ jobs: -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=OFF -DUMF_BUILD_LEVEL_ZERO_PROVIDER=ON + -DUMF_BUILD_CUDA_PROVIDER=ON -DUMF_TESTS_FAIL_ON_SKIP=ON -DUMF_LINK_HWLOC_STATICALLY=ON @@ -347,6 +364,7 @@ jobs: -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=OFF -DUMF_BUILD_LEVEL_ZERO_PROVIDER=ON + -DUMF_BUILD_CUDA_PROVIDER=ON -DUMF_TESTS_FAIL_ON_SKIP=ON -DUMF_LINK_HWLOC_STATICALLY=ON diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 
de48173bfd..41710029c8 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -63,6 +63,7 @@ jobs: -DUMF_FORMAT_CODE_STYLE=OFF -DUMF_DEVELOPER_MODE=OFF -DUMF_BUILD_LEVEL_ZERO_PROVIDER=ON + -DUMF_BUILD_CUDA_PROVIDER=ON -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index b449eb23e7..a444234205 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -84,6 +84,7 @@ jobs: -DUMF_DEVELOPER_MODE=ON -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON -DUMF_BUILD_LEVEL_ZERO_PROVIDER=ON + -DUMF_BUILD_CUDA_PROVIDER=ON -DUMF_TESTS_FAIL_ON_SKIP=ON - name: Build diff --git a/.github/workflows/fast.yml b/.github/workflows/fast.yml index 1e980c3e2d..997c4441c9 100644 --- a/.github/workflows/fast.yml +++ b/.github/workflows/fast.yml @@ -106,6 +106,7 @@ jobs: -DUMF_BUILD_TESTS=${{matrix.build_tests}} -DUMF_BUILD_EXAMPLES=ON -DUMF_BUILD_LEVEL_ZERO_PROVIDER=ON + -DUMF_BUILD_CUDA_PROVIDER=ON -DUMF_TESTS_FAIL_ON_SKIP=ON -DUMF_BUILD_SHARED_LIBRARY=ON ${{matrix.extra_build_options}} diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml index 3024b9f7ed..1d45eaf811 100644 --- a/.github/workflows/gpu.yml +++ b/.github/workflows/gpu.yml @@ -1,6 +1,3 @@ -# This workflow builds and tests providers using GPU memory. It requires -# "level_zero" labeled self-hosted runners installed on systems with the -# appropriate GPU and drivers. 
name: GPU on: [workflow_call] @@ -63,6 +60,7 @@ jobs: -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON -DUMF_BUILD_LEVEL_ZERO_PROVIDER=ON + -DUMF_BUILD_CUDA_PROVIDER=OFF -DUMF_TESTS_FAIL_ON_SKIP=ON - name: Configure build for Ubuntu @@ -84,9 +82,11 @@ jobs: -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON -DUMF_BUILD_LEVEL_ZERO_PROVIDER=ON + -DUMF_BUILD_CUDA_PROVIDER=OFF -DUMF_TESTS_FAIL_ON_SKIP=ON - name: Build UMF + if: matrix.os == 'Ubuntu' run: cmake --build ${{env.BUILD_DIR}} --config ${{env.BUILD_TYPE}} -j ${{matrix.number_of_processors}} - name: Run tests @@ -100,3 +100,66 @@ jobs: - name: Run benchmarks working-directory: ${{env.BUILD_DIR}} run: ctest --output-on-failure --test-dir benchmark -C ${{env.BUILD_TYPE}} --exclude-regex umf-bench-multithreaded + + gpu-CUDA: + name: Build + env: + BUILD_TYPE: Release + # run only on upstream; forks will not have the HW + # if: github.repository == 'oneapi-src/unified-memory-framework' + strategy: + matrix: + shared_library: ['ON', 'OFF'] + # TODO add windows + os: ['Ubuntu'] + include: + - os: 'Ubuntu' + compiler: {c: gcc, cxx: g++} + number_of_processors: '$(nproc)' + + runs-on: ["DSS-CUDA", "DSS-${{matrix.os}}"] + steps: + - name: Checkout + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + fetch-depth: 0 + + - name: Get information about platform + if: matrix.os == 'Ubuntu' + run: .github/scripts/get_system_info.sh + + - name: Configure build for Ubuntu + if: matrix.os == 'Ubuntu' + run: > + cmake -B ${{env.BUILD_DIR}} + -DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}" + -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} + -DCMAKE_C_COMPILER=${{matrix.compiler.c}} + -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} + -DUMF_BUILD_SHARED_LIBRARY=${{matrix.shared_library}} + -DUMF_BUILD_BENCHMARKS=ON + -DUMF_BUILD_TESTS=ON + -DUMF_BUILD_GPU_TESTS=ON + -DUMF_BUILD_GPU_EXAMPLES=ON + -DUMF_FORMAT_CODE_STYLE=OFF + -DUMF_DEVELOPER_MODE=ON + 
-DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON + -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON + -DUMF_BUILD_LEVEL_ZERO_PROVIDER=OFF + -DUMF_BUILD_CUDA_PROVIDER=ON + -DUMF_TESTS_FAIL_ON_SKIP=ON + + - name: Build UMF + run: cmake --build ${{env.BUILD_DIR}} --config ${{env.BUILD_TYPE}} -j ${{matrix.number_of_processors}} + + - name: Run tests + working-directory: ${{env.BUILD_DIR}} + run: ctest -C ${{env.BUILD_TYPE}} --output-on-failure --test-dir test + + - name: Run examples + working-directory: ${{env.BUILD_DIR}} + run: ctest --output-on-failure --test-dir examples -C ${{env.BUILD_TYPE}} + + - name: Run benchmarks + working-directory: ${{env.BUILD_DIR}} + run: ctest --output-on-failure --test-dir benchmark -C ${{env.BUILD_TYPE}} --exclude-regex umf-bench-multithreaded diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index f2bf8f08fd..89317cc637 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -79,6 +79,7 @@ jobs: -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON -DUMF_BUILD_LEVEL_ZERO_PROVIDER=OFF + -DUMF_BUILD_CUDA_PROVIDER=OFF -DUMF_USE_VALGRIND=1 -DUMF_TESTS_FAIL_ON_SKIP=ON diff --git a/.github/workflows/pr_push.yml b/.github/workflows/pr_push.yml index 4c7a27c1da..02b7adf9f8 100644 --- a/.github/workflows/pr_push.yml +++ b/.github/workflows/pr_push.yml @@ -37,6 +37,7 @@ jobs: -DUMF_FORMAT_CODE_STYLE=ON -DUMF_BUILD_TESTS=OFF -DUMF_BUILD_LEVEL_ZERO_PROVIDER=OFF + -DUMF_BUILD_CUDA_PROVIDER=OFF -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=OFF - name: Check C/C++ formatting diff --git a/.github/workflows/sanitizers.yml b/.github/workflows/sanitizers.yml index 2a09f60fe0..2c63ebd51b 100644 --- a/.github/workflows/sanitizers.yml +++ b/.github/workflows/sanitizers.yml @@ -55,6 +55,7 @@ jobs: -DCMAKE_C_COMPILER=${{matrix.compiler.c}} -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} -DUMF_BUILD_LEVEL_ZERO_PROVIDER=ON + -DUMF_BUILD_CUDA_PROVIDER=ON -DUMF_FORMAT_CODE_STYLE=OFF -DUMF_DEVELOPER_MODE=ON 
-DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON @@ -132,6 +133,7 @@ jobs: -DUMF_USE_ASAN=${{matrix.sanitizers.asan}} -DUMF_BUILD_EXAMPLES=ON -DUMF_BUILD_LEVEL_ZERO_PROVIDER=OFF + -DUMF_BUILD_CUDA_PROVIDER=OFF -DUMF_TESTS_FAIL_ON_SKIP=ON - name: Build UMF diff --git a/.github/workflows/valgrind.yml b/.github/workflows/valgrind.yml index 53569385ea..40d4e65352 100644 --- a/.github/workflows/valgrind.yml +++ b/.github/workflows/valgrind.yml @@ -35,6 +35,7 @@ jobs: -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON -DUMF_BUILD_LEVEL_ZERO_PROVIDER=OFF + -DUMF_BUILD_CUDA_PROVIDER=OFF -DUMF_USE_VALGRIND=1 -DUMF_TESTS_FAIL_ON_SKIP=ON diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ceb753683..83c0e0c4e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,7 @@ find_package(PkgConfig) # Build Options option(UMF_BUILD_SHARED_LIBRARY "Build UMF as shared library" OFF) option(UMF_BUILD_LEVEL_ZERO_PROVIDER "Build Level Zero memory provider" ON) +option(UMF_BUILD_CUDA_PROVIDER "Build CUDA memory provider" ON) option(UMF_BUILD_LIBUMF_POOL_DISJOINT "Build the libumf_pool_disjoint static library" OFF) option(UMF_BUILD_LIBUMF_POOL_JEMALLOC @@ -417,6 +418,11 @@ if(UMF_BUILD_LEVEL_ZERO_PROVIDER) add_optional_symbol(umfLevelZeroMemoryProviderOps) endif() +# Conditional configuration for CUDA provider +if(UMF_BUILD_CUDA_PROVIDER) + add_optional_symbol(umfCUDAMemoryProviderOps) +endif() + if(NOT UMF_DISABLE_HWLOC) add_optional_symbol(umfOsMemoryProviderOps) if(LINUX) diff --git a/README.md b/README.md index 64894ecf22..59e3140888 100644 --- a/README.md +++ b/README.md @@ -19,10 +19,11 @@ The Unified Memory Framework (UMF) is a library for constructing allocators and For a quick introduction to UMF usage, please see [examples](https://oneapi-src.github.io/unified-memory-framework/examples.html) documentation, which includes the code of the -[basic example](https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/basic/basic.c) -and the more advanced 
one that allocates -[USM memory from the GPU device](https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/basic/gpu_shared_memory.c) -using the Level Zero API and UMF Level Zero memory provider. +[basic example](https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/basic/basic.c). +There are also more advanced examples that allocate USM memory from the +[Level Zero device](https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/level_zero_shared_memory/level_zero_shared_memory.c) +using the Level Zero API and UMF Level Zero memory provider and [CUDA device](https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/cuda_shared_memory/cuda_shared_memory.c) +using the CUDA API and UMF CUDA memory provider. ## Build @@ -101,6 +102,7 @@ List of options provided by CMake: | - | - | - | - | | UMF_BUILD_SHARED_LIBRARY | Build UMF as shared library | ON/OFF | OFF | | UMF_BUILD_LEVEL_ZERO_PROVIDER | Build Level Zero memory provider | ON/OFF | ON | +| UMF_BUILD_CUDA_PROVIDER | Build CUDA memory provider | ON/OFF | ON | | UMF_BUILD_LIBUMF_POOL_DISJOINT | Build the libumf_pool_disjoint static library | ON/OFF | OFF | | UMF_BUILD_LIBUMF_POOL_JEMALLOC | Build the libumf_pool_jemalloc static library | ON/OFF | OFF | | UMF_BUILD_TESTS | Build UMF tests | ON/OFF | ON | @@ -188,6 +190,22 @@ with the `disable_provider_free` parameter set to true. 1) Linux OS 2) A character device file /dev/daxX.Y created in the OS. +#### CUDA memory provider +A memory provider that provides memory from a CUDA device. 
+ +##### Requirements + +1) Linux or Windows OS +2) The `UMF_BUILD_CUDA_PROVIDER` option turned `ON` (by default) + +Additionally, required for tests: + +3) The `UMF_BUILD_GPU_TESTS` option turned `ON` +4) System with CUDA compatible GPU +5) Required packages: + - nvidia-cuda-dev (Linux) or cuda-sdk (Windows) + ### Memory pool managers #### Proxy pool (part of libumf) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 455b9bc061..cbb6468ab0 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -85,6 +85,10 @@ function(add_umf_benchmark) target_compile_definitions(${BENCH_NAME} PRIVATE UMF_BUILD_LEVEL_ZERO_PROVIDER=1) endif() + if(UMF_BUILD_CUDA_PROVIDER) + target_compile_definitions(${BENCH_NAME} + PRIVATE UMF_BUILD_CUDA_PROVIDER=1) + endif() if(UMF_BUILD_GPU_TESTS) target_compile_definitions(${BENCH_NAME} PRIVATE UMF_BUILD_GPU_TESTS=1) endif() @@ -103,8 +107,9 @@ endif() if(LINUX) set(LIBS_OPTIONAL ${LIBS_OPTIONAL} m) endif() -if(UMF_BUILD_GPU_TESTS) +if(UMF_BUILD_GPU_TESTS AND UMF_BUILD_LEVEL_ZERO_PROVIDER) set(LIBS_OPTIONAL ${LIBS_OPTIONAL} ze_loader) + # TODO add CUDA endif() # BENCHMARKS diff --git a/benchmark/ubench.c b/benchmark/ubench.c index 900b7b85ce..f70f19fb38 100644 --- a/benchmark/ubench.c +++ b/benchmark/ubench.c @@ -30,7 +30,8 @@ #include "utils_common.h" -#if (defined UMF_BUILD_GPU_TESTS) +#if (defined UMF_BUILD_LIBUMF_POOL_DISJOINT && \ + defined UMF_BUILD_LEVEL_ZERO_PROVIDER && defined UMF_BUILD_GPU_TESTS) #include "utils_level_zero.h" #endif @@ -505,6 +506,8 @@ UBENCH_EX(ipc, disjoint_pool_with_level_zero_provider) { } #endif /* (defined UMF_BUILD_LIBUMF_POOL_DISJOINT && defined UMF_BUILD_LEVEL_ZERO_PROVIDER && defined UMF_BUILD_GPU_TESTS) */ +// TODO add IPC benchmark for CUDA + UBENCH_MAIN() #if defined(_MSC_VER) diff --git a/cmake/FindCUDA.cmake b/cmake/FindCUDA.cmake new file mode 100644 index 0000000000..92ef5c830a --- /dev/null +++ b/cmake/FindCUDA.cmake @@ -0,0 +1,35 @@ +# Copyright (C) 2024 Intel 
Corporation +# Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +message(STATUS "Checking for module 'cuda' using find_library()") + +find_library(CUDA_LIBRARY NAMES libcuda cuda) +set(CUDA_LIBRARIES ${CUDA_LIBRARY}) + +get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARIES} DIRECTORY) +set(CUDA_LIBRARY_DIRS ${CUDA_LIB_DIR}) + +if(WINDOWS) + find_file(CUDA_DLL NAMES "bin/cuda.dll" "cuda.dll") + get_filename_component(CUDA_DLL_DIR ${CUDA_DLL} DIRECTORY) + set(CUDA_DLL_DIRS ${CUDA_DLL_DIR}) +endif() + +if(CUDA_LIBRARY) + message(STATUS " Found cuda using find_library()") + message(STATUS " CUDA_LIBRARIES = ${CUDA_LIBRARIES}") + message(STATUS " CUDA_INCLUDE_DIRS = ${CUDA_INCLUDE_DIRS}") + message(STATUS " CUDA_LIBRARY_DIRS = ${CUDA_LIBRARY_DIRS}") + if(WINDOWS) + message(STATUS " CUDA_DLL_DIRS = ${CUDA_DLL_DIRS}") + endif() +else() + set(MSG_NOT_FOUND "cuda NOT found (set CMAKE_PREFIX_PATH to point the " + "location)") + if(CUDA_FIND_REQUIRED) + message(FATAL_ERROR ${MSG_NOT_FOUND}) + else() + message(WARNING ${MSG_NOT_FOUND}) + endif() +endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 8b61c82a56..7c74326f21 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -44,16 +44,16 @@ endif() if(UMF_BUILD_GPU_EXAMPLES AND UMF_BUILD_LIBUMF_POOL_DISJOINT AND UMF_BUILD_LEVEL_ZERO_PROVIDER) - set(EXAMPLE_NAME umf_example_gpu_shared_memory) + set(EXAMPLE_NAME umf_example_level_zero_shared_memory) add_umf_executable( NAME ${EXAMPLE_NAME} - SRCS gpu_shared_memory/gpu_shared_memory.c + SRCS level_zero_shared_memory/level_zero_shared_memory.c LIBS disjoint_pool ze_loader umf) target_include_directories( ${EXAMPLE_NAME} - PRIVATE ${UMF_CMAKE_SOURCE_DIR}/src/utils + PRIVATE ${LEVEL_ZERO_INCLUDE_DIRS} ${UMF_CMAKE_SOURCE_DIR}/src/utils ${UMF_CMAKE_SOURCE_DIR}/include ${UMF_CMAKE_SOURCE_DIR}/examples/common) @@ -66,6 +66,43 @@ if(UMF_BUILD_GPU_EXAMPLES 
set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "example") + if(WINDOWS) + # append PATH to DLLs + set_property(TEST ${EXAMPLE_NAME} PROPERTY ENVIRONMENT_MODIFICATION + "${DLL_PATH_LIST}") + endif() +else() + message(STATUS "GPU Level Zero shared memory example requires " + "UMF_BUILD_GPU_EXAMPLES, UMF_BUILD_LEVEL_ZERO_PROVIDER and " + "UMF_BUILD_LIBUMF_POOL_DISJOINT to be turned ON - skipping") +endif() + +if(UMF_BUILD_GPU_EXAMPLES + AND UMF_BUILD_LIBUMF_POOL_DISJOINT + AND UMF_BUILD_CUDA_PROVIDER) + set(EXAMPLE_NAME umf_example_cuda_shared_memory) + + add_umf_executable( + NAME ${EXAMPLE_NAME} + SRCS cuda_shared_memory/cuda_shared_memory.c + LIBS disjoint_pool cuda umf) + + target_include_directories( + ${EXAMPLE_NAME} + PRIVATE ${CUDA_INCLUDE_DIRS} ${UMF_CMAKE_SOURCE_DIR}/src/utils + ${UMF_CMAKE_SOURCE_DIR}/include + ${UMF_CMAKE_SOURCE_DIR}/examples/common) + + target_link_directories(${EXAMPLE_NAME} PRIVATE ${LIBHWLOC_LIBRARY_DIRS} + ${CUDA_LIBRARY_DIRS}) + + add_test( + NAME ${EXAMPLE_NAME} + COMMAND ${EXAMPLE_NAME} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + + set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "example") + if(WINDOWS) # append PATH to DLLs set_property(TEST ${EXAMPLE_NAME} PROPERTY ENVIRONMENT_MODIFICATION @@ -74,8 +111,8 @@ if(UMF_BUILD_GPU_EXAMPLES else() message( STATUS - "GPU shared memory example requires UMF_BUILD_GPU_EXAMPLES, " - "UMF_BUILD_LEVEL_ZERO_PROVIDER and UMF_BUILD_LIBUMF_POOL_DISJOINT " + "GPU CUDA shared memory example requires UMF_BUILD_GPU_EXAMPLES, " + "UMF_BUILD_CUDA_PROVIDER and UMF_BUILD_LIBUMF_POOL_DISJOINT " "to be turned ON - skipping") endif() diff --git a/examples/cuda_shared_memory/CMakeLists.txt b/examples/cuda_shared_memory/CMakeLists.txt new file mode 100644 index 0000000000..a30621887e --- /dev/null +++ b/examples/cuda_shared_memory/CMakeLists.txt @@ -0,0 +1,74 @@ +# Copyright (C) 2024 Intel Corporation +# Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +cmake_minimum_required(VERSION 3.14.0 FATAL_ERROR) +project(umf_example_cuda_shared_memory LANGUAGES C) +enable_testing() + +set(UMF_EXAMPLE_DIR "${CMAKE_SOURCE_DIR}/..") +list(APPEND CMAKE_MODULE_PATH "${UMF_EXAMPLE_DIR}/cmake") +message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}") + +find_package(PkgConfig) +pkg_check_modules(LIBUMF libumf) +if(NOT LIBUMF_FOUND) + find_package(LIBUMF REQUIRED libumf) +endif() + +pkg_check_modules(LIBHWLOC hwloc>=2.3.0) +if(NOT LIBHWLOC_FOUND) + find_package(LIBHWLOC 2.3.0 REQUIRED hwloc) +endif() + +include(FetchContent) + +set(CUDA_REPO "https://gitlab.com/nvidia/headers/cuda-individual/cudart.git") +set(CUDA_TAG cuda-12.5.1) + +message(STATUS "Fetching CUDA ${CUDA_TAG} from ${CUDA_REPO} ...") + +FetchContent_Declare( + cuda-headers + GIT_REPOSITORY ${CUDA_REPO} + GIT_TAG ${CUDA_TAG} + EXCLUDE_FROM_ALL) + +FetchContent_GetProperties(cuda-headers) +if(NOT cuda-headers_POPULATED) + FetchContent_Populate(cuda-headers) +endif() + +set(CUDA_INCLUDE_DIRS + ${cuda-headers_SOURCE_DIR} + CACHE PATH "Path to CUDA headers") +message(STATUS "CUDA include directory: ${CUDA_INCLUDE_DIRS}") +# build the example +set(EXAMPLE_NAME umf_example_cuda_shared_memory) +add_executable(${EXAMPLE_NAME} cuda_shared_memory.c) +target_include_directories( + ${EXAMPLE_NAME} PRIVATE ${CUDA_INCLUDE_DIRS} ${LIBUMF_INCLUDE_DIRS} + ${UMF_EXAMPLE_DIR}/common) +target_link_directories(${EXAMPLE_NAME} PRIVATE ${LIBUMF_LIBRARY_DIRS} + ${LIBHWLOC_LIBRARY_DIRS}) +target_link_options(${EXAMPLE_NAME} PRIVATE "-Wl,--start-group") +target_link_libraries(${EXAMPLE_NAME} PRIVATE stdc++ libdisjoint_pool.a cuda + ${LIBUMF_LIBRARIES}) + +# an optional part - adds a test of this example +add_test( + NAME ${EXAMPLE_NAME} + COMMAND ${EXAMPLE_NAME} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + +set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "example-standalone") + +if(LINUX) + # set LD_LIBRARY_PATH + 
set_property( + TEST ${EXAMPLE_NAME} + PROPERTY + ENVIRONMENT_MODIFICATION + "LD_LIBRARY_PATH=path_list_append:${LIBUMF_LIBRARY_DIRS};LD_LIBRARY_PATH=path_list_append:${LIBHWLOC_LIBRARY_DIRS}" + ) +endif() diff --git a/examples/cuda_shared_memory/cuda_shared_memory.c b/examples/cuda_shared_memory/cuda_shared_memory.c new file mode 100644 index 0000000000..4b30935226 --- /dev/null +++ b/examples/cuda_shared_memory/cuda_shared_memory.c @@ -0,0 +1,115 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + */ + +#include +#include + +#include +#include +#include + +#include + +int main(void) { + // A result object for storing UMF API result status + umf_result_t res; + + CUdevice cuDevice; + CUcontext cuContext; + int ret = 0; + + // Initialize the CUDA driver API + cuInit(0); + + // Get the handle to the first CUDA device + cuDeviceGet(&cuDevice, 0); + + // Create a context on the device + cuCtxCreate(&cuContext, 0, cuDevice); + + // Setup parameters for the CUDA memory provider. It will be used for + // allocating memory from CUDA devices. + cuda_memory_provider_params_t cu_memory_provider_params; + cu_memory_provider_params.cuda_context_handle = cuContext; + cu_memory_provider_params.cuda_device_handle = cuDevice; + // Set the memory type to shared to allow the memory to be accessed on both + // CPU and GPU. + cu_memory_provider_params.memory_type = UMF_MEMORY_TYPE_SHARED; + + // Create CUDA memory provider + umf_memory_provider_handle_t cu_memory_provider; + res = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + &cu_memory_provider_params, + &cu_memory_provider); + if (res != UMF_RESULT_SUCCESS) { + fprintf(stderr, "Failed to create a memory provider!\n"); + ret = -1; + goto cuda_destroy; + } + + printf("CUDA memory provider created at %p\n", (void *)cu_memory_provider); + + // Setup parameters for the Disjoint Pool. 
It will be used for managing the + // memory allocated using memory provider. + umf_disjoint_pool_params_t disjoint_memory_pool_params = + umfDisjointPoolParamsDefault(); + // Set the Slab Min Size to 64KB - the page size for GPU allocations + disjoint_memory_pool_params.SlabMinSize = 64 * 1024L; + // We would keep only single slab per each allocation bucket + disjoint_memory_pool_params.Capacity = 1; + // Set the maximum poolable size to 64KB - objects with size above this + // limit will not be stored/allocated from the pool. + disjoint_memory_pool_params.MaxPoolableSize = 64 * 1024L; + // Enable tracing + disjoint_memory_pool_params.PoolTrace = 1; + + // Create Disjoint Pool memory pool. + umf_memory_pool_handle_t cu_disjoint_memory_pool; + res = umfPoolCreate(umfDisjointPoolOps(), cu_memory_provider, + &disjoint_memory_pool_params, UMF_POOL_CREATE_FLAG_NONE, + &cu_disjoint_memory_pool); + if (res != UMF_RESULT_SUCCESS) { + fprintf(stderr, "Failed to create a memory pool!\n"); + ret = -1; + goto memory_provider_destroy; + } + + printf("Disjoint Pool created at %p\n", (void *)cu_disjoint_memory_pool); + + // Allocate some memory from the pool + int *ptr = umfPoolMalloc(cu_disjoint_memory_pool, sizeof(int)); + if (res != UMF_RESULT_SUCCESS) { + fprintf(stderr, "Failed to allocate memory from the memory pool!\n"); + ret = -1; + goto memory_pool_destroy; + } + + // Use allocated memory + *ptr = 1; + + // Free allocated memory + res = umfFree(ptr); + if (res != UMF_RESULT_SUCCESS) { + fprintf(stderr, "Failed to free memory to the pool!\n"); + ret = -1; + goto memory_pool_destroy; + } + printf("Freed memory at %p\n", (void *)ptr); + + // Cleanup +memory_pool_destroy: + umfPoolDestroy(cu_disjoint_memory_pool); + +memory_provider_destroy: + umfMemoryProviderDestroy(cu_memory_provider); + +cuda_destroy: + ret = cuCtxDestroy(cuContext); + return ret; +} diff --git a/examples/gpu_shared_memory/CMakeLists.txt b/examples/level_zero_shared_memory/CMakeLists.txt similarity 
index 93% rename from examples/gpu_shared_memory/CMakeLists.txt rename to examples/level_zero_shared_memory/CMakeLists.txt index 659d223975..86d22941fb 100644 --- a/examples/gpu_shared_memory/CMakeLists.txt +++ b/examples/level_zero_shared_memory/CMakeLists.txt @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception cmake_minimum_required(VERSION 3.14.0 FATAL_ERROR) -project(umf_example_gpu_shared_memory LANGUAGES C) +project(umf_example_level_zero_shared_memory LANGUAGES C) enable_testing() set(UMF_EXAMPLE_DIR "${CMAKE_SOURCE_DIR}/..") @@ -48,8 +48,8 @@ set(LEVEL_ZERO_INCLUDE_DIRS message(STATUS "Level Zero include directory: ${LEVEL_ZERO_INCLUDE_DIRS}") # build the example -set(EXAMPLE_NAME umf_example_gpu_shared_memory) -add_executable(${EXAMPLE_NAME} gpu_shared_memory.c) +set(EXAMPLE_NAME umf_example_level_zero_shared_memory) +add_executable(${EXAMPLE_NAME} level_zero_shared_memory.c) target_include_directories(${EXAMPLE_NAME} PRIVATE ${LIBUMF_INCLUDE_DIRS} ${UMF_EXAMPLE_DIR}/common) target_link_directories(${EXAMPLE_NAME} PRIVATE ${LIBUMF_LIBRARY_DIRS} diff --git a/examples/gpu_shared_memory/gpu_shared_memory.c b/examples/level_zero_shared_memory/level_zero_shared_memory.c similarity index 100% rename from examples/gpu_shared_memory/gpu_shared_memory.c rename to examples/level_zero_shared_memory/level_zero_shared_memory.c diff --git a/include/umf/memory_provider_gpu.h b/include/umf/memory_provider_gpu.h new file mode 100644 index 0000000000..cc3cc3e3e3 --- /dev/null +++ b/include/umf/memory_provider_gpu.h @@ -0,0 +1,31 @@ +/* + * + * Copyright (C) 2023-2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + */ + +#ifndef UMF_MEMORY_PROVIDER_GPU_H +#define UMF_MEMORY_PROVIDER_GPU_H 1 + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/// @brief USM memory allocation type +typedef enum umf_usm_memory_type_t { + UMF_MEMORY_TYPE_UNKNOWN = 0, ///< The memory pointed to is of unknown type + UMF_MEMORY_TYPE_HOST, ///< The memory pointed to is a host allocation + UMF_MEMORY_TYPE_DEVICE, ///< The memory pointed to is a device allocation + UMF_MEMORY_TYPE_SHARED, ///< The memory pointed to is a shared ownership allocation +} umf_usm_memory_type_t; + +#ifdef __cplusplus +} +#endif + +#endif /* UMF_MEMORY_PROVIDER_GPU_H */ diff --git a/include/umf/providers/provider_cuda.h b/include/umf/providers/provider_cuda.h new file mode 100644 index 0000000000..2f6a07d816 --- /dev/null +++ b/include/umf/providers/provider_cuda.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#ifndef UMF_PROVIDER_CUDA_H +#define UMF_PROVIDER_CUDA_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/// @brief CUDA Memory Provider settings struct +typedef struct cuda_memory_provider_params_t { + void *cuda_context_handle; ///< Handle to the CUDA context + int cuda_device_handle; ///< Handle to the CUDA device + umf_usm_memory_type_t memory_type; ///< Allocation memory type +} cuda_memory_provider_params_t; + +umf_memory_provider_ops_t *umfCUDAMemoryProviderOps(void); + +#ifdef __cplusplus +} +#endif + +#endif /* UMF_PROVIDER_CUDA_H */ diff --git a/include/umf/providers/provider_level_zero.h b/include/umf/providers/provider_level_zero.h index 9685c8530e..b3cc028515 100644 --- a/include/umf/providers/provider_level_zero.h +++ b/include/umf/providers/provider_level_zero.h @@ -8,22 +8,14 @@ #ifndef UMF_PROVIDER_LEVEL_ZERO_H #define UMF_PROVIDER_LEVEL_ZERO_H -#include "umf/memory_provider.h" +#include #ifdef __cplusplus extern "C" { #endif -typedef struct _ze_context_handle_t *ze_context_handle_t; typedef struct _ze_device_handle_t *ze_device_handle_t; - -/// @brief USM memory allocation type -typedef enum umf_usm_memory_type_t { - UMF_MEMORY_TYPE_UNKNOWN = 0, ///< The memory pointed to is of unknown type - UMF_MEMORY_TYPE_HOST, ///< The memory pointed to is a host allocation - UMF_MEMORY_TYPE_DEVICE, ///< The memory pointed to is a device allocation - UMF_MEMORY_TYPE_SHARED, ///< The memory pointed to is a shared ownership allocation -} umf_usm_memory_type_t; +typedef struct _ze_context_handle_t *ze_context_handle_t; /// @brief Level Zero Memory Provider settings struct typedef struct level_zero_memory_provider_params_t { diff --git a/scripts/docs_config/examples.rst b/scripts/docs_config/examples.rst index 1a76eea2a8..0f88fcc40c 100644 --- a/scripts/docs_config/examples.rst +++ b/scripts/docs_config/examples.rst @@ -111,8 +111,8 @@ Freeing memory is as easy as can be:: GPU shared 
memory ============================================================================== -You can find the full example code in the `examples/gpu_shared_memory/gpu_shared_memory.c`_ file -in the UMF repository. +You can find the full example code in the `examples/level_zero_shared_memory/level_zero_shared_memory.c`_ file +or `examples/cuda_shared_memory/cuda_shared_memory.c`_ file in the UMF repository. TODO @@ -209,7 +209,8 @@ function is called on the consumer side. The memory mappings on the consumer sid the :any:`umfCloseIPCHandle` function is called. .. _examples/basic/basic.c: https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/basic/basic.c -.. _examples/gpu_shared_memory/gpu_shared_memory.c: https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/gpu_shared_memory/gpu_shared_memory.c +.. _examples/level_zero_shared_memory/level_zero_shared_memory.c: https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/level_zero_shared_memory/level_zero_shared_memory.c +.. _examples/cuda_shared_memory/cuda_shared_memory.c: https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/cuda_shared_memory/cuda_shared_memory.c .. _examples/ipc_level_zero/ipc_level_zero.c: https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/ipc_level_zero/ipc_level_zero.c .. _examples/custom_provider/file_provider.c: https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/custom_provider/file_provider.c .. _examples/memspace: https://github.com/oneapi-src/unified-memory-framework/blob/main/examples/memspace/ diff --git a/scripts/qemu/run-build.sh b/scripts/qemu/run-build.sh index 5ed1e43daa..666bd2200a 100755 --- a/scripts/qemu/run-build.sh +++ b/scripts/qemu/run-build.sh @@ -15,6 +15,7 @@ cd build cmake .. 
\ -DCMAKE_BUILD_TYPE=Debug \ -DUMF_BUILD_LEVEL_ZERO_PROVIDER=ON \ + -DUMF_BUILD_CUDA_PROVIDER=ON \ -DUMF_FORMAT_CODE_STYLE=OFF \ -DUMF_DEVELOPER_MODE=ON \ -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 00aeb8a478..4419402f9f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -48,6 +48,32 @@ elseif(UMF_BUILD_LEVEL_ZERO_PROVIDER) message(STATUS "Level Zero include directory: ${LEVEL_ZERO_INCLUDE_DIRS}") endif() +if(UMF_BUILD_CUDA_PROVIDER) + include(FetchContent) + + set(CUDA_REPO + "https://gitlab.com/nvidia/headers/cuda-individual/cudart.git") + set(CUDA_TAG cuda-12.5.1) + + message(STATUS "Fetching CUDA ${CUDA_TAG} from ${CUDA_REPO} ...") + + FetchContent_Declare( + cuda-headers + GIT_REPOSITORY ${CUDA_REPO} + GIT_TAG ${CUDA_TAG} + EXCLUDE_FROM_ALL) + + FetchContent_GetProperties(cuda-headers) + if(NOT cuda-headers_POPULATED) + FetchContent_Populate(cuda-headers) + endif() + + set(CUDA_INCLUDE_DIRS + ${cuda-headers_SOURCE_DIR} + CACHE PATH "Path to CUDA headers") + message(STATUS "CUDA include directory: ${CUDA_INCLUDE_DIRS}") +endif() + add_subdirectory(utils) set(UMF_LIBS $) @@ -219,6 +245,12 @@ if(UMF_BUILD_LEVEL_ZERO_PROVIDER) "UMF_BUILD_LEVEL_ZERO_PROVIDER=1") endif() +if(UMF_BUILD_CUDA_PROVIDER) + target_sources(umf PRIVATE provider/provider_cuda.c) + set(UMF_COMPILE_DEFINITIONS ${UMF_COMPILE_DEFINITIONS} + "UMF_BUILD_CUDA_PROVIDER=1") +endif() + add_library(${PROJECT_NAME}::umf ALIAS umf) if(LIBHWLOC_INCLUDE_DIRS) @@ -229,6 +261,10 @@ if(LEVEL_ZERO_INCLUDE_DIRS) target_include_directories(umf PRIVATE ${LEVEL_ZERO_INCLUDE_DIRS}) endif() +if(CUDA_INCLUDE_DIRS) + target_include_directories(umf PRIVATE ${CUDA_INCLUDE_DIRS}) +endif() + target_include_directories( umf PUBLIC $ diff --git a/src/provider/provider_cuda.c b/src/provider/provider_cuda.c new file mode 100644 index 0000000000..5dfe0f9fae --- /dev/null +++ b/src/provider/provider_cuda.c @@ -0,0 +1,344 @@ +/* + * Copyright (C) 2024 Intel 
Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +*/ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include <umf.h> +#include <umf/providers/provider_cuda.h> + +#include "cuda.h" + +#include "base_alloc_global.h" +#include "utils_assert.h" +#include "utils_common.h" +#include "utils_concurrency.h" +#include "utils_load_library.h" +#include "utils_log.h" +#include "utils_sanitizers.h" + +typedef struct cu_memory_provider_t { + CUcontext context; + CUdevice device; + umf_usm_memory_type_t memory_type; + size_t min_alignment; +} cu_memory_provider_t; + +typedef struct cu_ops_t { + CUresult (*cuMemGetAllocationGranularity)( + size_t *granularity, const CUmemAllocationProp *prop, + CUmemAllocationGranularity_flags option); + CUresult (*cuMemAlloc)(CUdeviceptr *dptr, size_t bytesize); + CUresult (*cuMemAllocHost)(void **pp, size_t bytesize); + CUresult (*cuMemAllocManaged)(CUdeviceptr *dptr, size_t bytesize, + unsigned int flags); + CUresult (*cuMemFree)(CUdeviceptr dptr); + CUresult (*cuMemFreeHost)(void *p); + + CUresult (*cuGetErrorName)(CUresult error, const char **pStr); + CUresult (*cuGetErrorString)(CUresult error, const char **pStr); +} cu_ops_t; + +static cu_ops_t g_cu_ops; +static UTIL_ONCE_FLAG cu_is_initialized = UTIL_ONCE_FLAG_INIT; +static bool Init_cu_global_state_failed; + +#define TLS_MSG_BUF_LEN 1024 + +typedef struct cu_last_native_error_t { + CUresult native_error; + char msg_buff[TLS_MSG_BUF_LEN]; +} cu_last_native_error_t; + +static __TLS cu_last_native_error_t TLS_last_native_error; + +static void cu_store_last_native_error(CUresult native_error) { + TLS_last_native_error.native_error = native_error; +} + +umf_result_t cu2umf_result(CUresult result) { + switch (result) { + case CUDA_SUCCESS: + return UMF_RESULT_SUCCESS; + case CUDA_ERROR_OUT_OF_MEMORY: + return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY; + case CUDA_ERROR_INVALID_VALUE: + case CUDA_ERROR_INVALID_HANDLE: + case
CUDA_ERROR_INVALID_RESOURCE_TYPE: + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + default: + cu_store_last_native_error(result); + return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; + } +} + +static void init_cu_global_state(void) { +#ifdef _WIN32 + const char *lib_name = "cudart.dll"; +#else + const char *lib_name = "libcuda.so"; +#endif + // check if CUDA shared library is already loaded + // we pass 0 as a handle to search the global symbol table + + // NOTE: some symbols defined in the lib have _vX postfixes - it is + // important to load the proper version of functions + *(void **)&g_cu_ops.cuMemGetAllocationGranularity = + util_get_symbol_addr(0, "cuMemGetAllocationGranularity", lib_name); + *(void **)&g_cu_ops.cuMemAlloc = + util_get_symbol_addr(0, "cuMemAlloc_v2", lib_name); + *(void **)&g_cu_ops.cuMemAllocHost = + util_get_symbol_addr(0, "cuMemAllocHost_v2", lib_name); + *(void **)&g_cu_ops.cuMemAllocManaged = + util_get_symbol_addr(0, "cuMemAllocManaged", lib_name); + *(void **)&g_cu_ops.cuMemFree = + util_get_symbol_addr(0, "cuMemFree_v2", lib_name); + *(void **)&g_cu_ops.cuMemFreeHost = + util_get_symbol_addr(0, "cuMemFreeHost", lib_name); + *(void **)&g_cu_ops.cuGetErrorName = + util_get_symbol_addr(0, "cuGetErrorName", lib_name); + *(void **)&g_cu_ops.cuGetErrorString = + util_get_symbol_addr(0, "cuGetErrorString", lib_name); + + if (!g_cu_ops.cuMemGetAllocationGranularity || !g_cu_ops.cuMemAlloc || + !g_cu_ops.cuMemAllocHost || !g_cu_ops.cuMemAllocManaged || + !g_cu_ops.cuMemFree || !g_cu_ops.cuMemFreeHost || + !g_cu_ops.cuGetErrorName || !g_cu_ops.cuGetErrorString) { + LOG_ERR("Required CUDA symbols not found."); + Init_cu_global_state_failed = true; + } +} + +umf_result_t cu_memory_provider_initialize(void *params, void **provider) { + if (provider == NULL || params == NULL) { + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + + cuda_memory_provider_params_t *cu_params = + (cuda_memory_provider_params_t *)params; + + if (cu_params->memory_type == 
UMF_MEMORY_TYPE_UNKNOWN || + cu_params->memory_type > UMF_MEMORY_TYPE_SHARED) { + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + + if (cu_params->cuda_context_handle == NULL) { + return UMF_RESULT_ERROR_INVALID_ARGUMENT; + } + + util_init_once(&cu_is_initialized, init_cu_global_state); + if (Init_cu_global_state_failed) { + LOG_ERR("Loading CUDA symbols failed"); + return UMF_RESULT_ERROR_UNKNOWN; + } + + cu_memory_provider_t *cu_provider = + umf_ba_global_alloc(sizeof(cu_memory_provider_t)); + if (!cu_provider) { + return UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + // CUDA alloc functions doesn't allow to provide user alignment - get the + // minimum one from the driver + size_t min_alignment = 0; + CUmemAllocationProp allocProps = {0}; + allocProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + allocProps.type = CU_MEM_ALLOCATION_TYPE_PINNED; + allocProps.location.id = cu_provider->device; + CUresult cu_result = g_cu_ops.cuMemGetAllocationGranularity( + &min_alignment, &allocProps, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + if (cu_result != CUDA_SUCCESS) { + umf_ba_global_free(cu_provider); + return cu2umf_result(cu_result); + } + + cu_provider->context = cu_params->cuda_context_handle; + cu_provider->device = cu_params->cuda_device_handle; + cu_provider->memory_type = cu_params->memory_type; + cu_provider->min_alignment = min_alignment; + + *provider = cu_provider; + + return UMF_RESULT_SUCCESS; +} + +void cu_memory_provider_finalize(void *provider) { + ASSERT(provider); + + util_init_once(&cu_is_initialized, init_cu_global_state); + umf_ba_global_free(provider); + + // portable version of "cu_is_initialized = UTIL_ONCE_FLAG_INIT;" + static UTIL_ONCE_FLAG is_initialized = UTIL_ONCE_FLAG_INIT; + memcpy(&cu_is_initialized, &is_initialized, sizeof(cu_is_initialized)); +} + +static umf_result_t cu_memory_provider_alloc(void *provider, size_t size, + size_t alignment, + void **resultPtr) { + ASSERT(provider); + ASSERT(resultPtr); + + cu_memory_provider_t *cu_provider = 
(cu_memory_provider_t *)provider; + + if (alignment > cu_provider->min_alignment) { + // alignment of CUDA allocations is controlled by the CUDA driver - + // currently UMF doesn't support alignment larger than default + return UMF_RESULT_ERROR_NOT_SUPPORTED; + } + + CUresult cu_result = CUDA_SUCCESS; + switch (cu_provider->memory_type) { + case UMF_MEMORY_TYPE_HOST: { + cu_result = g_cu_ops.cuMemAllocHost(resultPtr, size); + break; + } + case UMF_MEMORY_TYPE_DEVICE: { + cu_result = g_cu_ops.cuMemAlloc((CUdeviceptr *)resultPtr, size); + break; + } + case UMF_MEMORY_TYPE_SHARED: { + cu_result = g_cu_ops.cuMemAllocManaged((CUdeviceptr *)resultPtr, size, + CU_MEM_ATTACH_GLOBAL); + break; + } + default: + // this shouldn't happen as we check the memory_type settings during + // the initialization + ASSERT(0); + return UMF_RESULT_ERROR_UNKNOWN; + } + + // check the alignment + ASSERT(alignment == 0 || ((uintptr_t)(*resultPtr) % alignment) == 0); + + return cu2umf_result(cu_result); +} + +static umf_result_t cu_memory_provider_free(void *provider, void *ptr, + size_t bytes) { + (void)bytes; + + ASSERT(provider); + cu_memory_provider_t *cu_provider = (cu_memory_provider_t *)provider; + + CUresult cu_result = CUDA_SUCCESS; + switch (cu_provider->memory_type) { + case UMF_MEMORY_TYPE_HOST: { + cu_result = g_cu_ops.cuMemFreeHost(ptr); + break; + } + case UMF_MEMORY_TYPE_SHARED: + case UMF_MEMORY_TYPE_DEVICE: { + cu_result = g_cu_ops.cuMemFree((CUdeviceptr)ptr); + break; + } + default: + // this shouldn't happen as we check the memory_type settings during + // the initialization + ASSERT(0); + return UMF_RESULT_ERROR_UNKNOWN; + } + + return cu2umf_result(cu_result); +} + +void cu_memory_provider_get_last_native_error(void *provider, + const char **ppMessage, + int32_t *pError) { + (void)provider; + + const char *error_name = 0; + const char *error_string = 0; + g_cu_ops.cuGetErrorName(TLS_last_native_error.native_error, &error_name); + 
g_cu_ops.cuGetErrorString(TLS_last_native_error.native_error, + &error_string); + + size_t buf_size = 0; + strncpy(TLS_last_native_error.msg_buff, error_name, TLS_MSG_BUF_LEN - 1); + buf_size = strlen(TLS_last_native_error.msg_buff); + + strncat(TLS_last_native_error.msg_buff, " - ", + TLS_MSG_BUF_LEN - buf_size - 1); + buf_size = strlen(TLS_last_native_error.msg_buff); + + strncat(TLS_last_native_error.msg_buff, error_string, + TLS_MSG_BUF_LEN - buf_size - 1); + + *pError = TLS_last_native_error.native_error; + *ppMessage = TLS_last_native_error.msg_buff; +} + +static umf_result_t cu_memory_provider_get_min_page_size(void *provider, + void *ptr, + size_t *pageSize) { + (void)ptr; + + cu_memory_provider_t *cu_provider = (cu_memory_provider_t *)provider; + + CUmemAllocationProp allocProps = {0}; + allocProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + allocProps.type = CU_MEM_ALLOCATION_TYPE_PINNED; + allocProps.location.id = cu_provider->device; + + CUresult cu_result = g_cu_ops.cuMemGetAllocationGranularity( + pageSize, &allocProps, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + + return cu2umf_result(cu_result); +} + +static umf_result_t +cu_memory_provider_get_recommended_page_size(void *provider, size_t size, + size_t *pageSize) { + (void)size; + + cu_memory_provider_t *cu_provider = (cu_memory_provider_t *)provider; + + CUmemAllocationProp allocProps = {0}; + allocProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + allocProps.type = CU_MEM_ALLOCATION_TYPE_PINNED; + allocProps.location.id = cu_provider->device; + + CUresult cu_result = g_cu_ops.cuMemGetAllocationGranularity( + pageSize, &allocProps, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED); + + return cu2umf_result(cu_result); +} + +const char *cu_memory_provider_get_name(void *provider) { + (void)provider; + return "CUDA"; +} + +static struct umf_memory_provider_ops_t UMF_CUDA_MEMORY_PROVIDER_OPS = { + .version = UMF_VERSION_CURRENT, + .initialize = cu_memory_provider_initialize, + .finalize = 
cu_memory_provider_finalize, + .alloc = cu_memory_provider_alloc, + .free = cu_memory_provider_free, + .get_last_native_error = cu_memory_provider_get_last_native_error, + .get_recommended_page_size = cu_memory_provider_get_recommended_page_size, + .get_min_page_size = cu_memory_provider_get_min_page_size, + .get_name = cu_memory_provider_get_name, + // TODO + /* + .ext.purge_lazy = cu_memory_provider_purge_lazy, + .ext.purge_force = cu_memory_provider_purge_force, + .ext.allocation_merge = cu_memory_provider_allocation_merge, + .ext.allocation_split = cu_memory_provider_allocation_split, + .ipc.get_ipc_handle_size = cu_memory_provider_get_ipc_handle_size, + .ipc.get_ipc_handle = cu_memory_provider_get_ipc_handle, + .ipc.put_ipc_handle = cu_memory_provider_put_ipc_handle, + .ipc.open_ipc_handle = cu_memory_provider_open_ipc_handle, + .ipc.close_ipc_handle = cu_memory_provider_close_ipc_handle, + */ +}; + +umf_memory_provider_ops_t *umfCUDAMemoryProviderOps(void) { + return &UMF_CUDA_MEMORY_PROVIDER_OPS; +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index be6d9ad7d5..87dccb3cd5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -23,6 +23,11 @@ enable_testing() set(UMF_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +if(UMF_BUILD_GPU_TESTS AND UMF_BUILD_CUDA_PROVIDER) + find_package(CUDA REQUIRED cuda) + # TODO do the same for ze_loader +endif() + function(build_umf_test) # Parameters: * NAME - a name of the test * SRCS - source files * LIBS - # libraries to be linked with @@ -258,6 +263,28 @@ if(UMF_BUILD_GPU_TESTS AND UMF_BUILD_LEVEL_ZERO_PROVIDER) PRIVATE ${LEVEL_ZERO_INCLUDE_DIRS}) endif() +if(UMF_BUILD_GPU_TESTS AND UMF_BUILD_CUDA_PROVIDER) + # we have two test binaries here that use the same sources, but differ in + # the way they are linked to the CUDA (statically or at runtime using + # dlopen) + add_umf_test( + NAME provider_cuda + SRCS providers/provider_cuda.cpp providers/cuda_helpers.cpp + LIBS ${UMF_UTILS_FOR_TEST} cuda) + 
target_include_directories(umf_test-provider_cuda + PRIVATE ${CUDA_INCLUDE_DIRS}) + target_link_directories(umf_test-provider_cuda PRIVATE ${CUDA_LIBRARY_DIRS}) + + add_umf_test( + NAME provider_cuda_dlopen + SRCS providers/provider_cuda.cpp providers/cuda_helpers.cpp + LIBS ${UMF_UTILS_FOR_TEST}) + target_compile_definitions(umf_test-provider_cuda_dlopen + PUBLIC USE_DLOPEN=1) + target_include_directories(umf_test-provider_cuda_dlopen + PRIVATE ${CUDA_INCLUDE_DIRS}) +endif() + if(UMF_BUILD_SHARED_LIBRARY) # if build as shared library, ba symbols won't be visible in tests set(BA_SOURCES_FOR_TEST ${BA_SOURCES}) @@ -367,6 +394,9 @@ if(LINUX) common/ipc_os_prov_common.c) add_umf_ipc_test(TEST ipc_devdax_prov) endif() + + # TODO add IPC tests for CUDA + if(UMF_BUILD_GPU_TESTS AND UMF_BUILD_LEVEL_ZERO_PROVIDER) build_umf_test( NAME @@ -437,15 +467,28 @@ if(LINUX if(UMF_BUILD_GPU_EXAMPLES AND UMF_BUILD_LIBUMF_POOL_DISJOINT AND UMF_BUILD_LEVEL_ZERO_PROVIDER) - set(EXAMPLES ${EXAMPLES} gpu_shared_memory) + set(EXAMPLES ${EXAMPLES} level_zero_shared_memory) else() message( STATUS - "GPU shared memory example requires UMF_BUILD_GPU_EXAMPLES, " + "GPU level zero shared memory example requires UMF_BUILD_GPU_EXAMPLES, " "UMF_BUILD_LEVEL_ZERO_PROVIDER and UMF_BUILD_LIBUMF_POOL_DISJOINT " "to be turned ON - skipping") endif() + if(UMF_BUILD_GPU_EXAMPLES + AND UMF_BUILD_LIBUMF_POOL_DISJOINT + AND UMF_BUILD_CUDA_PROVIDER) + set(EXAMPLES ${EXAMPLES} cuda_shared_memory) + else() + message( + STATUS + "GPU CUDA shared memory example requires UMF_BUILD_GPU_EXAMPLES, " + "UMF_BUILD_CUDA_PROVIDER and UMF_BUILD_LIBUMF_POOL_DISJOINT " + "to be turned ON - skipping") + endif() + + # TODO add IPC examples for CUDA if(UMF_BUILD_GPU_EXAMPLES AND UMF_BUILD_LIBUMF_POOL_DISJOINT AND UMF_BUILD_LEVEL_ZERO_PROVIDER) diff --git a/test/providers/cuda_helpers.cpp b/test/providers/cuda_helpers.cpp new file mode 100644 index 0000000000..bd83f789b1 --- /dev/null +++ b/test/providers/cuda_helpers.cpp @@ 
-0,0 +1,288 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + */ + +#include +#include +#include + +#include "cuda_helpers.h" +#include "utils_concurrency.h" +#include "utils_load_library.h" + +struct libcu_ops { + CUresult (*cuInit)(unsigned int flags); + CUresult (*cuCtxCreate)(CUcontext *pctx, unsigned int flags, CUdevice dev); + CUresult (*cuCtxDestroy)(CUcontext ctx); + CUresult (*cuDeviceGet)(CUdevice *device, int ordinal); + CUresult (*cuMemAlloc)(CUdeviceptr *dptr, size_t size); + CUresult (*cuMemFree)(CUdeviceptr dptr); + CUresult (*cuMemAllocHost)(void **pp, size_t size); + CUresult (*cuMemAllocManaged)(CUdeviceptr *dptr, size_t bytesize, + unsigned int flags); + CUresult (*cuMemFreeHost)(void *p); + CUresult (*cuMemsetD32)(CUdeviceptr dstDevice, unsigned int pattern, + size_t size); + CUresult (*cuMemcpyDtoH)(void *dstHost, CUdeviceptr srcDevice, size_t size); + CUresult (*cuPointerGetAttributes)(unsigned int numAttributes, + CUpointer_attribute *attributes, + void **data, CUdeviceptr ptr); +} libcu_ops; + +#if USE_DLOPEN +struct DlHandleCloser { + void operator()(void *dlHandle) { + if (dlHandle) { + util_close_library(dlHandle); + } + } +}; + +std::unique_ptr cuDlHandle = nullptr; +int InitCUDAOps() { +#ifdef _WIN32 + const char *lib_name = "cudart.dll"; +#else + const char *lib_name = "libcuda.so"; +#endif + // CUDA symbols + // NOTE that we use UMF_UTIL_OPEN_LIBRARY_GLOBAL which add all loaded + // symbols to the global symbol table. 
+ cuDlHandle = std::unique_ptr( + util_open_library(lib_name, UMF_UTIL_OPEN_LIBRARY_GLOBAL)); + + // NOTE: some symbols defined in the lib have _vX postfixes - this is + // important to load the proper version of functions + *(void **)&libcu_ops.cuInit = + util_get_symbol_addr(cuDlHandle.get(), "cuInit", lib_name); + if (libcu_ops.cuInit == nullptr) { + fprintf(stderr, "cuInit symbol not found in %s\n", lib_name); + return -1; + } + *(void **)&libcu_ops.cuCtxCreate = + util_get_symbol_addr(cuDlHandle.get(), "cuCtxCreate_v2", lib_name); + if (libcu_ops.cuCtxCreate == nullptr) { + fprintf(stderr, "cuCtxCreate_v2 symbol not found in %s\n", lib_name); + return -1; + } + *(void **)&libcu_ops.cuCtxDestroy = + util_get_symbol_addr(cuDlHandle.get(), "cuCtxDestroy_v2", lib_name); + if (libcu_ops.cuCtxDestroy == nullptr) { + fprintf(stderr, "cuCtxDestroy symbol not found in %s\n", lib_name); + return -1; + } + *(void **)&libcu_ops.cuDeviceGet = + util_get_symbol_addr(cuDlHandle.get(), "cuDeviceGet", lib_name); + if (libcu_ops.cuDeviceGet == nullptr) { + fprintf(stderr, "cuDeviceGet symbol not found in %s\n", lib_name); + return -1; + } + *(void **)&libcu_ops.cuMemAlloc = + util_get_symbol_addr(cuDlHandle.get(), "cuMemAlloc_v2", lib_name); + if (libcu_ops.cuMemAlloc == nullptr) { + fprintf(stderr, "cuMemAlloc_v2 symbol not found in %s\n", lib_name); + return -1; + } + *(void **)&libcu_ops.cuMemFree = + util_get_symbol_addr(cuDlHandle.get(), "cuMemFree_v2", lib_name); + if (libcu_ops.cuMemFree == nullptr) { + fprintf(stderr, "cuMemFree_v2 symbol not found in %s\n", lib_name); + return -1; + } + *(void **)&libcu_ops.cuMemAllocHost = + util_get_symbol_addr(cuDlHandle.get(), "cuMemAllocHost_v2", lib_name); + if (libcu_ops.cuMemAllocHost == nullptr) { + fprintf(stderr, "cuMemAllocHost_v2 symbol not found in %s\n", lib_name); + return -1; + } + *(void **)&libcu_ops.cuMemAllocManaged = + util_get_symbol_addr(cuDlHandle.get(), "cuMemAllocManaged", lib_name); + if 
(libcu_ops.cuMemAllocManaged == nullptr) { + fprintf(stderr, "cuMemAllocManaged symbol not found in %s\n", lib_name); + return -1; + } + *(void **)&libcu_ops.cuMemFreeHost = + util_get_symbol_addr(cuDlHandle.get(), "cuMemFreeHost", lib_name); + if (libcu_ops.cuMemFreeHost == nullptr) { + fprintf(stderr, "cuMemFreeHost symbol not found in %s\n", lib_name); + return -1; + } + *(void **)&libcu_ops.cuMemsetD32 = + util_get_symbol_addr(cuDlHandle.get(), "cuMemsetD32_v2", lib_name); + if (libcu_ops.cuMemsetD32 == nullptr) { + fprintf(stderr, "cuMemsetD32_v2 symbol not found in %s\n", lib_name); + return -1; + } + *(void **)&libcu_ops.cuMemcpyDtoH = + util_get_symbol_addr(cuDlHandle.get(), "cuMemcpyDtoH_v2", lib_name); + if (libcu_ops.cuMemcpyDtoH == nullptr) { + fprintf(stderr, "cuMemcpyDtoH_v2 symbol not found in %s\n", lib_name); + return -1; + } + *(void **)&libcu_ops.cuPointerGetAttributes = util_get_symbol_addr( + cuDlHandle.get(), "cuPointerGetAttributes", lib_name); + if (libcu_ops.cuPointerGetAttributes == nullptr) { + fprintf(stderr, "cuPointerGetAttributes symbol not found in %s\n", + lib_name); + return -1; + } + + return 0; +} + +#else // USE_DLOPEN +int InitCUDAOps() { + // CUDA is linked statically but we prepare ops structure to + // make test code consistent + libcu_ops.cuInit = cuInit; + libcu_ops.cuCtxCreate = cuCtxCreate; + libcu_ops.cuCtxDestroy = cuCtxDestroy; + libcu_ops.cuDeviceGet = cuDeviceGet; + libcu_ops.cuMemAlloc = cuMemAlloc; + libcu_ops.cuMemAllocHost = cuMemAllocHost; + libcu_ops.cuMemAllocManaged = cuMemAllocManaged; + libcu_ops.cuMemFree = cuMemFree; + libcu_ops.cuMemFreeHost = cuMemFreeHost; + libcu_ops.cuMemsetD32 = cuMemsetD32; + libcu_ops.cuMemcpyDtoH = cuMemcpyDtoH; + libcu_ops.cuPointerGetAttributes = cuPointerGetAttributes; + + return 0; +} +#endif // USE_DLOPEN + +static int init_cuda_lib(void) { + CUresult result = libcu_ops.cuInit(0); + if (result != CUDA_SUCCESS) { + return -1; + } + return 0; +} + +int cuda_fill(CUcontext 
context, CUdevice device, void *ptr, size_t size, + const void *pattern, size_t pattern_size) { + + (void)context; + (void)device; + (void)pattern_size; + + // TODO support patterns > sizeof(unsigned int) + assert(pattern_size <= sizeof(unsigned int)); + + int ret = 0; + CUresult res = + libcu_ops.cuMemsetD32((CUdeviceptr)ptr, *(unsigned int *)pattern, + size / sizeof(unsigned int)); + if (res != CUDA_SUCCESS) { + fprintf(stderr, "cuMemsetD32() failed!\n"); + return -1; + } + + return ret; +} + +int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr, + size_t size) { + (void)context; + (void)device; + + int ret = 0; + CUresult res = libcu_ops.cuMemcpyDtoH(dst_ptr, (CUdeviceptr)src_ptr, size); + if (res != CUDA_SUCCESS) { + fprintf(stderr, "cuMemcpyDtoH() failed!\n"); + return -1; + } + + return ret; +} + +umf_usm_memory_type_t get_mem_type(CUcontext context, void *ptr) { + + (void)context; + + unsigned int managed; + unsigned int type; + void *attrib_vals[2] = {&managed, &type}; + CUpointer_attribute attribs[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, + CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; + + CUresult res = libcu_ops.cuPointerGetAttributes(2, attribs, attrib_vals, + (CUdeviceptr)ptr); + if (res != CUDA_SUCCESS) { + fprintf(stderr, "cuPointerGetAttributes() failed!\n"); + return UMF_MEMORY_TYPE_UNKNOWN; + } + + if (type == CU_MEMORYTYPE_DEVICE && managed == 0) { + return UMF_MEMORY_TYPE_DEVICE; + } else if (type == CU_MEMORYTYPE_DEVICE && managed == 1) { + return UMF_MEMORY_TYPE_SHARED; + } else if (type == CU_MEMORYTYPE_HOST) { + return UMF_MEMORY_TYPE_HOST; + } + + return UMF_MEMORY_TYPE_UNKNOWN; +} + +UTIL_ONCE_FLAG cuda_init_flag; +int InitResult; +void init_cuda_once() { + InitResult = InitCUDAOps(); + if (InitResult != 0) { + return; + } + InitResult = init_cuda_lib(); +} + +int init_cuda() { + util_init_once(&cuda_init_flag, init_cuda_once); + + return InitResult; +} + +cuda_memory_provider_params_t 
+create_cuda_prov_params(umf_usm_memory_type_t memory_type) { + cuda_memory_provider_params_t params = {NULL, 0, UMF_MEMORY_TYPE_UNKNOWN}; + int ret = -1; + + ret = init_cuda(); + if (ret != 0) { + // Return empty params. Test will be skipped. + return params; + } + + // Get the first CUDA device + CUdevice cuDevice = -1; + CUresult res = libcu_ops.cuDeviceGet(&cuDevice, 0); + if (res != CUDA_SUCCESS || cuDevice < 0) { + // Return empty params. Test will be skipped. + return params; + } + + // Create a CUDA context + CUcontext cuContext = nullptr; + res = libcu_ops.cuCtxCreate(&cuContext, 0, cuDevice); + if (res != CUDA_SUCCESS || cuContext == nullptr) { + // Return empty params. Test will be skipped. + return params; + } + + params.cuda_context_handle = cuContext; + params.cuda_device_handle = cuDevice; + params.memory_type = memory_type; + + return params; +} + +int destroy_context(CUcontext context) { + CUresult res = libcu_ops.cuCtxDestroy(context); + if (res != CUDA_SUCCESS) { + fprintf(stderr, "cuCtxDestroy() failed!\n"); + return -1; + } + + return 0; +} diff --git a/test/providers/cuda_helpers.h b/test/providers/cuda_helpers.h new file mode 100644 index 0000000000..3227fc9c59 --- /dev/null +++ b/test/providers/cuda_helpers.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + */ + +#ifndef TEST_COMMON_CUDA_HELPERS_HPP +#define TEST_COMMON_CUDA_HELPERS_HPP + +#include <umf/providers/provider_cuda.h> + +#include "cuda.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int destroy_context(CUcontext context); + +int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size, + const void *pattern, size_t pattern_size); + +int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr, + size_t size); + +umf_usm_memory_type_t get_mem_type(CUcontext context, void *ptr); + +cuda_memory_provider_params_t +create_cuda_prov_params(umf_usm_memory_type_t memory_type); + +#ifdef __cplusplus +} +#endif + +#endif // TEST_COMMON_CUDA_HELPERS_HPP diff --git a/test/providers/provider_cuda.cpp b/test/providers/provider_cuda.cpp new file mode 100644 index 0000000000..f563d45c8a --- /dev/null +++ b/test/providers/provider_cuda.cpp @@ -0,0 +1,189 @@ +// Copyright (C) 2024 Intel Corporation +// Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifdef _WIN32 +//workaround for std::numeric_limits on windows +#define NOMINMAX +#endif + +#include + +#include + +#include "cuda_helpers.h" +#include "ipcFixtures.hpp" +#include "pool.hpp" +#include "utils_load_library.h" + +using umf_test::test; +using namespace umf_test; + +class CUDAMemoryAccessor : public MemoryAccessor { + public: + void init(CUcontext hContext, CUdevice hDevice) { + hDevice_ = hDevice; + hContext_ = hContext; + } + + void fill(void *ptr, size_t size, const void *pattern, + size_t pattern_size) { + ASSERT_NE(hContext_, nullptr); + ASSERT_GE(hDevice_, -1); + ASSERT_NE(ptr, nullptr); + + int ret = + cuda_fill(hContext_, hDevice_, ptr, size, pattern, pattern_size); + ASSERT_EQ(ret, 0); + } + + void copy(void *dst_ptr, void *src_ptr, size_t size) { + ASSERT_NE(hContext_, nullptr); + ASSERT_GE(hDevice_, -1); + ASSERT_NE(dst_ptr, nullptr); + ASSERT_NE(src_ptr, nullptr); + + int ret = cuda_copy(hContext_, hDevice_, dst_ptr, src_ptr, size); + ASSERT_EQ(ret, 0); + } + + private: + CUdevice hDevice_; + CUcontext hContext_; +}; + +using CUDAProviderTestParams = + std::tuple; + +struct umfCUDAProviderTest + : umf_test::test, + ::testing::WithParamInterface { + + void SetUp() override { + test::SetUp(); + + auto [memory_type, accessor] = this->GetParam(); + params = create_cuda_prov_params(memory_type); + memAccessor = accessor; + if (memory_type == UMF_MEMORY_TYPE_DEVICE) { + ((CUDAMemoryAccessor *)memAccessor) + ->init((CUcontext)params.cuda_context_handle, + params.cuda_device_handle); + } + } + + void TearDown() override { + if (params.cuda_context_handle) { + int ret = destroy_context((CUcontext)params.cuda_context_handle); + ASSERT_EQ(ret, 0); + } + test::TearDown(); + } + + cuda_memory_provider_params_t params; + MemoryAccessor *memAccessor = nullptr; +}; + +TEST_P(umfCUDAProviderTest, basic) { + const size_t size = 1024 * 8; + const uint32_t pattern = 0xAB; + + // create CUDA provider 
+ umf_memory_provider_handle_t provider = nullptr; + umf_result_t umf_result = + umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), &params, &provider); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + ASSERT_NE(provider, nullptr); + + size_t pageSize = 0; + umf_result = umfMemoryProviderGetMinPageSize(provider, 0, &pageSize); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + ASSERT_GE(pageSize, 0); + + umf_result = + umfMemoryProviderGetRecommendedPageSize(provider, 0, &pageSize); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + ASSERT_GE(pageSize, 0); + + void *ptr = nullptr; + umf_result = umfMemoryProviderAlloc(provider, size, 128, &ptr); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + ASSERT_NE(ptr, nullptr); + + // use the allocated memory - fill it with a 0xAB pattern + memAccessor->fill(ptr, size, &pattern, sizeof(pattern)); + + umf_usm_memory_type_t memoryTypeActual = + get_mem_type((CUcontext)params.cuda_context_handle, ptr); + ASSERT_EQ(memoryTypeActual, params.memory_type); + + // check if the pattern was successfully applied + uint32_t *hostMemory = (uint32_t *)calloc(1, size); + memAccessor->copy(hostMemory, ptr, size); + for (size_t i = 0; i < size / sizeof(int); i++) { + ASSERT_EQ(hostMemory[i], pattern); + } + free(hostMemory); + + umf_result = umfMemoryProviderFree(provider, ptr, size); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + + umfMemoryProviderDestroy(provider); +} + +TEST_P(umfCUDAProviderTest, allocInvalidSize) { + // create CUDA provider + umf_memory_provider_handle_t provider = nullptr; + umf_result_t umf_result = + umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), &params, &provider); + ASSERT_EQ(umf_result, UMF_RESULT_SUCCESS); + ASSERT_NE(provider, nullptr); + + // try to alloc (int)-1 + void *ptr = nullptr; + umf_result = umfMemoryProviderAlloc(provider, -1, 0, &ptr); + ASSERT_EQ(umf_result, UMF_RESULT_ERROR_OUT_OF_HOST_MEMORY); + + // in case of size == 0 we should got INVALID_ARGUMENT error + // NOTE: this is invalid only for the DEVICE or
SHARED allocations + if (params.memory_type != UMF_MEMORY_TYPE_HOST) { + umf_result = umfMemoryProviderAlloc(provider, 0, 0, &ptr); + ASSERT_EQ(umf_result, UMF_RESULT_ERROR_INVALID_ARGUMENT); + } + + // destroy context and try to alloc some memory + destroy_context((CUcontext)params.cuda_context_handle); + params.cuda_context_handle = 0; + umf_result = umfMemoryProviderAlloc(provider, 128, 0, &ptr); + ASSERT_EQ(umf_result, UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC); + + const char *message; + int32_t error; + umfMemoryProviderGetLastNativeError(provider, &message, &error); + ASSERT_EQ(error, CUDA_ERROR_INVALID_CONTEXT); + const char *expected_message = + "CUDA_ERROR_INVALID_CONTEXT - invalid device context"; + ASSERT_EQ(strncmp(message, expected_message, strlen(expected_message)), 0); +} + +// TODO add tests that mixes CUDA Memory Provider and Disjoint Pool + +CUDAMemoryAccessor cuAccessor; +HostMemoryAccessor hostAccessor; + +INSTANTIATE_TEST_SUITE_P( + umfCUDAProviderTestSuite, umfCUDAProviderTest, + ::testing::Values( + CUDAProviderTestParams{UMF_MEMORY_TYPE_DEVICE, &cuAccessor}, + CUDAProviderTestParams{UMF_MEMORY_TYPE_SHARED, &hostAccessor}, + CUDAProviderTestParams{UMF_MEMORY_TYPE_HOST, &hostAccessor})); + +// TODO: add IPC API +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(umfIpcTest); +/* +INSTANTIATE_TEST_SUITE_P(umfCUDAProviderTestSuite, umfIpcTest, + ::testing::Values(ipcTestParams{ + umfProxyPoolOps(), nullptr, + umfCUDAMemoryProviderOps(), + &cuParams_device_memory, &l0Accessor})); +*/