Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUDA vector_add sample project #2160

Merged
merged 48 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
6634ffb
add the CUDA vector addition sample
ericniebler Jul 4, 2024
2d9f364
Remove unnecessary sample helpers
pciolkosz Aug 1, 2024
e872ca2
Merge remote-tracking branch 'origin/main' into cudax-samples
ericniebler Aug 2, 2024
93a82ce
use a specific cuda architecture instead of `native`
ericniebler Aug 2, 2024
59ea51d
use `cuda::launch` instead of launching the kernel directly
ericniebler Aug 2, 2024
e62220a
use thrust's host_ and device_vector types in the cudax sample for now
ericniebler Aug 3, 2024
adb634d
use a temporary `launch_ex` fn that applies an arg transform
ericniebler Aug 3, 2024
29732bf
minor cleanup
ericniebler Aug 3, 2024
6f13b40
Merge remote-tracking branch 'origin/main' into cudax-samples
ericniebler Aug 7, 2024
dbd7a68
use `__launch_transform` in the `vector_add` sample
ericniebler Aug 7, 2024
9ed5532
mock up a cudax::vector and the in/out annotations
ericniebler Aug 7, 2024
52e6c7e
a working example with vector, in/out, and launch
ericniebler Aug 8, 2024
82db01d
insert a sync stream at the right place
ericniebler Aug 8, 2024
f587bc9
add missing include directory
ericniebler Aug 8, 2024
4732a80
i do not like cmake
ericniebler Aug 8, 2024
c73f856
add missing header
ericniebler Aug 8, 2024
ef0b399
add explicit device selection
ericniebler Aug 8, 2024
4d1ad50
try to fix msvc build break
ericniebler Aug 8, 2024
fad5e66
try again
ericniebler Aug 8, 2024
e817699
Merge remote-tracking branch 'origin/main' into cudax-samples
ericniebler Aug 8, 2024
2500b30
cmake is evil
ericniebler Aug 8, 2024
56247ca
once more with feeling
ericniebler Aug 8, 2024
388d57c
again
ericniebler Aug 8, 2024
1d74986
again
ericniebler Aug 8, 2024
791d13c
ah, enable language CXX
ericniebler Aug 8, 2024
1ff49c6
again
ericniebler Aug 9, 2024
fc27771
try c++ 20
ericniebler Aug 9, 2024
2fc597d
better?
ericniebler Aug 9, 2024
285333e
maybe this?
ericniebler Aug 9, 2024
8a200a2
will it ever end?
ericniebler Aug 9, 2024
600dde1
wassup?
ericniebler Aug 9, 2024
0f2494e
work around msvc non-conformance
ericniebler Aug 9, 2024
87e67f3
very close now i think
ericniebler Aug 9, 2024
b43b90b
use msvc with conforming preprocessor
ericniebler Aug 9, 2024
ab29482
cmake string strangeness
ericniebler Aug 9, 2024
9850ef5
here i go again
ericniebler Aug 9, 2024
da3120d
try c++20
ericniebler Aug 9, 2024
505545b
only require c++20 when using msvc
ericniebler Aug 9, 2024
ca9d544
Replace the mdspan concept emulation with libcu++ one
miscco Aug 9, 2024
6ffa2ae
Fix formatting
miscco Aug 9, 2024
ac8e6d8
Fix issues with concept emulation
miscco Aug 9, 2024
9a13c77
Try and work around issue with nvcc deduction failure
miscco Aug 9, 2024
e2e7354
Drop the whole macro
miscco Aug 9, 2024
4be1ee9
drop more concept emulation
miscco Aug 9, 2024
bdbd29e
Fix one more issue with `is_always_strided`
miscco Aug 9, 2024
457e0d9
Merge branch 'main' into pr/ericniebler/2160
miscco Aug 9, 2024
6656965
Merge remote-tracking branch 'origin/main' into cudax-samples
ericniebler Aug 9, 2024
f9580c8
Merge branch 'cudax-samples' of github.com:ericniebler/cccl into cuda…
ericniebler Aug 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions cudax/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ endif()

option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers compile." ON)
option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON)
option(cudax_ENABLE_SAMPLES "Build CUDA Experimental's samples." ON)

include(cmake/cudaxBuildCompilerTargets.cmake)
include(cmake/cudaxBuildTargetList.cmake)
Expand All @@ -41,3 +42,13 @@ if (cudax_ENABLE_TESTING)
enable_testing() # Must be in root directory
add_subdirectory(test)
endif()

if (cudax_ENABLE_SAMPLES)
include(ExternalProject)
ExternalProject_Add(cudax_samples
PREFIX samples
SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/samples"
BUILD_ALWAYS ON
INSTALL_COMMAND cmake -E echo "Skipping install step.")
add_dependencies(cudax.all cudax_samples)
endif()
19 changes: 15 additions & 4 deletions cudax/include/cuda/experimental/__detail/utility.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,31 @@ namespace cuda::experimental
{
namespace detail
{
struct __ignore
// This is a helper type that can be used to ignore function arguments.
struct [[maybe_unused]] __ignore
{
template <typename... Args>
_CCCL_HOST_DEVICE constexpr __ignore(Args&&...) noexcept
__ignore() = default;

template <typename _Arg>
_CCCL_HOST_DEVICE constexpr __ignore(_Arg&&) noexcept
{}
};

// Classes can inherit from this type to become immovable.
struct __immovable
{
__immovable() = default;
__immovable(__immovable&&) = delete;
__immovable& operator=(__immovable&&) = delete;
};
} // namespace detail

struct uninit_t
{
explicit uninit_t() = default;
};

inline constexpr uninit_t uninit{};
_CCCL_GLOBAL_CONSTANT uninit_t uninit{};
} // namespace cuda::experimental

#endif // __CUDAX_DETAIL_UTILITY_H
53 changes: 53 additions & 0 deletions cudax/samples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

project(CUDAX_SAMPLES CUDA)

# This example uses the CMake Package Manager (CPM) to simplify fetching CCCL from GitHub
# For more information, see https://github.com/cpm-cmake/CPM.cmake
include(cmake/CPM.cmake)

# We define these as variables so they can be overriden in CI to pull from a PR instead of CCCL `main`
# In your project, these variables are unncessary and you can just use the values directly
set(CCCL_REPOSITORY "nvidia/cccl" CACHE STRING "GitHub repository to fetch CCCL from")
set(CCCL_TAG "main" CACHE STRING "Git tag/branch to fetch from CCCL repository")

# This will automatically clone CCCL from GitHub and make the exported cmake targets available
CPMAddPackage(
NAME CCCL
GITHUB_REPOSITORY ${CCCL_REPOSITORY}
GIT_TAG ${CCCL_TAG}
)

# Default to building for the GPU on the current system
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 86)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem isn't the architecture value, it's that the way this test is set up, it requires running on a GPU runner, but is ending up on a CPU runner.

@alliepiper can help you get it sorted.

endif()

# Creates a cmake executable target for the main program
add_executable(vector_add vector_add/vector_add.cu)
set_property(TARGET vector_add PROPERTY CXX_STANDARD 17)
target_include_directories(vector_add PRIVATE ${CMAKE_SOURCE_DIR}/../include)

# "Links" the CCCL Cmake target to the `vector_add` executable. This configures everything needed to use
# CCCL headers, including setting up include paths, compiler flags, etc.
target_link_libraries(vector_add PRIVATE CCCL::CCCL)

# This is only relevant for internal testing and not needed by end users.
include(CTest)
enable_testing()
add_test(NAME vector_add COMMAND vector_add)
33 changes: 33 additions & 0 deletions cudax/samples/cmake/CPM.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
set(CPM_DOWNLOAD_VERSION 0.38.1)

if(CPM_SOURCE_CACHE)
set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
else()
set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

# Expand relative path. This is important if the provided path contains a tilde (~)
get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)

function(download_cpm)
message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
file(DOWNLOAD
https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
${CPM_DOWNLOAD_LOCATION}
)
endfunction()

if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
download_cpm()
else()
# resume download if it previously failed
file(READ ${CPM_DOWNLOAD_LOCATION} check)
if("${check}" STREQUAL "")
download_cpm()
endif()
unset(check)
endif()

include(${CPM_DOWNLOAD_LOCATION})
185 changes: 185 additions & 0 deletions cudax/samples/vector_add/vector.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDAX__CONTAINER_VECTOR
#define _CUDAX__CONTAINER_VECTOR

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <cuda/std/__type_traits/maybe_const.h>
#include <cuda/std/span>
#include <cuda/stream_ref>

#include <cuda/experimental/__detail/utility.cuh>

#if 1 //_CCCL_STD_VER >= 2017
namespace cuda::experimental
{
using ::cuda::std::span;
using ::thrust::device_vector;
using ::thrust::host_vector;

namespace detail
{
template <typename _Ty>
struct __in_box
{
const _Ty& __val;
};

template <typename _Ty>
struct __out_box
{
_Ty& __val;
};
} // namespace detail

template <typename _Ty>
class vector
{
public:
vector() = default;
explicit vector(size_t n)
: __h_(n)
{}

_Ty& operator[](size_t i) noexcept
{
__dirty_ = true;
return __h_[i];
}

const _Ty& operator[](size_t i) const noexcept
{
return __h_[i];
}

private:
enum class __param : unsigned
{
_in = 1,
_out = 2,
_inout = 3
};

_CCCL_NODISCARD_FRIEND _CCCL_HOST_DEVICE constexpr __param operator&(__param __a, __param __b) noexcept
{
return __param(unsigned(__a) & unsigned(__b));
}

void sync_host_to_device() const
{
if (__dirty_)
{
printf("sync_host_to_device\n");
__d_ = __h_;
__dirty_ = false;
}
}

void sync_device_to_host()
{
printf("sync_device_to_host\n");
__h_ = __d_;
}

template <__param _Param>
struct __action : detail::__immovable
{
static constexpr bool __mut = ((_Param & __param::_out) == __param::_out);
using __cv_vector = ::cuda::std::__maybe_const<!__mut, vector>;

explicit __action(stream_ref __str, __cv_vector& __v) noexcept
: __str_(__str)
, __v_(__v)
{
printf("action()\n");
if constexpr ((_Param & __param::_in) == __param::_in)
{
__v_.sync_host_to_device();
}
}

~__action()
{
printf("~action()\n");
if constexpr ((_Param & __param::_out) == __param::_out)
{
printf("about to synchronize the stream\n");
fflush(stdout);
__str_.wait(); // wait for the kernel to finish
ericniebler marked this conversation as resolved.
Show resolved Hide resolved
printf("done synchronizing the stream\n");
fflush(stdout);
__v_.sync_device_to_host();
}
}

using __as_kernel_arg = ::cuda::std::span<_Ty>;

operator ::cuda::std::span<_Ty>()
{
printf("to span\n");
return {__v_.__d_.data().get(), __v_.__d_.size()};
}

public:
stream_ref __str_;
__cv_vector& __v_;
};

_CCCL_NODISCARD_FRIEND __action<__param::_inout> __cudax_launch_transform(stream_ref __str, const vector& __v) noexcept
{
return __action<__param::_inout>{__str, __v};
}

_CCCL_NODISCARD_FRIEND __action<__param::_in>
__cudax_launch_transform(stream_ref __str, detail::__in_box<vector> __b) noexcept
{
return __action<__param::_in>{__str, __b.__val};
}

_CCCL_NODISCARD_FRIEND __action<__param::_out>
__cudax_launch_transform(stream_ref __str, detail::__out_box<vector> __b) noexcept
{
return __action<__param::_out>{__str, __b.__val};
}

host_vector<_Ty> __h_;
mutable device_vector<_Ty> __d_{};
mutable bool __dirty_ = true;
};

template <class _Ty>
detail::__in_box<_Ty> in(const _Ty& __v) noexcept
{
return {__v};
}

template <class _Ty>
detail::__out_box<_Ty> out(_Ty& __v) noexcept
{
return {__v};
}

} // namespace cuda::experimental

#endif
#endif
Loading
Loading