feat: add reduce kernels #3136

Merged · 32 commits · Jun 25, 2024
Changes from 29 commits
Commits
860868d
feat: add tree reduction implementation of argmin and argmax
ManasviGoyal May 29, 2024
3cdbd7e
feat: add awkward_ListOffsetArray_reduce_local_outoffsets_64 kernel
ManasviGoyal May 30, 2024
c1a846b
test: integration tests for cuda
ManasviGoyal May 30, 2024
7be3f98
test: some more integration tests for cuda
ManasviGoyal May 30, 2024
98fb7ed
feat: add awkward_reduce_count_64 kernel
ManasviGoyal Jun 6, 2024
0ed94ef
fix: indexing and indentation
ManasviGoyal Jun 6, 2024
02c03bc
feat: add awkward_reduce_countnonzero kernel
ManasviGoyal Jun 6, 2024
34fc82b
feat: add reduce sum, min and max kernels
ManasviGoyal Jun 6, 2024
4e00f07
feat: add reduce prod and sum_int_bool
ManasviGoyal Jun 6, 2024
b28a605
feat: add sum_bool and prod_bool kernels
ManasviGoyal Jun 6, 2024
9e7abc7
fix: use cpt.assert_allclose
ManasviGoyal Jun 6, 2024
458165c
test: reducer integration tests
ManasviGoyal Jun 6, 2024
c75cb79
fix: type conversion
ManasviGoyal Jun 6, 2024
427670c
fix: use atomic to avoid race conditions
ManasviGoyal Jun 7, 2024
127e035
fix: remove unnecessary variable
ManasviGoyal Jun 10, 2024
8dee2ae
fix: minor fixes
ManasviGoyal Jun 10, 2024
b957bee
Merge branch 'main' into ManasviGoyal/add-reducer-kernels
ManasviGoyal Jun 11, 2024
896770f
fix: all reducer for atomics
ManasviGoyal Jun 12, 2024
f3d1cdc
fix: missing template
ManasviGoyal Jun 12, 2024
ef47ead
fix: remove complex
ManasviGoyal Jun 12, 2024
c881f1d
fix: atomicMin() for float 32 and indentation
ManasviGoyal Jun 12, 2024
38d30b9
fix: pass correct dtype of identity
ManasviGoyal Jun 12, 2024
51b0e15
fix: remove combinations test
ManasviGoyal Jun 12, 2024
7e7fdc4
fix: manage resources and disable failing test
ianna Jun 13, 2024
1148b95
fix: uncomment fixed test for slicing
ManasviGoyal Jun 18, 2024
8e926ab
fix: correctly interpret typetracer array for cuda backend
ManasviGoyal Jun 18, 2024
38d314d
fix: tests-spec error for bool
ManasviGoyal Jun 18, 2024
15068b6
fix: check for the backend of head
ManasviGoyal Jun 18, 2024
d864481
Merge branch 'main' into ManasviGoyal/add-reducer-kernels
ianna Jun 19, 2024
b2c0a89
Merge branch 'main' into ManasviGoyal/add-reducer-kernels
ianna Jun 21, 2024
8921b82
Update dev/generate-tests.py
ianna Jun 24, 2024
c9bff0f
Merge branch 'main' into ManasviGoyal/add-reducer-kernels
jpivarski Jun 25, 2024
4 changes: 4 additions & 0 deletions dev/generate-kernel-signatures.py
@@ -108,6 +108,7 @@
"awkward_ListOffsetArray_drop_none_indexes",
"awkward_ListOffsetArray_reduce_local_nextparents_64",
"awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64",
"awkward_ListOffsetArray_reduce_local_outoffsets_64",
"awkward_UnionArray_flatten_length",
"awkward_UnionArray_flatten_combine",
"awkward_UnionArray_nestedfill_tags_index",
@@ -123,6 +124,7 @@
"awkward_reduce_sum_int32_bool_64",
"awkward_reduce_sum_int64_bool_64",
"awkward_reduce_sum_bool",
"awkward_reduce_prod",
"awkward_reduce_prod_bool",
"awkward_reduce_countnonzero",
"awkward_sorting_ranges",
@@ -381,6 +383,8 @@ def kernel_signatures_cuda_py(specification):
from awkward._connect.cuda import fetch_specialization
from awkward._connect.cuda import import_cupy

import math

cupy = import_cupy("Awkward Arrays with CUDA")
"""
)
10 changes: 7 additions & 3 deletions dev/generate-tests.py
@@ -424,7 +424,7 @@ def genspectests(specdict):

"""
)
f.write("import pytest\nimport kernels\n\n")
f.write("import pytest\nimport numpy as np\nimport kernels\n\n")
num = 1
if spec.tests == []:
f.write(
@@ -893,6 +893,7 @@ def gencpuunittests(specdict):
"awkward_ListOffsetArray_drop_none_indexes",
"awkward_ListOffsetArray_reduce_local_nextparents_64",
"awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64",
"awkward_ListOffsetArray_reduce_local_outoffsets_64",
"awkward_UnionArray_flatten_length",
"awkward_UnionArray_flatten_combine",
"awkward_UnionArray_nestedfill_tags_index",
@@ -908,6 +909,7 @@ def gencpuunittests(specdict):
"awkward_reduce_sum_int32_bool_64",
"awkward_reduce_sum_int64_bool_64",
"awkward_reduce_sum_bool",
"awkward_reduce_prod",
"awkward_reduce_prod_bool",
"awkward_reduce_countnonzero",
"awkward_sorting_ranges",
@@ -958,6 +960,7 @@ def gencudakerneltests(specdict):

f.write(
"import cupy\n"
"import cupy.testing as cpt\n"
"import pytest\n\n"
"import awkward as ak\n"
"import awkward._connect.cuda as ak_cu\n"
@@ -1027,7 +1030,7 @@ def gencudakerneltests(specdict):
if isinstance(val, list):
f.write(
" " * 4
+ f"assert cupy.array_equal({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
+ f"cpt.assert_allclose({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
)
else:
f.write(" " * 4 + f"assert {arg} == pytest_{arg}\n")
@@ -1087,6 +1090,7 @@ def gencudaunittests(specdict):
f.write(
"import re\n"
"import cupy\n"
"import cupy.testing as cpt\n"
"import pytest\n\n"
"import awkward as ak\n"
"import awkward._connect.cuda as ak_cu\n"
@@ -1223,7 +1227,7 @@ def gencudaunittests(specdict):
if isinstance(val, list):
f.write(
" " * 4
+ f"assert cupy.array_equal({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
+ f"cpt.assert_allclose({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
)
else:
f.write(" " * 4 + f"assert {arg} == pytest_{arg}\n")
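The switch above from `cupy.array_equal` to `cpt.assert_allclose` makes the generated kernel tests tolerant of tiny floating-point differences, which can appear when atomic reductions accumulate in a nondeterministic order. A minimal sketch of the kind of assertion the generator now emits; the array values here are invented for illustration and are not part of the PR:

import cupy
import cupy.testing as cpt

# Invented kernel output and expected values.
toptr = cupy.array([1.0, 5.0000001, 9.0])
pytest_toptr = [1.0, 5.0, 9.0]

# cupy.array_equal would reject the last-bit difference in the second entry;
# assert_allclose passes within its default relative tolerance.
cpt.assert_allclose(toptr[: len(pytest_toptr)], cupy.array(pytest_toptr))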
1,100 changes: 946 additions & 154 deletions kernel-test-data.json

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/awkward/_connect/cuda/__init__.py
@@ -108,6 +108,7 @@ def fetch_template_specializations(kernel_dict):
"awkward_ListArray_rpad_axis1",
"awkward_ListOffsetArray_drop_none_indexes",
"awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64",
"awkward_ListOffsetArray_reduce_local_outoffsets_64",
"awkward_UnionArray_regular_index",
"awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64",
"awkward_ListOffsetArray_rpad_axis1",
@@ -119,6 +120,7 @@ def fetch_template_specializations(kernel_dict):
"awkward_reduce_sum_int32_bool_64",
"awkward_reduce_sum_int64_bool_64",
"awkward_reduce_sum_bool",
"awkward_reduce_prod",
"awkward_reduce_prod_bool",
"awkward_reduce_argmax",
"awkward_reduce_argmin",
100 changes: 100 additions & 0 deletions src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
@@ -0,0 +1,100 @@
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE

// BEGIN PYTHON
// def f(grid, block, args):
// (outoffsets, parents, lenparents, outlength, invocation_index, err_code) = args
// if block[0] > 0:
// segment = math.floor((outlength + block[0] - 1) / block[0])
// grid_size = math.floor((lenparents + block[0] - 1) / block[0])
// else:
// grid_size = 1
// temp = cupy.zeros(lenparents, dtype=cupy.int64)
// scan_in_array = cupy.zeros(outlength, dtype=cupy.uint64)
// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code))
// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code))
// scan_in_array = cupy.cumsum(scan_in_array)
// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code))
// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", {dtype_specializations}] = None
// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", {dtype_specializations}] = None
// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", {dtype_specializations}] = None
// END PYTHON

template <typename T, typename C>
__global__ void
awkward_ListOffsetArray_reduce_local_outoffsets_64_a(
T* outoffsets,
const C* parents,
int64_t lenparents,
int64_t outlength,
uint64_t* scan_in_array,
int64_t* temp,
uint64_t invocation_index,
uint64_t* err_code) {
if (err_code[0] == NO_ERROR) {
int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;

if (thread_id < outlength) {
outoffsets[thread_id] = 0;
}
}
}

template <typename T, typename C>
__global__ void
awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
T* outoffsets,
const C* parents,
int64_t lenparents,
int64_t outlength,
uint64_t* scan_in_array,
int64_t* temp,
uint64_t invocation_index,
uint64_t* err_code) {
if (err_code[0] == NO_ERROR) {
int64_t idx = threadIdx.x;
int64_t thread_id = blockIdx.x * blockDim.x + idx;

if (thread_id < lenparents) {
temp[thread_id] = 1;
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = temp[thread_id - stride];
}
__syncthreads();
temp[thread_id] += val;
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicAdd(&scan_in_array[parent], temp[thread_id]);
}
}
}
}

template <typename T, typename C>
__global__ void
awkward_ListOffsetArray_reduce_local_outoffsets_64_c(
T* outoffsets,
const C* parents,
int64_t lenparents,
int64_t outlength,
uint64_t* scan_in_array,
int64_t* temp,
uint64_t invocation_index,
uint64_t* err_code) {
if (err_code[0] == NO_ERROR) {
int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
outoffsets[0] = 0;

if (thread_id < outlength) {
outoffsets[thread_id + 1] = (T)(scan_in_array[thread_id]);
}
}
}
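For orientation, here is a minimal host-side sketch (not the kernel itself) of what the three `awkward_ListOffsetArray_reduce_local_outoffsets_64` launches plus the intervening `cupy.cumsum` compute: the number of entries owned by each parent, converted into list offsets. The helper name and example data are invented, and it assumes `parents` is sorted so that equal values are contiguous, as the kernel does:

import cupy

def local_outoffsets_reference(parents, outlength):
    # _a zeroes outoffsets; _b counts the entries owned by each parent
    # (the kernel does this with a block-level scan plus atomicAdd).
    counts = cupy.bincount(parents, minlength=outlength)
    # The cupy.cumsum between launches turns per-parent counts into end positions.
    ends = cupy.cumsum(counts)
    # _c writes outoffsets[0] = 0 and outoffsets[i + 1] = ends[i].
    outoffsets = cupy.zeros(outlength + 1, dtype=cupy.int64)
    outoffsets[1:] = ends
    return outoffsets

parents = cupy.array([0, 0, 0, 1, 2, 2])        # three lists of sizes 3, 1, 2
print(local_outoffsets_reference(parents, 3))   # [0 3 4 6]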
65 changes: 58 additions & 7 deletions src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
@@ -3,10 +3,18 @@
// BEGIN PYTHON
// def f(grid, block, args):
// (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code))
// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code))
// if block[0] > 0:
// grid_size = math.floor((lenparents + block[0] - 1) / block[0])
// else:
// grid_size = 1
// atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
// temp = cupy.zeros(lenparents, dtype=toptr.dtype)
// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
// cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
// out["awkward_reduce_argmax_a", {dtype_specializations}] = None
// out["awkward_reduce_argmax_b", {dtype_specializations}] = None
// out["awkward_reduce_argmax_c", {dtype_specializations}] = None
// END PYTHON

template <typename T, typename C, typename U>
@@ -17,12 +25,15 @@ awkward_reduce_argmax_a(
const U* parents,
int64_t lenparents,
int64_t outlength,
uint64_t* atomic_toptr,
T* temp,
uint64_t invocation_index,
uint64_t* err_code) {
if (err_code[0] == NO_ERROR) {
int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;

if (thread_id < outlength) {
toptr[thread_id] = -1;
atomic_toptr[thread_id] = -1;
}
}
}
@@ -35,17 +46,57 @@ awkward_reduce_argmax_b(
const U* parents,
int64_t lenparents,
int64_t outlength,
uint64_t* atomic_toptr,
T* temp,
uint64_t invocation_index,
uint64_t* err_code) {
if (err_code[0] == NO_ERROR) {
int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
int64_t idx = threadIdx.x;
int64_t thread_id = blockIdx.x * blockDim.x + idx;

if (thread_id < lenparents) {
temp[thread_id] = thread_id;
}
__syncthreads();

for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
int64_t index = -1;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
index = temp[thread_id - stride];
}
if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] ||
(fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
temp[thread_id] = index;
}
__syncthreads();
}

if (thread_id < lenparents) {
int64_t parent = parents[thread_id];
if (toptr[parent] == -1 ||
(fromptr[thread_id] > (fromptr[toptr[parent]]))) {
toptr[parent] = thread_id; // we need the last parent filled, thread random order problem, find max arg at that index
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
atomicExch(&atomic_toptr[parent], temp[thread_id]);
}
}
}
}

template <typename T, typename C, typename U>
__global__ void
awkward_reduce_argmax_c(
T* toptr,
const C* fromptr,
const U* parents,
int64_t lenparents,
int64_t outlength,
uint64_t* atomic_toptr,
T* temp,
uint64_t invocation_index,
uint64_t* err_code) {
if (err_code[0] == NO_ERROR) {
int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;

if (thread_id < outlength) {
toptr[thread_id] = (T)(atomic_toptr[thread_id]);
}
}
}
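The rewritten argmax follows the same pattern as the other new reducers: `_a` fills the atomic buffer with a sentinel, `_b` performs a block-local tree reduction over `temp` and publishes the last candidate of each parent segment with `atomicExch`, and `_c` copies the buffer back into `toptr`. A plain reference emulation of the result this is intended to produce, with ties resolved toward the lower index; the function name and data are invented for illustration:

import numpy as np

def segmented_argmax_reference(fromptr, parents, outlength):
    toptr = np.full(outlength, -1, dtype=np.int64)   # _a: sentinel fill
    for i in range(len(parents)):                    # _b: per-parent maximum
        p = parents[i]
        if toptr[p] == -1 or fromptr[i] > fromptr[toptr[p]]:
            toptr[p] = i                             # strict > keeps the earliest maximum
    return toptr                                     # _c: copy back to toptr

fromptr = np.array([3.0, 7.0, 7.0, 2.0, 9.0])
parents = np.array([0, 0, 0, 1, 1])
print(segmented_argmax_reference(fromptr, parents, 2))   # [1 4]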