-
Notifications
You must be signed in to change notification settings - Fork 221
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add additional integration benchmarks covering previous regressions.
[skip tests]
- Loading branch information
Showing
3 changed files
with
122 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
module ByVal | ||
|
||
using CUDA, BenchmarkTools, Random | ||
|
||
const threads = 256 | ||
|
||
# simple add matrixes kernel | ||
function kernel_add_mat(n, x1, x2, y) | ||
i = (blockIdx().x-1) * blockDim().x + threadIdx().x | ||
if i <= n | ||
@inbounds y[i] = x1[i] + x2[i] | ||
end | ||
return | ||
end | ||
|
||
@inline get_inputs3(indx_y, a, b, c) = (a, b, c) | ||
@inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2) = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2) | ||
@inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) = indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3) | ||
|
||
# add arrays of matrixes kernel | ||
function kernel_add_mat_z_slices(n, vararg...) | ||
x1, x2, y = get_inputs3(blockIdx().y, vararg...) | ||
i = (blockIdx().x-1) * blockDim().x + threadIdx().x | ||
if i <= n | ||
@inbounds y[i] = x1[i] + x2[i] | ||
end | ||
return | ||
end | ||
|
||
function add_z_slices!(y, x1, x2) | ||
m1, n1 = size(x1[1]) #get size of first slice | ||
blocks = (m1 * n1 + threads - 1) ÷ threads | ||
# get length(x1) more blocks than needed to process 1 slice | ||
@cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...) | ||
end | ||
|
||
function add!(y, x1, x2) | ||
m1, n1 = size(x1) | ||
blocks = (m1 * n1 + threads - 1) ÷ threads | ||
@cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y) | ||
end | ||
|
||
function main() | ||
results = BenchmarkGroup() | ||
|
||
num_z_slices = 3 | ||
Random.seed!(1) | ||
|
||
#m, n = 7, 5 # tiny to measure overhead | ||
#m, n = 521, 111 | ||
#m, n = 1521, 1111 | ||
#m, n = 3001, 1511 # prime numbers to test memory access correctness | ||
m, n = 3072, 1536 # 256 multiplier | ||
#m, n = 6007, 3001 # prime numbers to test memory access correctness | ||
|
||
x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices] | ||
x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices] | ||
y1 = [similar(x1[1]) for i = 1:num_z_slices] | ||
|
||
# reference down to bones add on GPU | ||
results["reference"] = @benchmark CUDA.@sync add!($y1[1], $x1[1], $x2[1]) | ||
|
||
# adding arrays in an array | ||
for slices = 1:num_z_slices | ||
results["slices=$slices"] = @benchmark CUDA.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices]) | ||
end | ||
|
||
# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them | ||
CUDA.unsafe_free!.(x1) | ||
CUDA.unsafe_free!.(x2) | ||
CUDA.unsafe_free!.(y1) | ||
|
||
return results | ||
end | ||
|
||
end | ||
|
||
ByVal.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
module cudadevrt | ||
|
||
using CUDA, BenchmarkTools, Random | ||
|
||
const threads = 256 | ||
#simple add matrix and vector kernel | ||
function kernel_add_mat_vec(m, x1, x2, y) | ||
# one block per column | ||
offset = (blockIdx().x-1) * m | ||
@inbounds xtmp = x2[blockIdx().x] | ||
for i = threadIdx().x : blockDim().x : m | ||
@inbounds y[offset + i] = x1[offset + i] + xtmp | ||
end | ||
return | ||
end | ||
|
||
function add!(y, x1, x2) | ||
m, n = size(x1) | ||
@cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y) | ||
end | ||
|
||
function main() | ||
Random.seed!(1) | ||
m, n = 3072, 1536 # 256 multiplier | ||
x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5)) | ||
x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5)) | ||
y1 = similar(x1) | ||
|
||
results = @benchmark CUDA.@sync add!($y1, $x1, $x2) | ||
|
||
# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them | ||
CUDA.unsafe_free!(x1) | ||
CUDA.unsafe_free!(x2) | ||
CUDA.unsafe_free!(y1) | ||
|
||
return results | ||
end | ||
|
||
end | ||
|
||
cudadevrt.main() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters