Skip to content

Commit

Permalink
Add additional integration benchmarks covering previous regressions.
Browse files Browse the repository at this point in the history
[skip tests]
  • Loading branch information
maleadt committed Apr 28, 2021
1 parent ce2004d commit af86b69
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 0 deletions.
78 changes: 78 additions & 0 deletions perf/byval.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
module ByVal

using CUDA, BenchmarkTools, Random

# number of threads per block used by every kernel launch in this module
const threads = 256

# Simple kernel adding two matrices element-wise.
# Every thread derives one linear index from its block and thread position and,
# if that index is among the first `n` elements, writes `x1[idx] + x2[idx]` to `y[idx]`.
function kernel_add_mat(n, x1, x2, y)
    idx = threadIdx().x + blockDim().x * (blockIdx().x - 1)

    # threads past the n-th element have nothing to do
    idx > n && return nothing

    # bounds checks are disabled; this relies on `n` not exceeding the array lengths
    @inbounds y[idx] = x1[idx] + x2[idx]
    return nothing
end

# Pick the triple of arguments that belongs to slice `indx_y`.
# The trailing arguments hold one, two or three slices, grouped as all first
# operands, then all second operands, then all third operands.

# one slice: the index is irrelevant, the only triple is returned
@inline function get_inputs3(indx_y, a, b, c)
    return (a, b, c)
end

# two slices: index 1 selects the first triple, anything else the second
@inline function get_inputs3(indx_y, a1, a2, b1, b2, c1, c2)
    if indx_y == 1
        return (a1, b1, c1)
    end
    return (a2, b2, c2)
end

# three slices: index 1 and 2 select the first and second triple, anything else the third
@inline function get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3)
    if indx_y == 1
        return (a1, b1, c1)
    elseif indx_y == 2
        return (a2, b2, c2)
    else
        return (a3, b3, c3)
    end
end

# Kernel adding arrays of matrices in a single launch.
# The y coordinate of the block selects which (x1, x2, y) triple out of `vararg`
# is processed; the x coordinates select the element within that triple.
function kernel_add_mat_z_slices(n, vararg...)
    in1, in2, out = get_inputs3(blockIdx().y, vararg...)
    idx = threadIdx().x + blockDim().x * (blockIdx().x - 1)

    # threads past the n-th element have nothing to do
    idx > n && return nothing

    # bounds checks are disabled; this relies on `n` not exceeding the array lengths
    @inbounds out[idx] = in1[idx] + in2[idx]
    return nothing
end

# Launch `kernel_add_mat_z_slices` on the collections `x1`, `x2` and `y`.
# The size of the first entry of `x1` determines the element count used for the launch.
function add_z_slices!(y, x1, x2)
    rows, cols = size(x1[1])  # size of the first slice
    nelem = rows * cols
    # enough blocks along x to cover one slice (rounded up) ...
    nblocks = (nelem + threads - 1) ÷ threads
    # ... and one such row of blocks along y for each of the length(x1) slices
    return @cuda blocks=(nblocks, length(x1)) threads=threads kernel_add_mat_z_slices(nelem, x1..., x2..., y...)
end

# Launch `kernel_add_mat` to store `x1 + x2` in `y`, one thread per element.
function add!(y, x1, x2)
    rows, cols = size(x1)
    nelem = rows * cols
    # number of blocks needed to cover all elements, rounded up
    nblocks = (nelem + threads - 1) ÷ threads
    return @cuda blocks=(nblocks, 1) threads=threads kernel_add_mat(nelem, x1, x2, y)
end

# Build the benchmark group: a plain single-matrix addition as reference, plus the
# sliced addition for 1 up to `num_z_slices` matrices. Returns the `BenchmarkGroup`.
function main()
    results = BenchmarkGroup()

    num_z_slices = 3
    Random.seed!(1)

    # Matrix size in use: 3072 x 1536 (256 multiplier). Alternatives that were tried:
    #   7 x 5          tiny, to measure overhead
    #   521 x 111
    #   1521 x 1111
    #   3001 x 1511    prime numbers, to test memory access correctness
    #   6007 x 3001    prime numbers, to test memory access correctness
    m, n = 3072, 1536

    # one random Float32 matrix, shifted by 0.5 and converted with `cu`
    rand_slice() = cu(randn(Float32, m, n) .+ 0.5f0)

    lhs = [rand_slice() for _ in 1:num_z_slices]
    rhs = [rand_slice() for _ in 1:num_z_slices]
    out = [similar(first(lhs)) for _ in 1:num_z_slices]

    # reference: bare-bones addition of a single matrix pair on the GPU
    results["reference"] = @benchmark CUDA.@sync add!($out[1], $lhs[1], $rhs[1])

    # adding arrays held in an array, for an increasing number of slices
    for slices in 1:num_z_slices
        results["slices=$slices"] = @benchmark CUDA.@sync add_z_slices!($out[1:$slices], $lhs[1:$slices], $rhs[1:$slices])
    end

    # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
    for arrays in (lhs, rhs, out)
        foreach(CUDA.unsafe_free!, arrays)
    end

    return results
end

end

# Run the benchmarks when this file is loaded. Being the last expression of the
# file, the returned results become the value of `include("byval.jl")`.
ByVal.main()
42 changes: 42 additions & 0 deletions perf/cudadevrt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
module cudadevrt

using CUDA, BenchmarkTools, Random

# number of threads per block used by the kernel launch in this module
const threads = 256
# Simple kernel adding a vector to a matrix, one block per column.
# Block `col` reads `x2[col]` once and adds it to the `m` entries of `x1` that
# start at linear offset `(col - 1) * m`, storing the sums at the same positions of `y`.
function kernel_add_mat_vec(m, x1, x2, y)
    col = blockIdx().x
    first_in_col = (col - 1) * m
    addend = @inbounds x2[col]

    # the threads of the block share the column: each starts at its own thread
    # index and advances by the block size until `m` is passed
    for row in threadIdx().x:blockDim().x:m
        @inbounds y[first_in_col + row] = x1[first_in_col + row] + addend
    end
    return nothing
end

# Launch `kernel_add_mat_vec` with one block per column of `x1`.
function add!(y, x1, x2)
    nrows, ncols = size(x1)
    return @cuda blocks=(ncols, 1) threads=threads kernel_add_mat_vec(nrows, x1, x2, y)
end

# Benchmark adding a 1 x n row of values to an m x n matrix and return the
# benchmark result.
function main()
    Random.seed!(1)

    m, n = 3072, 1536  # 256 multiplier
    mat = cu(randn(Float32, m, n) .+ 0.5f0)
    row = cu(randn(Float32, 1, n) .+ 0.5f0)
    out = similar(mat)

    results = @benchmark CUDA.@sync add!($out, $mat, $row)

    # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
    foreach(CUDA.unsafe_free!, (mat, row, out))

    return results
end

end

# Run the benchmark when this file is loaded. Being the last expression of the
# file, the returned result becomes the value of `include("cudadevrt.jl")`.
cudadevrt.main()

2 changes: 2 additions & 0 deletions perf/runbenchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ results = run(SUITE, verbose=true)

# integration benchmarks (that do nasty things, so need to be run last);
# the value returned by each `include` is stored in the "integration" group
results["integration"]["volumerhs"] = include("volumerhs.jl")
results["integration"]["byval"] = include("byval.jl")
results["integration"]["cudadevrt"] = include("cudadevrt.jl")

println(results)

Expand Down

0 comments on commit af86b69

Please sign in to comment.