From af86b696f131c672d453abf76f2bbac92d65462a Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Wed, 28 Apr 2021 09:56:29 +0200
Subject: [PATCH] Add additional integration benchmarks covering previous
 regressions.

[skip tests]
---
 perf/byval.jl         | 78 +++++++++++++++++++++++++++++++++++++++++++
 perf/cudadevrt.jl     | 42 +++++++++++++++++++++++
 perf/runbenchmarks.jl |  2 ++
 3 files changed, 122 insertions(+)
 create mode 100644 perf/byval.jl
 create mode 100644 perf/cudadevrt.jl

diff --git a/perf/byval.jl b/perf/byval.jl
new file mode 100644
index 0000000000..888215b1f5
--- /dev/null
+++ b/perf/byval.jl
@@ -0,0 +1,78 @@
+module ByVal
+
+using CUDA, BenchmarkTools, Random
+
+const threads = 256
+
+# simple add matrixes kernel
+function kernel_add_mat(n, x1, x2, y)
+    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
+    if i <= n
+        @inbounds y[i] = x1[i] + x2[i]
+    end
+    return
+end
+
+@inline get_inputs3(indx_y, a, b, c) = (a, b, c)
+@inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2) = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2)
+@inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) = indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3)
+
+# add arrays of matrixes kernel
+function kernel_add_mat_z_slices(n, vararg...)
+    x1, x2, y = get_inputs3(blockIdx().y, vararg...)
+    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
+    if i <= n
+        @inbounds y[i] = x1[i] + x2[i]
+    end
+    return
+end
+
+function add_z_slices!(y, x1, x2)
+    m1, n1 = size(x1[1]) #get size of first slice
+    blocks = (m1 * n1 + threads - 1) ÷ threads
+    # get length(x1) more blocks than needed to process 1 slice
+    @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
+end
+
+function add!(y, x1, x2)
+    m1, n1 = size(x1)
+    blocks = (m1 * n1 + threads - 1) ÷ threads
+    @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
+end
+
+function main()
+    results = BenchmarkGroup()
+
+    num_z_slices = 3
+    Random.seed!(1)
+
+    #m, n = 7, 5 # tiny to measure overhead
+    #m, n = 521, 111
+    #m, n = 1521, 1111
+    #m, n = 3001, 1511 # prime numbers to test memory access correctness
+    m, n = 3072, 1536 # 256 multiplier
+    #m, n = 6007, 3001 # prime numbers to test memory access correctness
+
+    x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    y1 = [similar(x1[1]) for i = 1:num_z_slices]
+
+    # reference down to bones add on GPU
+    results["reference"] = @benchmark CUDA.@sync add!($y1[1], $x1[1], $x2[1])
+
+    # adding arrays in an array
+    for slices = 1:num_z_slices
+        results["slices=$slices"] = @benchmark CUDA.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+    end
+
+    # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
+    CUDA.unsafe_free!.(x1)
+    CUDA.unsafe_free!.(x2)
+    CUDA.unsafe_free!.(y1)
+
+    return results
+end
+
+end
+
+ByVal.main()
diff --git a/perf/cudadevrt.jl b/perf/cudadevrt.jl
new file mode 100644
index 0000000000..f166dd373a
--- /dev/null
+++ b/perf/cudadevrt.jl
@@ -0,0 +1,42 @@
+module cudadevrt
+
+using CUDA, BenchmarkTools, Random
+
+const threads = 256
+#simple add matrix and vector kernel
+function kernel_add_mat_vec(m, x1, x2, y)
+    # one block per column
+    offset = (blockIdx().x-1) * m
+    @inbounds xtmp = x2[blockIdx().x]
+    for i = threadIdx().x : blockDim().x : m
+        @inbounds y[offset + i] = x1[offset + i] + xtmp
+    end
+    return
+end
+
+function add!(y, x1, x2)
+    m, n = size(x1)
+    @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
+end
+
+function main()
+    Random.seed!(1)
+    m, n = 3072, 1536 # 256 multiplier
+    x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5))
+    x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
+    y1 = similar(x1)
+
+    results = @benchmark CUDA.@sync add!($y1, $x1, $x2)
+
+    # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
+    CUDA.unsafe_free!(x1)
+    CUDA.unsafe_free!(x2)
+    CUDA.unsafe_free!(y1)
+
+    return results
+end
+
+end
+
+cudadevrt.main()
+
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index c145a71db3..6c7699fa12 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -82,6 +82,8 @@ results = run(SUITE, verbose=true)
 
 # integration tests (that do nasty things, so need to be run last)
 results["integration"]["volumerhs"] = include("volumerhs.jl")
+results["integration"]["byval"] = include("byval.jl")
+results["integration"]["cudadevrt"] = include("cudadevrt.jl")
 
 println(results)
 