diff --git a/perf/array.jl b/perf/array.jl
index a6a226cc..0c57a7df 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -5,16 +5,16 @@ const n = 1000

 # generate some arrays
 cpu_mat = rand(rng, Float32, m, n)
-gpu_mat = CuArray{Float32}(undef, size(cpu_mat))
+gpu_mat = MtlArray{Float32}(undef, size(cpu_mat))
 gpu_vec = reshape(gpu_mat, length(gpu_mat))
 gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
 gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
-gpu_mat_ints = CuArray(rand(rng, Int, m, n))
+gpu_mat_ints = MtlArray(rand(rng, Int, m, n))
 gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
-gpu_mat_bools = CuArray(rand(rng, Bool, m, n))
+gpu_mat_bools = MtlArray(rand(rng, Bool, m, n))
 gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))

-group["construct"] = @benchmarkable CuArray{Int}(undef, 1)
+group["construct"] = @benchmarkable MtlArray{Int}(undef, 1)

 group["copy"] = @async_benchmarkable copy($gpu_mat)
@@ -26,7 +26,7 @@ let group = addgroup!(group, "copyto!")
 end

 let group = addgroup!(group, "iteration")
-    group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10]
+    group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]

     group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
@@ -46,12 +46,12 @@ let group = addgroup!(group, "iteration")
     end
 end

-let group = addgroup!(group, "reverse")
-    group["1d"] = @async_benchmarkable reverse($gpu_vec)
-    group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
-    group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
-    group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
-end
+# let group = addgroup!(group, "reverse")
+#     group["1d"] = @async_benchmarkable reverse($gpu_vec)
+#     group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
+#     group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
+#     group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
+# end

 group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0
@@ -77,31 +77,31 @@ end
 let group = addgroup!(group, "random")
     let group = addgroup!(group, "rand")
-        group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n)
-        group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n)
+        group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n)
+        group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n)
     end
     let group = addgroup!(group, "rand!")
-        group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec)
-        group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints)
+        group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec)
+        group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints)
     end
     let group = addgroup!(group, "randn")
-        group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n)
-        #group["Int64"] = @async_benchmarkable CUDA.randn(Int64, m*n)
+        group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n)
+        # group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n)
     end
     let group = addgroup!(group, "randn!")
-        group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec)
-        #group["Int64"] = @async_benchmarkable CUDA.randn!($gpu_vec_ints)
+        group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec)
+        # group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints)
     end
 end

-let group = addgroup!(group, "sorting")
-    group["1d"] = @async_benchmarkable sort($gpu_vec)
-    group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
-    group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
-end
+# let group = addgroup!(group, "sorting")
+#     group["1d"] = @async_benchmarkable sort($gpu_vec)
+#     group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
+#     group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
+# end

 let group = addgroup!(group, "permutedims")
     group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
diff --git a/perf/byval.jl b/perf/byval.jl
index d32d62a9..6a546641 100644
--- a/perf/byval.jl
+++ b/perf/byval.jl
@@ -1,13 +1,12 @@
 module ByVal

-using CUDA, BenchmarkTools, Random
-using CUDA: i32
+using Metal, BenchmarkTools, Random

 const threads = 256

 # simple add matrices kernel
 function kernel_add_mat(n, x1, x2, y)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
     if i <= n
         @inbounds y[i] = x1[i] + x2[i]
     end
@@ -20,8 +19,8 @@ end

 # add arrays of matrices kernel
 function kernel_add_mat_z_slices(n, vararg...)
-    x1, x2, y = get_inputs3(blockIdx().y, vararg...)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    x1, x2, y = get_inputs3(threadgroup_position_in_grid_2d().y, vararg...)
+    i = thread_position_in_grid_1d()
     if i <= n
         @inbounds y[i] = x1[i] + x2[i]
     end
@@ -30,15 +29,15 @@ end

 function add_z_slices!(y, x1, x2)
     m1, n1 = size(x1[1]) #get size of first slice
-    blocks = (m1 * n1 + threads - 1) ÷ threads
-    # get length(x1) more blocks than needed to process 1 slice
-    @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
+    groups = (m1 * n1 + threads - 1) ÷ threads
+    # get length(x1) more groups than needed to process 1 slice
+    @metal groups = groups, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
 end

 function add!(y, x1, x2)
     m1, n1 = size(x1)
-    blocks = (m1 * n1 + threads - 1) ÷ threads
-    @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
+    groups = (m1 * n1 + threads - 1) ÷ threads
+    @metal groups = (groups, 1) threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
 end

 function main()
@@ -54,22 +53,22 @@ function main()
     m, n = 3072, 1536    # 256 multiplier
     #m, n = 6007, 3001   # prime numbers to test memory access correctness

-    x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
-    x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x1 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x2 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
     y1 = [similar(x1[1]) for i = 1:num_z_slices]

     # reference down to bones add on GPU
-    results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])
+    results["reference"] = @benchmark Metal.@sync add!($y1[1], $x1[1], $x2[1])

     # adding arrays in an array
     for slices = 1:num_z_slices
-        results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+        results["slices=$slices"] = @benchmark Metal.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
     end

     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!.(x1)
-    CUDA.unsafe_free!.(x2)
-    CUDA.unsafe_free!.(y1)
+    Metal.unsafe_free!.(x1)
+    Metal.unsafe_free!.(x2)
+    Metal.unsafe_free!.(y1)

     return results
 end
diff --git a/perf/kernel.jl b/perf/kernel.jl
index 1022a774..5cfcc924 100644
--- a/perf/kernel.jl
+++ b/perf/kernel.jl
@@ -1,33 +1,35 @@
-using CUDA: i32
+# using GPUArrays

 group = addgroup!(SUITE, "kernel")

-group["launch"] = @benchmarkable @cuda identity(nothing)
+group["launch"] = @benchmarkable @metal identity(nothing)

-group["occupancy"] = @benchmarkable begin
-    kernel = @cuda launch=false identity(nothing)
-    launch_configuration(kernel.fun)
-end
+# group["occupancy"] = @benchmarkable begin
+#     kernel = @metal launch=false identity(nothing)
+#     GPUArrays.launch_heuristic(Metal.mtlArrayBackend(), kernel.f; elements=1, elements_per_thread=1)
+#     return
+# end

-src = CUDA.rand(Float32, 512, 1000)
+src = Metal.rand(Float32, 512, 1000)
 dest = similar(src)

 function indexing_kernel(dest, src)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
     @inbounds dest[i] = src[i]
     return
 end
-group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src)
+group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src)

 function checked_indexing_kernel(dest, src)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
     dest[i] = src[i]
     return
 end
-group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)
+group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src)

-function rand_kernel(dest::AbstractArray{T}) where {T}
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
-    dest[i] = rand(T)
-    return
-end
-group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)
+## DELETE
+# function rand_kernel(dest::AbstractArray{T}) where {T}
+#     i = thread_position_in_grid_1d()
+#     dest[i] = Metal.rand(T)
+#     return
+# end
+# group["rand"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $rand_kernel($dest)
diff --git a/perf/latency.jl b/perf/latency.jl
index 9cd20135..a1066abe 100644
--- a/perf/latency.jl
+++ b/perf/latency.jl
@@ -1,6 +1,6 @@
 module Latency

-using CUDA
+using Metal
 using BenchmarkTools

 function main()
@@ -11,24 +11,24 @@ function main()
         base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))`
     end
     # NOTE: we don't use Base.active_project() here because of how CI launches this script,
-    #       starting with --project in the main CUDA.jl project.
+    #       starting with --project in the main Metal.jl project.

     # time to precompile the package and its dependencies
     precompile_cmd =
-        `$base_cmd -e "pkg = Base.identify_package(\"CUDA\")
+        `$base_cmd -e "pkg = Base.identify_package(\"Metal\")
                        Base.compilecache(pkg)"`
     results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60

     # time to actually import the package
     import_cmd =
-        `$base_cmd -e "using CUDA"`
+        `$base_cmd -e "using Metal"`
     results["import"] = @benchmark run($import_cmd) evals=1 seconds=30

     # time to actually compile a kernel
     ttfp_cmd =
-        `$base_cmd -e "using CUDA
+        `$base_cmd -e "using Metal
                        kernel() = return
-                       CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
+                       Metal.code_agx(devnull, kernel, Tuple{}; kernel=true)"`
     results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60

     results
diff --git a/perf/metal.jl b/perf/metal.jl
index 5dbcbf0a..5e136b0e 100644
--- a/perf/metal.jl
+++ b/perf/metal.jl
@@ -1,14 +1,6 @@
-group = addgroup!(SUITE, "cuda")
+group = addgroup!(SUITE, "metal")

 let group = addgroup!(group, "synchronization")
-    let group = addgroup!(group, "stream")
-        group["blocking"] = @benchmarkable synchronize(blocking=true)
-        group["auto"] = @benchmarkable synchronize()
-        group["nonblocking"] = @benchmarkable synchronize(spin=false)
-    end
-    let group = addgroup!(group, "context")
-        group["blocking"] = @benchmarkable device_synchronize(blocking=true)
-        group["auto"] = @benchmarkable device_synchronize()
-        group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
-    end
+    group["stream"] = @benchmarkable synchronize()
+    group["context"] = @benchmarkable device_synchronize()
 end
diff --git a/perf/metaldevrt.jl b/perf/metaldevrt.jl
index 08348fab..a3dbd07f 100644
--- a/perf/metaldevrt.jl
+++ b/perf/metaldevrt.jl
@@ -1,14 +1,14 @@
-module cudadevrt
+module metaldevrt

-using CUDA, BenchmarkTools, Random
+using Metal, BenchmarkTools, Random

 const threads = 256

 #simple add matrix and vector kernel
 function kernel_add_mat_vec(m, x1, x2, y)
     # one block per column
-    offset = (blockIdx().x-1) * m
-    @inbounds xtmp = x2[blockIdx().x]
-    for i = threadIdx().x : blockDim().x : m
+    offset = (threadgroup_position_in_grid_2d().x-1) * m
+    @inbounds xtmp = x2[threadgroup_position_in_grid_2d().x]
+    for i = thread_position_in_threadgroup_2d().x : threads_per_threadgroup_2d().x : m
         @inbounds y[offset + i] = x1[offset + i] + xtmp
     end
     return
@@ -16,27 +16,27 @@ end

 function add!(y, x1, x2)
     m, n = size(x1)
-    @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
+    @metal groups = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
 end

 function main()
     Random.seed!(1)
     m, n = 3072, 1536    # 256 multiplier

-    x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5))
-    x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
+    x1 = mtl(randn(Float32, (m, n)) .+ Float32(0.5))
+    x2 = mtl(randn(Float32, (1, n)) .+ Float32(0.5))
     y1 = similar(x1)

-    results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)
+    results = @benchmark Metal.@sync add!($y1, $x1, $x2)

     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!(x1)
-    CUDA.unsafe_free!(x2)
-    CUDA.unsafe_free!(y1)
+    Metal.unsafe_free!(x1)
+    Metal.unsafe_free!(x2)
+    Metal.unsafe_free!(y1)

     return results
 end

 end

-cudadevrt.main()
+metaldevrt.main()
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index e3c6c6e9..549b01c9 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -1,18 +1,14 @@
 # benchmark suite execution and codespeed submission

-using CUDA
+using Metal
 using BenchmarkTools
 using StableRNGs

 rng = StableRNG(123)

-# we only submit results when running on the master branch
-real_run = get(ENV, "CODESPEED_BRANCH", nothing) == "master"
-if real_run
-    # to find untuned benchmarks
-    BenchmarkTools.DEFAULT_PARAMETERS.evals = 0
-end
+# to find untuned benchmarks
+BenchmarkTools.DEFAULT_PARAMETERS.evals = 0

 # print system information
 @info "System information:\n" * sprint(io->Metal.versioninfo(io))
@@ -20,7 +16,7 @@ end

 # convenience macro to create a benchmark that requires synchronizing the GPU
 macro async_benchmarkable(ex...)
     quote
-        @benchmarkable CUDA.@sync blocking=true $(ex...)
+        @benchmarkable Metal.@sync $(ex...)
     end
 end
@@ -33,19 +29,18 @@ SUITE = BenchmarkGroup()

 # NOTE: don't use spaces in benchmark names (tobami/codespeed#256)

-include("cuda.jl")
+include("metal.jl")
 include("kernel.jl")
 include("array.jl")

-if real_run
-    @info "Preparing main benchmarks"
-    warmup(SUITE; verbose=false)
-    tune!(SUITE)
+@info "Preparing main benchmarks"
+warmup(SUITE; verbose=false)
+tune!(SUITE)

-    # reclaim memory that might have been used by the tuning process
-    GC.gc(true)
-    CUDA.reclaim()
-end
+# reclaim memory that might have been used by the tuning process
+GC.gc(true)
+GC.gc(true)
+GC.gc(true)

 # benchmark groups that aren't part of the suite
 addgroup!(SUITE, "integration")
@@ -56,9 +51,9 @@ results = run(SUITE, verbose=true)
 # integration tests (that do nasty things, so need to be run last)
 @info "Running integration benchmarks"
 integration_results = BenchmarkGroup()
-integration_results["volumerhs"] = include("volumerhs.jl")
+# integration_results["volumerhs"] = include("volumerhs.jl")
 integration_results["byval"] = include("byval.jl")
-integration_results["cudadevrt"] = include("cudadevrt.jl")
+integration_results["metaldevrt"] = include("metaldevrt.jl")

 results["latency"] = latency_results
 results["integration"] = integration_results
@@ -84,48 +79,3 @@ if ispath(reference_path)
     println("Regressions:")
     println(regressions(comparison))
 end
-
-
-## submission
-
-# using JSON, HTTP
-
-if real_run
-    @info "Submitting to Codespeed..."
-
-    basedata = Dict(
-        "branch" => ENV["CODESPEED_BRANCH"],
-        "commitid" => ENV["CODESPEED_COMMIT"],
-        "project" => ENV["CODESPEED_PROJECT"],
-        "environment" => ENV["CODESPEED_ENVIRONMENT"],
-        "executable" => ENV["CODESPEED_EXECUTABLE"]
-    )
-
-    # convert nested groups of benchmark to flat dictionaries of results
-    flat_results = []
-    function flatten(results, prefix="")
-        for (key,value) in results
-            if value isa BenchmarkGroup
-                flatten(value, "$prefix$key/")
-            else
-                @assert value isa BenchmarkTools.Trial
-
-                # codespeed reports maxima, but those are often very noisy.
-                # get rid of measurements that unnecessarily skew the distribution.
-                rmskew!(value)
-
-                push!(flat_results,
-                      Dict(basedata...,
-                           "benchmark" => "$prefix$key",
-                           "result_value" => median(value).time / 1e9,
-                           "min" => minimum(value).time / 1e9,
-                           "max" => maximum(value).time / 1e9))
-            end
-        end
-    end
-    flatten(results)
-
-    HTTP.post("$(ENV["CODESPEED_SERVER"])/result/add/json/",
-              ["Content-Type" => "application/x-www-form-urlencoded"],
-              HTTP.URIs.escapeuri(Dict("json" => JSON.json(flat_results))))
-end
diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl
index 5c7737f5..f5b3b6d3 100644
--- a/perf/volumerhs.jl
+++ b/perf/volumerhs.jl
@@ -1,7 +1,7 @@
 module VolumeRHS

 using BenchmarkTools
-using CUDA
+using Metal
 using StableRNGs
 using StaticArrays
@@ -20,16 +20,16 @@ end

 # HACK: module-local versions of core arithmetic; needed to get FMA
 for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub))
-    for (T, llvmT) in ((:Float32, "float"), (:Float64, "double"))
-        ir = """
-            %x = f$f contract nsz $llvmT %0, %1
-            ret $llvmT %x
-            """
-        @eval begin
-            # the @pure is necessary so that we can constant propagate.
-            @inline Base.@pure function $jlf(a::$T, b::$T)
-                Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
-            end
+    T = :Float32
+    llvmT = "float"
+    ir = """
+        %x = f$f contract nsz $llvmT %0, %1
+        ret $llvmT %x
+        """
+    @eval begin
+        # the @pure is necessary so that we can constant propagate.
+        @inline Base.@pure function $jlf(a::$T, b::$T)
+            Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
         end
     end
     @eval function $jlf(args...)
@@ -38,16 +38,16 @@ for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub))
 end

 let (jlf, f) = (:div_arcp, :div)
-    for (T, llvmT) in ((:Float32, "float"), (:Float64, "double"))
-        ir = """
-            %x = f$f fast $llvmT %0, %1
-            ret $llvmT %x
-            """
-        @eval begin
-            # the @pure is necessary so that we can constant propagate.
-            @inline Base.@pure function $jlf(a::$T, b::$T)
-                Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
-            end
+    T = :Float32
+    llvmT = "float"
+    ir = """
+        %x = f$f fast $llvmT %0, %1
+        ret $llvmT %x
+        """
+    @eval begin
+        # the @pure is necessary so that we can constant propagate.
+        @inline Base.@pure function $jlf(a::$T, b::$T)
+            Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
         end
     end
     @eval function $jlf(args...)
@@ -77,8 +77,10 @@ const N = 4
 const nmoist = 0
 const ntrace = 0

-Base.@irrational grav 9.81 BigFloat(9.81)
-Base.@irrational gdm1 0.4 BigFloat(0.4)
+# Base.@irrational grav 9.81 BigFloat(9.81)
+const grav = 9.81f0
+# Base.@irrational gdm1 0.4 BigFloat(0.4)
+const gdm1 = 0.4f0

 function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
     Q = Base.Experimental.Const(Q)
@@ -99,9 +101,9 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
     r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef)
     r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef)

-    e = blockIdx().x
-    j = threadIdx().y
-    i = threadIdx().x
+    e = threadgroup_position_in_grid_2d().x
+    j = thread_position_in_threadgroup_2d().y
+    i = thread_position_in_threadgroup_2d().x

     @inbounds begin
         for k in 1:Nq
@@ -233,37 +235,37 @@ function main()
     Nq = N + 1
     nvar = _nstate + nmoist + ntrace

-    Q = 1 .+ CuArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem))
+    Q = 1 .+ MtlArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem))
     Q[:, :, :, _E, :] .+= 20
-    vgeo = CuArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem))
+    vgeo = MtlArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem))

     # make sure the entries of the mass matrix satisfy the inverse relation
     vgeo[:, :, :, _MJ, :] .+= 3
     vgeo[:, :, :, _MJI, :] .= 1 ./ vgeo[:, :, :, _MJ, :]

-    D = CuArray(rand(rng, DFloat, Nq, Nq))
+    D = MtlArray(rand(rng, DFloat, Nq, Nq))

-    rhs = CuArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem))
+    rhs = MtlArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem))

     threads=(N+1, N+1)
-    kernel = @cuda launch=false volumerhs!(rhs, Q, vgeo, DFloat(grav), D, nelem)
+    kernel = @metal launch=false volumerhs!(rhs, Q, vgeo, DFloat(grav), D, nelem)

     # XXX: should we print these for all kernels? maybe upload them to Codespeed?
-    @info """volumerhs! details:
-             - $(CUDA.registers(kernel)) registers, max $(CUDA.maxthreads(kernel)) threads
-             - $(Base.format_bytes(CUDA.memory(kernel).local)) local memory,
-               $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory,
-               $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory"""
+    # @info """volumerhs! details:
+    #          - $(Metal.registers(kernel)) registers, max $(Metal.maxthreads(kernel)) threads
+    #          - $(Base.format_bytes(Metal.memory(kernel).local)) local memory,
+    #            $(Base.format_bytes(Metal.memory(kernel).shared)) shared memory,
+    #            $(Base.format_bytes(Metal.memory(kernel).constant)) constant memory"""

     results = @benchmark begin
-        CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
-                                         threads=$threads, blocks=$nelem)
+        Metal.@sync $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
+                            threads=$threads, groups=$nelem)
     end

     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!(rhs)
-    CUDA.unsafe_free!(Q)
-    CUDA.unsafe_free!(vgeo)
-    CUDA.unsafe_free!(D)
+    Metal.unsafe_free!(rhs)
+    Metal.unsafe_free!(Q)
+    Metal.unsafe_free!(vgeo)
+    Metal.unsafe_free!(D)

     results
 end
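
For readers who want to try the ported launch idiom outside the benchmark harness, below is a minimal, self-contained sketch of the CUDA-to-Metal translation applied throughout this diff: thread_position_in_grid_1d() replaces the (blockIdx().x-1)*blockDim().x + threadIdx().x computation, @metal threads=... groups=... replaces @cuda threads=... blocks=..., mtl replaces cu, and Metal.@sync replaces CUDA.@sync blocking=true. The kernel and variable names are illustrative only and do not appear in the diff.

# Minimal sketch (assumes Metal.jl on an Apple silicon machine); names are hypothetical.
using Metal

function vadd_kernel(n, x, y, out)
    i = thread_position_in_grid_1d()       # 1-based global thread index
    if i <= n
        @inbounds out[i] = x[i] + y[i]
    end
    return
end

function vadd!(out, x, y)
    n = length(out)
    threads = 256                          # threads per threadgroup
    groups = cld(n, threads)               # `groups` is the Metal analogue of CUDA's `blocks`
    @metal threads=threads groups=groups vadd_kernel(n, x, y, out)
    return out
end

x   = mtl(rand(Float32, 4096))             # `mtl` converts a host array to an MtlArray (like `cu`)
y   = mtl(rand(Float32, 4096))
out = similar(x)
Metal.@sync vadd!(out, x, y)               # synchronize before timing or reading results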