diff --git a/perf/array.jl b/perf/array.jl
index a6a226cc..0c57a7df 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -5,16 +5,16 @@ const n = 1000

 # generate some arrays
 cpu_mat = rand(rng, Float32, m, n)
-gpu_mat = CuArray{Float32}(undef, size(cpu_mat))
+gpu_mat = MtlArray{Float32}(undef, size(cpu_mat))
 gpu_vec = reshape(gpu_mat, length(gpu_mat))
 gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
 gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
-gpu_mat_ints = CuArray(rand(rng, Int, m, n))
+gpu_mat_ints = MtlArray(rand(rng, Int, m, n))
 gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
-gpu_mat_bools = CuArray(rand(rng, Bool, m, n))
+gpu_mat_bools = MtlArray(rand(rng, Bool, m, n))
 gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))

-group["construct"] = @benchmarkable CuArray{Int}(undef, 1)
+group["construct"] = @benchmarkable MtlArray{Int}(undef, 1)

 group["copy"] = @async_benchmarkable copy($gpu_mat)
@@ -26,7 +26,7 @@ let group = addgroup!(group, "copyto!")
 end

 let group = addgroup!(group, "iteration")
-    group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10]
+    group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]

     group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
@@ -46,12 +46,12 @@ let group = addgroup!(group, "iteration")
     end
 end

-let group = addgroup!(group, "reverse")
-    group["1d"] = @async_benchmarkable reverse($gpu_vec)
-    group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
-    group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
-    group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
-end
+# let group = addgroup!(group, "reverse")
+#     group["1d"] = @async_benchmarkable reverse($gpu_vec)
+#     group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
+#     group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
+#     group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
+# end

 group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0
@@ -77,31 +77,31 @@ end
 let group = addgroup!(group, "random")
     let group = addgroup!(group, "rand")
-        group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n)
-        group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n)
+        group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n)
+        group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n)
     end
     let group = addgroup!(group, "rand!")
-        group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec)
-        group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints)
+        group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec)
+        group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints)
     end
     let group = addgroup!(group, "randn")
-        group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n)
-        #group["Int64"] = @async_benchmarkable CUDA.randn(Int64, m*n)
+        group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n)
+        # group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n)
     end
     let group = addgroup!(group, "randn!")
-        group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec)
-        #group["Int64"] = @async_benchmarkable CUDA.randn!($gpu_vec_ints)
+        group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec)
+        # group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints)
     end
 end

-let group = addgroup!(group, "sorting")
-    group["1d"] = @async_benchmarkable sort($gpu_vec)
-    group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
-    group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
-end
+# let group = addgroup!(group, "sorting")
+#     group["1d"] = @async_benchmarkable sort($gpu_vec)
+#     group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
+#     group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
+# end

 let group = addgroup!(group, "permutedims")
     group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
diff --git a/perf/byval.jl b/perf/byval.jl
index d32d62a9..6a546641 100644
--- a/perf/byval.jl
+++ b/perf/byval.jl
@@ -1,13 +1,12 @@
 module ByVal

-using CUDA, BenchmarkTools, Random
-using CUDA: i32
+using Metal, BenchmarkTools, Random

 const threads = 256

 # simple add matrices kernel
 function kernel_add_mat(n, x1, x2, y)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
     if i <= n
         @inbounds y[i] = x1[i] + x2[i]
     end
@@ -20,8 +19,8 @@ end

 # add arrays of matrices kernel
 function kernel_add_mat_z_slices(n, vararg...)
-    x1, x2, y = get_inputs3(blockIdx().y, vararg...)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    x1, x2, y = get_inputs3(threadgroup_position_in_grid_2d().y, vararg...)
+    i = thread_position_in_grid_1d()
     if i <= n
         @inbounds y[i] = x1[i] + x2[i]
     end
@@ -30,15 +29,15 @@ end

 function add_z_slices!(y, x1, x2)
     m1, n1 = size(x1[1]) #get size of first slice
-    blocks = (m1 * n1 + threads - 1) ÷ threads
-    # get length(x1) more blocks than needed to process 1 slice
-    @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
+    groups = (m1 * n1 + threads - 1) ÷ threads
+    # get length(x1) more groups than needed to process 1 slice
+    @metal groups = groups, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
 end

 function add!(y, x1, x2)
     m1, n1 = size(x1)
-    blocks = (m1 * n1 + threads - 1) ÷ threads
-    @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
+    groups = (m1 * n1 + threads - 1) ÷ threads
+    @metal groups = (groups, 1) threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
 end

 function main()
@@ -54,22 +53,22 @@ function main()
     m, n = 3072, 1536    # 256 multiplier
     #m, n = 6007, 3001   # prime numbers to test memory access correctness

-    x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
-    x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x1 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x2 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
     y1 = [similar(x1[1]) for i = 1:num_z_slices]

     # reference down to bones add on GPU
-    results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])
+    results["reference"] = @benchmark Metal.@sync add!($y1[1], $x1[1], $x2[1])

     # adding arrays in an array
     for slices = 1:num_z_slices
-        results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+        results["slices=$slices"] = @benchmark Metal.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
     end

     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!.(x1)
-    CUDA.unsafe_free!.(x2)
-    CUDA.unsafe_free!.(y1)
+    Metal.unsafe_free!.(x1)
+    Metal.unsafe_free!.(x2)
+    Metal.unsafe_free!.(y1)

     return results
 end
diff --git a/perf/kernel.jl b/perf/kernel.jl
index 1022a774..5cfcc924 100644
--- a/perf/kernel.jl
+++ b/perf/kernel.jl
@@ -1,33 +1,35 @@
-using CUDA: i32
+# using GPUArrays

 group = addgroup!(SUITE, "kernel")

-group["launch"] = @benchmarkable @cuda identity(nothing)
+group["launch"] = @benchmarkable @metal identity(nothing)

-group["occupancy"] = @benchmarkable begin
-    kernel = @cuda launch=false identity(nothing)
-    launch_configuration(kernel.fun)
-end
+# group["occupancy"] = @benchmarkable begin
+#     kernel = @metal launch=false identity(nothing)
+#     GPUArrays.launch_heuristic(Metal.mtlArrayBackend(), kernel.f; elements=1, elements_per_thread=1)
+#     return
+# end

-src = CUDA.rand(Float32, 512, 1000)
+src = Metal.rand(Float32, 512, 1000)
 dest = similar(src)

 function indexing_kernel(dest, src)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
     @inbounds dest[i] = src[i]
     return
 end
-group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src)
+group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src)

 function checked_indexing_kernel(dest, src)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
     dest[i] = src[i]
     return
 end
-group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)
+group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src)

-function rand_kernel(dest::AbstractArray{T}) where {T}
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
-    dest[i] = rand(T)
-    return
-end
-group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)
+## DELETE
+# function rand_kernel(dest::AbstractArray{T}) where {T}
+#     i = thread_position_in_grid_1d()
+#     dest[i] = Metal.rand(T)
+#     return
+# end
+# group["rand"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $rand_kernel($dest)
diff --git a/perf/latency.jl b/perf/latency.jl
index 9cd20135..a1066abe 100644
--- a/perf/latency.jl
+++ b/perf/latency.jl
@@ -1,6 +1,6 @@
 module Latency

-using CUDA
+using Metal
 using BenchmarkTools

 function main()
@@ -11,24 +11,24 @@ function main()
         base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))`
     end
     # NOTE: we don't use Base.active_project() here because of how CI launches this script,
-    #       starting with --project in the main CUDA.jl project.
+    #       starting with --project in the main Metal.jl project.

     # time to precompile the package and its dependencies
     precompile_cmd =
-        `$base_cmd -e "pkg = Base.identify_package(\"CUDA\")
+        `$base_cmd -e "pkg = Base.identify_package(\"Metal\")
                        Base.compilecache(pkg)"`
     results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60

     # time to actually import the package
     import_cmd =
-        `$base_cmd -e "using CUDA"`
+        `$base_cmd -e "using Metal"`
     results["import"] = @benchmark run($import_cmd) evals=1 seconds=30

     # time to actually compile a kernel
     ttfp_cmd =
-        `$base_cmd -e "using CUDA
+        `$base_cmd -e "using Metal
                        kernel() = return
-                       CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
+                       Metal.code_agx(devnull, kernel, Tuple{}; kernel=true)"`
     results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60

     results
diff --git a/perf/metal.jl b/perf/metal.jl
index 5dbcbf0a..5e136b0e 100644
--- a/perf/metal.jl
+++ b/perf/metal.jl
@@ -1,14 +1,6 @@
-group = addgroup!(SUITE, "cuda")
+group = addgroup!(SUITE, "metal")

 let group = addgroup!(group, "synchronization")
-    let group = addgroup!(group, "stream")
-        group["blocking"] = @benchmarkable synchronize(blocking=true)
-        group["auto"] = @benchmarkable synchronize()
-        group["nonblocking"] = @benchmarkable synchronize(spin=false)
-    end
-    let group = addgroup!(group, "context")
-        group["blocking"] = @benchmarkable device_synchronize(blocking=true)
-        group["auto"] = @benchmarkable device_synchronize()
-        group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
-    end
+    group["stream"] = @benchmarkable synchronize()
+    group["context"] = @benchmarkable device_synchronize()
 end
diff --git a/perf/metaldevrt.jl b/perf/metaldevrt.jl
index 08348fab..a3dbd07f 100644
--- a/perf/metaldevrt.jl
+++ b/perf/metaldevrt.jl
@@ -1,14 +1,14 @@
-module cudadevrt
+module metaldevrt

-using CUDA, BenchmarkTools, Random
+using Metal, BenchmarkTools, Random

 const threads = 256

 #simple add matrix and vector kernel
 function kernel_add_mat_vec(m, x1, x2, y)
     # one block per column
-    offset = (blockIdx().x-1) * m
-    @inbounds xtmp = x2[blockIdx().x]
-    for i = threadIdx().x : blockDim().x : m
+    offset = (threadgroup_position_in_grid_2d().x-1) * m
+    @inbounds xtmp = x2[threadgroup_position_in_grid_2d().x]
+    for i = thread_position_in_threadgroup_2d().x : threads_per_threadgroup_2d().x : m
         @inbounds y[offset + i] = x1[offset + i] + xtmp
     end
     return
@@ -16,27 +16,27 @@ end

 function add!(y, x1, x2)
     m, n = size(x1)
-    @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
+    @metal groups = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
 end

 function main()
     Random.seed!(1)
     m, n = 3072, 1536    # 256 multiplier

-    x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5))
-    x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
+    x1 = mtl(randn(Float32, (m, n)) .+ Float32(0.5))
+    x2 = mtl(randn(Float32, (1, n)) .+ Float32(0.5))
     y1 = similar(x1)

-    results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)
+    results = @benchmark Metal.@sync add!($y1, $x1, $x2)

     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!(x1)
-    CUDA.unsafe_free!(x2)
-    CUDA.unsafe_free!(y1)
+    Metal.unsafe_free!(x1)
+    Metal.unsafe_free!(x2)
+    Metal.unsafe_free!(y1)

     return results
 end

 end

-cudadevrt.main()
+metaldevrt.main()
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index e3c6c6e9..549b01c9 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -1,18 +1,14 @@
 # benchmark suite execution and codespeed submission

-using CUDA
+using Metal
 using BenchmarkTools
 using StableRNGs

 rng = StableRNG(123)

-# we only submit results when running on the master branch
-real_run = get(ENV, "CODESPEED_BRANCH", nothing) == "master"
-if real_run
-    # to find untuned benchmarks
-    BenchmarkTools.DEFAULT_PARAMETERS.evals = 0
-end
+# to find untuned benchmarks
+BenchmarkTools.DEFAULT_PARAMETERS.evals = 0

 # print system information
 @info "System information:\n" * sprint(io->Metal.versioninfo(io))
@@ -20,7 +16,7 @@ end

 # convenience macro to create a benchmark that requires synchronizing the GPU
 macro async_benchmarkable(ex...)
     quote
-        @benchmarkable CUDA.@sync blocking=true $(ex...)
+        @benchmarkable Metal.@sync $(ex...)
     end
 end
@@ -33,19 +29,18 @@ SUITE = BenchmarkGroup()

 # NOTE: don't use spaces in benchmark names (tobami/codespeed#256)

-include("cuda.jl")
+include("metal.jl")
 include("kernel.jl")
 include("array.jl")

-if real_run
-    @info "Preparing main benchmarks"
-    warmup(SUITE; verbose=false)
-    tune!(SUITE)
+@info "Preparing main benchmarks"
+warmup(SUITE; verbose=false)
+tune!(SUITE)

-    # reclaim memory that might have been used by the tuning process
-    GC.gc(true)
-    CUDA.reclaim()
-end
+# reclaim memory that might have been used by the tuning process
+GC.gc(true)
+GC.gc(true)
+GC.gc(true)

 # benchmark groups that aren't part of the suite
 addgroup!(SUITE, "integration")
@@ -56,9 +51,9 @@ results = run(SUITE, verbose=true)
 # integration tests (that do nasty things, so need to be run last)
 @info "Running integration benchmarks"
 integration_results = BenchmarkGroup()
-integration_results["volumerhs"] = include("volumerhs.jl")
+# integration_results["volumerhs"] = include("volumerhs.jl")
 integration_results["byval"] = include("byval.jl")
-integration_results["cudadevrt"] = include("cudadevrt.jl")
+integration_results["metaldevrt"] = include("metaldevrt.jl")

 results["latency"] = latency_results
 results["integration"] = integration_results
@@ -84,48 +79,3 @@ if ispath(reference_path)
     println("Regressions:")
     println(regressions(comparison))
 end
-
-
-## submission
-
-# using JSON, HTTP
-
-if real_run
-    @info "Submitting to Codespeed..."
-
-    basedata = Dict(
-        "branch" => ENV["CODESPEED_BRANCH"],
-        "commitid" => ENV["CODESPEED_COMMIT"],
-        "project" => ENV["CODESPEED_PROJECT"],
-        "environment" => ENV["CODESPEED_ENVIRONMENT"],
-        "executable" => ENV["CODESPEED_EXECUTABLE"]
-    )
-
-    # convert nested groups of benchmark to flat dictionaries of results
-    flat_results = []
-    function flatten(results, prefix="")
-        for (key,value) in results
-            if value isa BenchmarkGroup
-                flatten(value, "$prefix$key/")
-            else
-                @assert value isa BenchmarkTools.Trial
-
-                # codespeed reports maxima, but those are often very noisy.
-                # get rid of measurements that unnecessarily skew the distribution.
-                rmskew!(value)
-
-                push!(flat_results,
-                      Dict(basedata...,
-                           "benchmark" => "$prefix$key",
-                           "result_value" => median(value).time / 1e9,
-                           "min" => minimum(value).time / 1e9,
-                           "max" => maximum(value).time / 1e9))
-            end
-        end
-    end
-    flatten(results)
-
-    HTTP.post("$(ENV["CODESPEED_SERVER"])/result/add/json/",
-              ["Content-Type" => "application/x-www-form-urlencoded"],
-              HTTP.URIs.escapeuri(Dict("json" => JSON.json(flat_results))))
-end
diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl
index 5c7737f5..f5b3b6d3 100644
--- a/perf/volumerhs.jl
+++ b/perf/volumerhs.jl
@@ -1,7 +1,7 @@
 module VolumeRHS

 using BenchmarkTools
-using CUDA
+using Metal
 using StableRNGs
 using StaticArrays
@@ -20,16 +20,16 @@ end

 # HACK: module-local versions of core arithmetic; needed to get FMA
 for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub))
-    for (T, llvmT) in ((:Float32, "float"), (:Float64, "double"))
-        ir = """
-            %x = f$f contract nsz $llvmT %0, %1
-            ret $llvmT %x
-            """
-        @eval begin
-            # the @pure is necessary so that we can constant propagate.
-            @inline Base.@pure function $jlf(a::$T, b::$T)
-                Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
-            end
+    T = :Float32
+    llvmT = "float"
+    ir = """
+        %x = f$f contract nsz $llvmT %0, %1
+        ret $llvmT %x
+        """
+    @eval begin
+        # the @pure is necessary so that we can constant propagate.
+        @inline Base.@pure function $jlf(a::$T, b::$T)
+            Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
         end
     end
     @eval function $jlf(args...)
@@ -38,16 +38,16 @@ for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub))
 end

 let (jlf, f) = (:div_arcp, :div)
-    for (T, llvmT) in ((:Float32, "float"), (:Float64, "double"))
-        ir = """
-            %x = f$f fast $llvmT %0, %1
-            ret $llvmT %x
-            """
-        @eval begin
-            # the @pure is necessary so that we can constant propagate.
-            @inline Base.@pure function $jlf(a::$T, b::$T)
-                Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
-            end
+    T = :Float32
+    llvmT = "float"
+    ir = """
+        %x = f$f fast $llvmT %0, %1
+        ret $llvmT %x
+        """
+    @eval begin
+        # the @pure is necessary so that we can constant propagate.
+        @inline Base.@pure function $jlf(a::$T, b::$T)
+            Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
         end
     end
     @eval function $jlf(args...)
@@ -77,8 +77,10 @@ const N = 4
 const nmoist = 0
 const ntrace = 0

-Base.@irrational grav 9.81 BigFloat(9.81)
-Base.@irrational gdm1 0.4 BigFloat(0.4)
+# Base.@irrational grav 9.81 BigFloat(9.81)
+const grav = 9.81f0
+# Base.@irrational gdm1 0.4 BigFloat(0.4)
+const gdm1 = 0.4f0

 function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
     Q = Base.Experimental.Const(Q)
@@ -99,9 +101,9 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
     r_rhsW = MArray{Tuple{Nq}, eltype(rhs)}(undef)
     r_rhsE = MArray{Tuple{Nq}, eltype(rhs)}(undef)

-    e = blockIdx().x
-    j = threadIdx().y
-    i = threadIdx().x
+    e = threadgroup_position_in_grid_2d().x
+    j = thread_position_in_threadgroup_2d().y
+    i = thread_position_in_threadgroup_2d().x

     @inbounds begin
         for k in 1:Nq
@@ -233,37 +235,37 @@ function main()
     Nq = N + 1
     nvar = _nstate + nmoist + ntrace

-    Q = 1 .+ CuArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem))
+    Q = 1 .+ MtlArray(rand(rng, DFloat, Nq, Nq, Nq, nvar, nelem))
     Q[:, :, :, _E, :] .+= 20
-    vgeo = CuArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem))
+    vgeo = MtlArray(rand(rng, DFloat, Nq, Nq, Nq, _nvgeo, nelem))

     # make sure the entries of the mass matrix satisfy the inverse relation
     vgeo[:, :, :, _MJ, :] .+= 3
     vgeo[:, :, :, _MJI, :] .= 1 ./ vgeo[:, :, :, _MJ, :]

-    D = CuArray(rand(rng, DFloat, Nq, Nq))
+    D = MtlArray(rand(rng, DFloat, Nq, Nq))

-    rhs = CuArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem))
+    rhs = MtlArray(zeros(DFloat, Nq, Nq, Nq, nvar, nelem))

     threads=(N+1, N+1)
-    kernel = @cuda launch=false volumerhs!(rhs, Q, vgeo, DFloat(grav), D, nelem)
+    kernel = @metal launch=false volumerhs!(rhs, Q, vgeo, DFloat(grav), D, nelem)

     # XXX: should we print these for all kernels? maybe upload them to Codespeed?
-    @info """volumerhs! details:
-             - $(CUDA.registers(kernel)) registers, max $(CUDA.maxthreads(kernel)) threads
-             - $(Base.format_bytes(CUDA.memory(kernel).local)) local memory,
-               $(Base.format_bytes(CUDA.memory(kernel).shared)) shared memory,
-               $(Base.format_bytes(CUDA.memory(kernel).constant)) constant memory"""
+    # @info """volumerhs! details:
+    #          - $(Metal.registers(kernel)) registers, max $(Metal.maxthreads(kernel)) threads
+    #          - $(Base.format_bytes(Metal.memory(kernel).local)) local memory,
+    #            $(Base.format_bytes(Metal.memory(kernel).shared)) shared memory,
+    #            $(Base.format_bytes(Metal.memory(kernel).constant)) constant memory"""

     results = @benchmark begin
-        CUDA.@sync blocking=true $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
-                                         threads=$threads, blocks=$nelem)
+        Metal.@sync $kernel($rhs, $Q, $vgeo, $(DFloat(grav)), $D, $nelem;
+                            threads=$threads, groups=$nelem)
     end

     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!(rhs)
-    CUDA.unsafe_free!(Q)
-    CUDA.unsafe_free!(vgeo)
-    CUDA.unsafe_free!(D)
+    Metal.unsafe_free!(rhs)
+    Metal.unsafe_free!(Q)
+    Metal.unsafe_free!(vgeo)
+    Metal.unsafe_free!(D)

     results
 end
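
For readers who want to try the ported launch idiom outside the benchmark harness, below is a minimal, self-contained sketch of the CUDA-to-Metal translation applied throughout this diff: thread_position_in_grid_1d() replaces the (blockIdx().x-1)*blockDim().x + threadIdx().x computation, @metal threads=... groups=... replaces @cuda threads=... blocks=..., mtl replaces cu, and Metal.@sync replaces CUDA.@sync blocking=true. The kernel and variable names are illustrative only and do not appear in the diff.

# Minimal sketch (assumes Metal.jl on an Apple silicon machine); names are hypothetical.
using Metal

function vadd_kernel(n, x, y, out)
    i = thread_position_in_grid_1d()       # 1-based global thread index
    if i <= n
        @inbounds out[i] = x[i] + y[i]
    end
    return
end

function vadd!(out, x, y)
    n = length(out)
    threads = 256                          # threads per threadgroup
    groups = cld(n, threads)               # `groups` is the Metal analogue of CUDA's `blocks`
    @metal threads=threads groups=groups vadd_kernel(n, x, y, out)
    return out
end

x   = mtl(rand(Float32, 4096))             # `mtl` converts a host array to an MtlArray (like `cu`)
y   = mtl(rand(Float32, 4096))
out = similar(x)
Metal.@sync vadd!(out, x, y)               # synchronize before timing or reading results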