diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 8c338bbe67..02c7314661 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -48,6 +48,26 @@ steps:
   # NOTE: we make sure to run all tests on CUDA versions that have CUDNN & CUTENSOR
   # by setting the CI_THOROUGH environment variable, and using a recent GPU.
 
+  - label: "CUDA 11.2"
+    plugins:
+      - JuliaCI/julia#v0.6:
+          version: 1.5
+      - JuliaCI/julia-test#v0.3: ~
+      - JuliaCI/julia-coverage#v0.3:
+          codecov: true
+          dirs:
+            - src
+            - lib
+            - examples
+    agents:
+      queue: "juliagpu"
+      cuda: "11.2"
+    env:
+      JULIA_CUDA_VERSION: '11.2'
+      JULIA_CUDA_USE_BINARYBUILDER: 'true'
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
   - label: "CUDA 11.1"
     plugins:
       - JuliaCI/julia#v0.4:
diff --git a/Artifacts.toml b/Artifacts.toml
index ec86f5cd27..0a1c477aff 100644
--- a/Artifacts.toml
+++ b/Artifacts.toml
@@ -138,6 +138,38 @@ os = "windows"
+[[CUDA112]]
+arch = "powerpc64le"
+git-tree-sha1 = "205075090ca78a68358e4613a444e56ddf5333d3"
+lazy = true
+libc = "glibc"
+os = "linux"
+
+    [[CUDA112.download]]
+    sha256 = "b146213b1b3ebf8c32f09f9cd8f843461991ece3089ae56c1d7669e6ccdd7711"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.2.0+0/CUDA.v11.2.0.powerpc64le-linux-gnu.tar.gz"
+
+[[CUDA112]]
+arch = "x86_64"
+git-tree-sha1 = "e99dab5d7bdf5b60da265bae5e949189d907a56b"
+lazy = true
+libc = "glibc"
+os = "linux"
+
+    [[CUDA112.download]]
+    sha256 = "e2e2c31544411a4e85db23f603c367a9386c44ab0ba49fd86a1af2668fe3ce82"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.2.0+0/CUDA.v11.2.0.x86_64-linux-gnu.tar.gz"
+
+[[CUDA112]]
+arch = "x86_64"
+git-tree-sha1 = "1aecead5cc57a9388d3b46929549edb0ef99912f"
+lazy = true
+os = "windows"
+
+    [[CUDA112.download]]
+    sha256 = "cbfcbc9428b761b99856ab1aab089c5f34b87abee9322ac269052bbd877969e0"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.2.0+0/CUDA.v11.2.0.x86_64-w64-mingw32.tar.gz"
+
 # CUDNN
diff --git a/deps/bindeps.jl b/deps/bindeps.jl
index afdf749fb5..e4d72c4a13 100644
--- a/deps/bindeps.jl
+++ b/deps/bindeps.jl
@@ -118,6 +118,7 @@ lazy_artifact(x) = @artifact_str(x)
 # NOTE: we don't use autogenerated JLLs, because we have multiple artifacts and need to
 #       decide at run time (i.e. not via package dependencies) which one to use.
 const cuda_artifacts = Dict(
+    (release=v"11.2", version=v"11.2.0", preferred=false) => ()->lazy_artifact("CUDA112"),
     (release=v"11.1", version=v"11.1.1", preferred=true) => ()->lazy_artifact("CUDA111"),
     (release=v"11.0", version=v"11.0.3", preferred=true) => ()->lazy_artifact("CUDA110"),
     (release=v"10.2", version=v"10.2.89", preferred=true) => ()->lazy_artifact("CUDA102"),
diff --git a/deps/compatibility.jl b/deps/compatibility.jl
index 3ef3044ca1..7680cb10e7 100644
--- a/deps/compatibility.jl
+++ b/deps/compatibility.jl
@@ -94,6 +94,7 @@ const cuda_ptx_db = Dict(
     v"6.5" => v"10.2":highest,
     v"7.0" => v"11.0":highest,
     v"7.1" => v"11.1":highest,
+    v"7.2" => v"11.2":highest,
 )
 
 function cuda_ptx_support(ver::VersionNumber)
diff --git a/lib/cublas/wrappers.jl b/lib/cublas/wrappers.jl
index aa6d33630c..f2a5c6c004 100644
--- a/lib/cublas/wrappers.jl
+++ b/lib/cublas/wrappers.jl
@@ -763,6 +763,7 @@ function gemmExComputeType(TA, TB, TC, m, k, n)
     end
 
     if m%4 == 0 && n%4 == 0 && k%4 == 0 && sig === (Int8, Int32)
+        CUDA.version() >= v"11.2" && return nothing # NVIDIA bug #3221266
         # Int32=Int8*Int8 requires m,n,k to be multiples of 4
         # https://forums.developer.nvidia.com/t/cublasgemmex-cant-use-cuda-r-8i-compute-type-on-gtx1080/58100/2
         return math_mode==CUDA.PEDANTIC_MATH ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I
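
As a quick local sanity check of the new artifact (a minimal sketch, not part of the diff; it assumes this branch of CUDA.jl is installed and a driver new enough for CUDA 11.2 is available), the toolkit can be pinned through the same JULIA_CUDA_VERSION variable the Buildkite step sets, and the resolved version inspected at run time:

    # Must be set before `using CUDA` in a fresh Julia session, since the
    # toolkit artifact is resolved during initialization.
    ENV["JULIA_CUDA_VERSION"] = "11.2"
    ENV["JULIA_CUDA_USE_BINARYBUILDER"] = "true"

    using CUDA

    CUDA.version()     # expected to report v"11.2.0" when the CUDA112 artifact is picked up
    CUDA.functional()  # true if the toolkit and driver initialized correctly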