diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 4aa8537f1c..9dedfd20d2 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -61,6 +61,7 @@ steps:
       matrix:
         setup:
           cuda:
+            - "12.5"
            - "12.4"
            - "12.3"
            - "12.2"
diff --git a/Project.toml b/Project.toml
index 758266782c..ab7c59c0e1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -50,9 +50,9 @@ AbstractFFTs = "0.4, 0.5, 1.0"
 Adapt = "4"
 BFloat16s = "0.2, 0.3, 0.4, 0.5"
 CEnum = "0.2, 0.3, 0.4, 0.5"
-CUDA_Driver_jll = "0.8"
+CUDA_Driver_jll = "0.9"
 CUDA_Runtime_Discovery = "0.3.3"
-CUDA_Runtime_jll = "0.12, 0.13"
+CUDA_Runtime_jll = "0.14"
 ChainRulesCore = "1"
 Crayons = "4"
 DataFrames = "1"
diff --git a/README.md b/README.md
index 28399c588c..ed9743e3d4 100644
--- a/README.md
+++ b/README.md
@@ -30,31 +30,6 @@ using Julia. It features a user-friendly array abstraction, a compiler for writi
 kernels in Julia, and wrappers for various CUDA libraries.
 
-## Requirements
-
-The latest development version of CUDA.jl requires **Julia 1.8** or higher. If you are using
-an older version of Julia, you need to use a previous version of CUDA.jl. This will happen
-automatically when you install the package using Julia's package manager.
-
-Note that CUDA.jl may not work with a custom build of Julia; it is recommended that you
-install Julia using the [official binaries](https://julialang.org/downloads/) or
-[juliaup](https://github.com/JuliaLang/juliaup).
-
-CUDA.jl also requires a CUDA-capable GPU with **compute capability 3.5** (Kepler) or
-higher, and an accompanying **NVIDIA driver for CUDA 11.0** or newer. A compatible CUDA
-toolkit will be downloaded automatically, but in case you want to use your own, CUDA.jl only
-supports **CUDA toolkit 11.4+** or newer. These requirements are not enforced by the Julia
-package manager when installing CUDA.jl. Depending on your system and GPU, you may need to
-install an older version of the package:
-
-* CUDA.jl v4.4 is the last version with support for CUDA 11.0-11.3 (deprecated in v5.0)
-* CUDA.jl v4.0 is the last version to work with CUDA 10.2 (removed in v4.1)
-* CUDA.jl v3.13 is the last version to work with CUDA 10.1 (removed in v4.0)
-* CUDA.jl v1.3 is the last version to work with CUDA 9-10.0 (removed in v2.0)
-
-Finally, you should be using a platform **supported by NVIDIA**. Currently, that means using
-64-bit Linux or Windows, with an X86, ARM, or PowerPC host processor.
-
 
 ## Quick start
 
 Before all, make sure you have a recent NVIDIA driver. On Windows, also make sure you have
@@ -91,6 +66,35 @@ For more usage instructions and other information, please refer to
 [the documentation](https://juliagpu.github.io/CUDA.jl/stable/).
 
 
+## Requirements
+
+The latest development version of CUDA.jl requires **Julia 1.8** or higher. If you are using
+an older version of Julia, you need to use a previous version of CUDA.jl. This will happen
+automatically when you install the package using Julia's package manager.
+
+Note that CUDA.jl may not work with a custom build of Julia; it is recommended that you
+install Julia using the [official binaries](https://julialang.org/downloads/) or
+[juliaup](https://github.com/JuliaLang/juliaup).
+
+The latest version of CUDA.jl also has certain requirements that cannot be enforced by the
+package manager:
+
+- Host platform: only 64-bit Linux and Windows are supported;
+- Device hardware: only NVIDIA GPUs with **compute capability 3.5** (Kepler) or higher are
+  supported;
+- NVIDIA driver: a driver for **CUDA 11.0** or newer is required;
+- CUDA toolkit (in case you need to use your own): only **CUDA toolkit 11.4** or newer is
+  supported.
+
+If you cannot meet these requirements, you may need to install an older version of CUDA.jl:
+
+* CUDA.jl v5.3 is the last version with support for PowerPC (removed in v5.4)
+* CUDA.jl v4.4 is the last version with support for CUDA 11.0-11.3 (deprecated in v5.0)
+* CUDA.jl v4.0 is the last version to work with CUDA 10.2 (removed in v4.1)
+* CUDA.jl v3.13 is the last version to work with CUDA 10.1 (removed in v4.0)
+* CUDA.jl v1.3 is the last version to work with CUDA 9-10.0 (removed in v2.0)
+
+
 ## Supporting and Citing
 
 Much of the software in this ecosystem was developed as part of academic research. If you
@@ -105,7 +109,7 @@ root of this repository lists the relevant papers.
 
 The package is tested against, and being developed for, Julia 1.8 and above. Main
 development and testing happens on x86 Linux, but the package is expected to work on
-Windows, and on ARM and PowerPC as well.
+Windows and ARM as well.
 
 
 ## Questions and Contributions
diff --git a/lib/cudadrv/libcuda.jl b/lib/cudadrv/libcuda.jl
index e719fccf7e..294b4b1201 100644
--- a/lib/cudadrv/libcuda.jl
+++ b/lib/cudadrv/libcuda.jl
@@ -868,7 +868,7 @@ function Base.getproperty(x::Ptr{CUstreamBatchMemOpParams_union}, f::Symbol)
     f === :waitValue && return Ptr{CUstreamMemOpWaitValueParams_st}(x + 0)
     f === :writeValue && return Ptr{CUstreamMemOpWriteValueParams_st}(x + 0)
     f === :flushRemoteWrites && return Ptr{CUstreamMemOpFlushRemoteWritesParams_st}(x + 0)
-    f === :memoryBarrier && return Ptr{CUstreampMemoryBarrierParams_st}(x + 0)
+    f === :memoryBarrier && return Ptr{CUstreamMemOpMemoryBarrierParams_st}(x + 0)
     f === :pad && return Ptr{NTuple{6,cuuint64_t}}(x + 0)
     return getfield(x, f)
 end
@@ -6048,7 +6048,7 @@ struct CUstreamMemOpFlushRemoteWritesParams_st
     flags::Cuint
 end
 
-struct CUstreampMemoryBarrierParams_st
+struct CUstreamMemOpMemoryBarrierParams_st
     operation::CUstreamBatchMemOpType
     flags::Cuint
 end
diff --git a/lib/cupti/wrappers.jl b/lib/cupti/wrappers.jl
index 07252210a1..ed823303c7 100644
--- a/lib/cupti/wrappers.jl
+++ b/lib/cupti/wrappers.jl
@@ -225,7 +225,7 @@ function process(f, cfg::ActivityConfig)
     ## kernel activities
     activity_types[CUPTI_ACTIVITY_KIND_KERNEL] =
         if cuda_version >= v"12.0"
-            CUpti_ActivityKernel5
+            CUpti_ActivityKernel9
         elseif cuda_version >= v"11.8"
             CUpti_ActivityKernel8
         elseif cuda_version >= v"11.6"
diff --git a/res/wrap/Manifest.toml b/res/wrap/Manifest.toml
index 45a01383c4..7b350e1bf6 100644
--- a/res/wrap/Manifest.toml
+++ b/res/wrap/Manifest.toml
@@ -17,9 +17,9 @@ version = "0.5.0"
 
 [[CSTParser]]
 deps = ["Tokenize"]
-git-tree-sha1 = "b544d62417a99d091c569b95109bc9d8c223e9e3"
+git-tree-sha1 = "0157e592151e39fa570645e2b2debcdfb8a0f112"
 uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f"
-version = "3.4.2"
+version = "3.4.3"
 
 [[CUDA_Driver_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
@@ -35,9 +35,9 @@ version = "0.12.1+0"
 
 [[CUDA_SDK_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "752d5810571b8639702ada3b098cf550babb1967"
+git-tree-sha1 = "6be192823cc703b93728c13b7183d1b4446be0d1"
 uuid = "6cbf2f2e-7e60-5632-ac76-dca2274e0be0"
-version = "12.4.1+0"
+version = "12.4.1+2"
 
 [[CUDNN_jll]]
 deps = ["Artifacts", "CUDA_Runtime_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
@@ -53,9 +53,9 @@ version = "2.0.1+0"
 
 [[Clang]]
 deps = ["CEnum", "Clang_jll", "Downloads", "Pkg", "TOML"]
-git-tree-sha1 = "be935fd478265159ffdb1a949489a5f91319fb95"
+git-tree-sha1 = "2397d5da17ba4970f772a9888b208a0a1d77eb5d"
 uuid = "40e3b903-d033-50b4-a0cc-940c62c95e31"
-version = "0.18.1"
+version = "0.18.3"
 
 [[Clang_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "TOML", "Zlib_jll", "libLLVM_jll"]
@@ -85,7 +85,7 @@ version = "4.14.0"
 [[CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "1.1.0+0"
+version = "1.1.1+0"
 
 [[Crayons]]
 git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
@@ -276,9 +276,9 @@ uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
 version = "1.10.0"
 
 [[Tokenize]]
-git-tree-sha1 = "5b5a892ba7704c0977013bd0f9c30f5d962181e0"
+git-tree-sha1 = "468b4685af4abe0e9fd4d7bf495a6554a6276e75"
 uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624"
-version = "0.5.28"
+version = "0.5.29"
 
 [[URIs]]
 git-tree-sha1 = "67db6cc7b3821e19ebe75791a9dd19c9b1188f2b"
diff --git a/src/compatibility.jl b/src/compatibility.jl
index 8d651e375b..9201a15f42 100644
--- a/src/compatibility.jl
+++ b/src/compatibility.jl
@@ -112,6 +112,7 @@ const cuda_ptx_db = Dict(
     v"8.2" => between(v"12.2", highest),
     v"8.3" => between(v"12.3", highest),
     v"8.4" => between(v"12.4", highest),
+    v"8.5" => between(v"12.5", highest),
 )
 
 function cuda_ptx_support(ver::VersionNumber)
@@ -182,15 +183,15 @@ const llvm_cap_db = Dict(
     v"6.0" => between(v"3.9", highest),
     v"6.1" => between(v"3.9", highest),
     v"6.2" => between(v"3.9", highest),
-    v"7.0" => between(v"6.0", highest),
-    v"7.2" => between(v"7.0", highest),
-    v"7.5" => between(v"8.0", highest),
-    v"8.0" => between(v"11.0", highest),
-    v"8.6" => between(v"13.0", highest),
-    v"8.7" => between(v"16.0", highest),
-    v"8.9" => between(v"16.0", highest),
-    v"9.0" => between(v"16.0", highest),
-    #v"9.0a" => between(v"17.0", highest),
+    v"7.0" => between(v"6", highest),
+    v"7.2" => between(v"7", highest),
+    v"7.5" => between(v"8", highest),
+    v"8.0" => between(v"11", highest),
+    v"8.6" => between(v"13", highest),
+    v"8.7" => between(v"16", highest),
+    v"8.9" => between(v"16", highest),
+    v"9.0" => between(v"16", highest),
+    #v"9.0a" => between(v"18", highest),
 )
 
 function llvm_cap_support(ver::VersionNumber)
@@ -216,20 +217,25 @@ const llvm_ptx_db = Dict(
     v"4.2" => between(v"3.7", highest),
     v"4.3" => between(v"3.9", highest),
     v"5.0" => between(v"3.9", highest),
-    v"6.0" => between(v"6.0", highest),
-    v"6.1" => between(v"7.0", highest),
-    v"6.3" => between(v"8.0", highest),
-    v"6.4" => between(v"9.0", highest),
-    v"6.5" => between(v"11.0", highest),
-    v"7.0" => between(v"11.0", highest),
-    v"7.1" => between(v"13.0", highest),
-    v"7.2" => between(v"13.0", highest),
-    v"7.3" => between(v"14.0", highest),
-    v"7.4" => between(v"14.0", highest),
-    v"7.5" => between(v"14.0", highest),
-    v"7.6" => between(v"16.0", highest),
-    v"7.7" => between(v"16.0", highest),
-    v"7.8" => between(v"16.0", highest),
+    v"6.0" => between(v"6", highest),
+    v"6.1" => between(v"7", highest),
+    v"6.3" => between(v"8", highest),
+    v"6.4" => between(v"9", highest),
+    v"6.5" => between(v"11", highest),
+    v"7.0" => between(v"11", highest),
+    v"7.1" => between(v"13", highest),
+    v"7.2" => between(v"13", highest),
+    v"7.3" => between(v"14", highest),
+    v"7.4" => between(v"14", highest),
+    v"7.5" => between(v"14", highest),
+    v"7.6" => between(v"16", highest),
+    v"7.7" => between(v"16", highest),
+    v"7.8" => between(v"16", highest),
+    v"8.0" => between(v"17", highest),
+    v"8.1" => between(v"17", highest),
+    v"8.2" => between(v"18", highest),
+    v"8.3" => between(v"18", highest),
+    v"8.4" => between(v"19", highest),
 )
 
 function llvm_ptx_support(ver::VersionNumber)
diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl
index aaa9e12cc4..2bfa6dbc3c 100644
--- a/src/compiler/reflection.jl
+++ b/src/compiler/reflection.jl
@@ -52,8 +52,9 @@ function code_sass(io::IO, job::CompilerJob; raw::Bool=false)
     end
 
     # NVIDIA bug #4604961: CUPTI in CUDA 12.4 Update 1 does not capture profiled events
-    # unless the activity API is first activated
-    if runtime_version() == v"12.4"
+    # unless the activity API is first activated. This is fixed in
+    # 12.5 Update 1, but we can only check the minor runtime version.
+    if v"12.4" <= runtime_version() <= v"12.5"
         cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
                                     CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
         CUPTI.enable!(cfg) do
@@ -89,8 +90,9 @@ function code_sass(f::Base.Callable, io::IO=stdout; raw::Bool=false)
     end
 
     # NVIDIA bug #4604961: CUPTI in CUDA 12.4 Update 1 does not capture profiled events
-    # unless the activity API is first activated
-    if runtime_version() == v"12.4"
+    # unless the activity API is first activated. This is fixed in
+    # 12.5 Update 1, but we can only check the minor runtime version.
+    if v"12.4" <= runtime_version() <= v"12.5"
         cfg = CUPTI.ActivityConfig([CUPTI.CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL,
                                     CUPTI.CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API])
         CUPTI.enable!(cfg) do
diff --git a/src/initialization.jl b/src/initialization.jl
index 2953542d60..fa61a88597 100644
--- a/src/initialization.jl
+++ b/src/initialization.jl
@@ -66,9 +66,9 @@ function __init__()
         return
     end
 
-    if driver < v"11"
-        @error "This version of CUDA.jl only supports NVIDIA drivers for CUDA 11.x or higher (yours is for CUDA $driver)"
-        _initialization_error[] = "CUDA driver too old"
+    if !(v"11" <= driver < v"13-")
+        @error "This version of CUDA.jl only supports NVIDIA drivers for CUDA 11.x or 12.x (yours is for CUDA $driver)"
+        _initialization_error[] = "CUDA driver unsupported"
         return
     end
 
@@ -79,19 +79,45 @@ function __init__()
 
     # check that we have a runtime
     if !CUDA_Runtime.is_available()
+        # try to find out why
+        reason = if CUDA_Runtime != CUDA_Runtime_jll
+            """You requested use of a local CUDA toolkit, but not all
+               required components were discovered.
+
+               Try running with `JULIA_DEBUG=CUDA_Runtime_Discovery` in
+               your environment and re-loading CUDA.jl for more details."""
+        elseif !Sys.iswindows() && !Sys.islinux() && !in(Sys.ARCH, [:x86_64, :aarch64])
+            """You are using an unsupported platform: this version of CUDA.jl
+               only supports Linux (x86_64, aarch64) and Windows (x86_64).
+
+               Consider downgrading CUDA.jl (refer to the README for a list of
+               supported platforms) or manually installing the CUDA toolkit and make
+               CUDA.jl use it by calling `CUDA.set_runtime_version!(local_toolkit=true)`."""
+        elseif CUDA_Runtime_jll.host_platform["cuda"] == "none"
+            """CUDA.jl's JLLs were precompiled without an NVIDIA driver present.
+               This can happen when installing CUDA.jl on an HPC log-in node,
+               or in a container. In that case, you need to specify which CUDA
+               version to use at run time by calling `CUDA.set_runtime_version!`
+               or provisioning the preference it sets at compile time.
+
+               If you are not running in a container or on an HPC log-in node,
+               try re-compiling the CUDA runtime JLL and re-loading CUDA.jl:
+                 pkg = Base.PkgId(Base.UUID("76a88914-d11a-5bdc-97e0-2f5a05c973a2"),
+                                  "CUDA_Runtime_jll")
+                 Base.compilecache(pkg)
+                 # re-start Julia and re-load CUDA.jl"""
+        else
+            """Could not diagnose why the CUDA runtime is not available.
+
+               If the issue persists, please file a support ticket with the following details:
+               - host platform: $(Base.BinaryPlatforms.triplet(CUDA_Runtime_jll.host_platform))
+               - libcuda: $libcuda (loaded through JLL: $(CUDA_Driver_jll.is_available()))
+               - driver version: $driver
+               """
+        end
         @error """CUDA.jl could not find an appropriate CUDA runtime to use.
 
-               This can have several reasons:
-               * you are using an unsupported platform: this version of CUDA.jl
-                 only supports Linux (x86_64, aarch64, ppc64le) and Windows (x86_64),
-                 while your platform was identified as $(Base.BinaryPlatforms.triplet(CUDA_Runtime_jll.host_platform));
-               * you precompiled CUDA.jl in an environment where the CUDA driver
-                 was not available (i.e., a container, or an HPC login node).
-                 in that case, you need to specify which CUDA version to use
-                 by calling `CUDA.set_runtime_version!`;
-               * you requested use of a local CUDA toolkit, but not all
-                 required components were discovered. try running with
-                 JULIA_DEBUG=all in your environment for more details.
+               $reason
 
                For more details, refer to the CUDA.jl documentation at
                https://cuda.juliagpu.org/stable/installation/overview/"""
@@ -148,6 +174,13 @@ function __init__()
         return
     end
 
+    # warn if we're not using an official build of Julia
+    official_release = startswith(Base.TAGGED_RELEASE_BANNER, "Official")
+    if !official_release
+        @warn """You are using a non-official build of Julia. This may cause issues with CUDA.jl.
+                 Please consider using an official build from https://julialang.org/downloads/."""
+    end
+
     @static if !isdefined(Base, :get_extension)
         @require ChainRulesCore="d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" begin
             include("../ext/ChainRulesCoreExt.jl")
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 9ca772d0f8..d8f409781e 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -193,7 +193,7 @@ end
     valid_kernel() = return
     invalid_kernel() = 1
 
-    if can_use_cupti()
+    if can_use_cupti() && CUDA.runtime_version() != v"12.5"
         @test CUDA.code_sass(devnull, valid_kernel, Tuple{}) == nothing
         @test_throws CUDA.KernelError CUDA.code_sass(devnull, invalid_kernel, Tuple{})
     end
@@ -204,7 +204,7 @@ end
     @eval kernel_341(ptr) = (@inbounds unsafe_store!(ptr, $(Symbol("dummy_^"))(unsafe_load(ptr)));
                              nothing)
 
-    if can_use_cupti()
+    if can_use_cupti() && CUDA.runtime_version() != v"12.5"
         CUDA.code_sass(devnull, kernel_341, Tuple{Ptr{Int}})
     end
 end
@@ -212,7 +212,7 @@ end
 @testset "device runtime" begin
     kernel() = (CUDA.cudaGetLastError(); return)
 
-    if can_use_cupti()
+    if can_use_cupti() && CUDA.runtime_version() != v"12.5"
         CUDA.code_sass(devnull, kernel, Tuple{})
     end
 end
diff --git a/test/core/execution.jl b/test/core/execution.jl
index 6d78851e27..e8ad6ef960 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -77,7 +77,7 @@ end
     CUDA.code_warntype(devnull, dummy, Tuple{})
     CUDA.code_llvm(devnull, dummy, Tuple{})
     CUDA.code_ptx(devnull, dummy, Tuple{})
-    if can_use_cupti()
+    if can_use_cupti() && CUDA.runtime_version() != v"12.5" # NVIDIA bug #4667039
         # functions defined in Julia
         sass = sprint(io->CUDA.code_sass(io, dummy, Tuple{}))
         @test occursin(".text._Z5dummy", sass)
@@ -96,7 +96,7 @@ end
     @device_code_warntype io=devnull @cuda dummy()
     @device_code_llvm io=devnull @cuda dummy()
     @device_code_ptx io=devnull @cuda dummy()
-    if can_use_cupti()
+    if can_use_cupti() && CUDA.runtime_version() != v"12.5" # NVIDIA bug #4667039
         # functions defined in Julia
         sass = sprint(io->@device_code_sass io=io @cuda dummy())
         @test occursin(".text._Z5dummy", sass)
@@ -120,7 +120,7 @@ end
     @test occursin("dummy", sprint(io->(@device_code_llvm io=io optimize=false @cuda dummy())))
     @test occursin("dummy", sprint(io->(@device_code_llvm io=io @cuda dummy())))
     @test occursin("dummy", sprint(io->(@device_code_ptx io=io @cuda dummy())))
-    if can_use_cupti()
+    if can_use_cupti() && CUDA.runtime_version() != v"12.5" # NVIDIA bug #4667039
        @test occursin("dummy", sprint(io->(@device_code_sass io=io @cuda dummy())))
    end
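Note (editorial, not part of the patch): the reworked driver check in `src/initialization.jl` relies on Julia's prerelease lower-bound notation, where `v"13-"` sorts below `v"13.0"` and every 13.x release while staying above all 12.x versions. A minimal sketch of that comparison, using a hypothetical `is_supported_driver` helper introduced here only for illustration:

```julia
# `v"13-"` is a VersionNumber with an empty prerelease, so it compares below
# v"13.0" and all 13.x releases; `driver < v"13-"` therefore acts as an
# exclusive "anything before CUDA 13" upper bound.
is_supported_driver(driver::VersionNumber) = v"11" <= driver < v"13-"

is_supported_driver(v"11.8")  # true:  CUDA 11.x drivers are accepted
is_supported_driver(v"12.5")  # true:  CUDA 12.x drivers are accepted
is_supported_driver(v"13.0")  # false: a future CUDA 13 driver is rejected
is_supported_driver(v"10.2")  # false: drivers older than CUDA 11.0 are rejected
```

The patch negates this condition (`!(v"11" <= driver < v"13-")`) to decide when to set the "CUDA driver unsupported" initialization error.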