From 76109ce40b3694a8b61ae39fa44d9e87468abc47 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 11 May 2023 04:35:52 -0700
Subject: [PATCH 01/37] move functions, use config option to enable MVC, and
 do so before importing the driver

---
 python/cudf/cudf/__init__.py           |   7 +-
 python/cudf/cudf/core/udf/utils.py     | 146 +------------------------
 python/cudf/cudf/utils/_numba_setup.py | 131 ++++++++++++++++++++++
 3 files changed, 136 insertions(+), 148 deletions(-)
 create mode 100644 python/cudf/cudf/utils/_numba_setup.py

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 06310e278a2..dfcd9f11236 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -5,7 +5,7 @@
 validate_setup()
 
 import cupy
-from numba import config as numba_config, cuda
+from numba import config as numba_config
 
 import rmm
 from rmm.allocators.cupy import rmm_cupy_allocator
@@ -92,11 +92,12 @@
     # cuDF requires a stronger set of conditions than what is
     # checked by patch_numba_linker_if_needed due to the PTX
     # files needed for JIT Groupby Apply and string UDFs
-    from cudf.core.udf.utils import _PTX_FILE, _setup_numba_linker
+    from cudf.utils._numba_setup import ANY_PTX_FILE, _setup_numba_linker
 
-    _setup_numba_linker(_PTX_FILE)
+    _setup_numba_linker(ANY_PTX_FILE)
 
     del patch_numba_linker_if_needed
+from numba import cuda
 
 cuda.set_memory_manager(RMMNumbaManager)
 cupy.cuda.set_allocator(rmm_cupy_allocator)
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index d890b94127f..a387880c55d 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -1,6 +1,5 @@
 # Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
-import glob
 import os
 from typing import Any, Callable, Dict
 
@@ -13,7 +12,6 @@
 from numba.core.datamodel import default_manager, models
 from numba.core.errors import TypingError
 from numba.core.extending import register_model
-from numba.cuda.cudadrv.driver import Linker
 from numba.np import numpy_support
 from numba.types import CPointer, Poison, Record, Tuple, boolean, int64, void
 
@@ -33,6 +31,7 @@
     udf_string,
 )
 from cudf.utils import cudautils
+from cudf.utils._numba_setup import _get_ptx_file
 from cudf.utils.dtypes import (
     BOOL_TYPES,
     DATETIME_TYPES,
@@ -63,58 +62,6 @@
 precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32)
 launch_arg_getters: Dict[Any, Any] = {}
 
-
-def _get_best_ptx_file(archs, max_compute_capability):
-    """
-    Determine of the available PTX files which one is
-    the most recent up to and including the device cc
-    """
-    filtered_archs = [x for x in archs if x[0] <= max_compute_capability]
-    if filtered_archs:
-        return max(filtered_archs, key=lambda y: y[0])
-    else:
-        return None
-
-
-def _get_ptx_file(path, prefix):
-    if "RAPIDS_NO_INITIALIZE" in os.environ:
-        # cc=60 ptx is always built
-        cc = int(os.environ.get("STRINGS_UDF_CC", "60"))
-    else:
-        dev = cuda.get_current_device()
-
-        # Load the highest compute capability file available that is less than
-        # the current device's.
-        cc = int("".join(str(x) for x in dev.compute_capability))
-    files = glob.glob(os.path.join(path, f"{prefix}*.ptx"))
-    if len(files) == 0:
-        raise RuntimeError(f"Missing PTX files for cc={cc}")
-    regular_sms = []
-
-    for f in files:
-        file_name = os.path.basename(f)
-        sm_number = file_name.rstrip(".ptx").lstrip(prefix)
-        if sm_number.endswith("a"):
-            processed_sm_number = int(sm_number.rstrip("a"))
-            if processed_sm_number == cc:
-                return f
-        else:
-            regular_sms.append((int(sm_number), f))
-
-    regular_result = None
-
-    if regular_sms:
-        regular_result = _get_best_ptx_file(regular_sms, cc)
-
-    if regular_result is None:
-        raise RuntimeError(
-            "This cuDF installation is missing the necessary PTX "
-            f"files that are <={cc}."
-        )
-    else:
-        return regular_result[1]
-
-
 _PTX_FILE = _get_ptx_file(os.path.dirname(__file__), "shim_")
 
 
@@ -390,97 +337,6 @@ def _get_extensionty_size(ty):
     return llty.get_abi_size(target_data)
 
 
-def _get_cuda_version_from_ptx_file(path):
-    """
-    https://docs.nvidia.com/cuda/parallel-thread-execution/
-    Each PTX module must begin with a .version
-    directive specifying the PTX language version
-
-    example header:
-    //
-    // Generated by NVIDIA NVVM Compiler
-    //
-    // Compiler Build ID: CL-31057947
-    // Cuda compilation tools, release 11.6, V11.6.124
-    // Based on NVVM 7.0.1
-    //
-
-    .version 7.6
-    .target sm_52
-    .address_size 64
-
-    """
-    with open(path) as ptx_file:
-        for line in ptx_file:
-            if line.startswith(".version"):
-                ver_line = line
-                break
-        else:
-            raise ValueError("Could not read CUDA version from ptx file.")
-    version = ver_line.strip("\n").split(" ")[1]
-    # from ptx_docs/release_notes above:
-    ver_map = {
-        "7.5": (11, 5),
-        "7.6": (11, 6),
-        "7.7": (11, 7),
-        "7.8": (11, 8),
-        "8.0": (12, 0),
-    }
-
-    cuda_ver = ver_map.get(version)
-    if cuda_ver is None:
-        raise ValueError(
-            f"Could not map PTX version {version} to a CUDA version"
-        )
-
-    return cuda_ver
-
-
-def _setup_numba_linker(path):
-    from ptxcompiler.patch import NO_DRIVER, safe_get_versions
-
-    from cudf.core.udf.utils import (
-        _get_cuda_version_from_ptx_file,
-        maybe_patch_numba_linker,
-    )
-
-    versions = safe_get_versions()
-    if versions != NO_DRIVER:
-        driver_version, runtime_version = versions
-        ptx_toolkit_version = _get_cuda_version_from_ptx_file(path)
-        maybe_patch_numba_linker(
-            driver_version, runtime_version, ptx_toolkit_version
-        )
-
-
-def maybe_patch_numba_linker(
-    driver_version, runtime_version, ptx_toolkit_version
-):
-    from cubinlinker.patch import (
-        _numba_version_ok,
-        get_logger,
-        new_patched_linker,
-    )
-
-    # Numba thinks cubinlinker is only needed if the driver is older than
-    # the ctk, but when PTX files are present, it might also need to patch
-    # because those PTX files may newer than the driver as well
-    logger = get_logger()
-
-    if (driver_version < ptx_toolkit_version) or (
-        driver_version < runtime_version
-    ):
-        logger.debug(
-            "Driver version %s.%s needs patching due to PTX files"
-            % driver_version
-        )
-        if _numba_version_ok:
-            logger.debug("Patching Numba Linker")
-            Linker.new = new_patched_linker
-        else:
-            logger.debug("Cannot patch Numba Linker - unsupported version")
-
-
 @initfunc
 def set_malloc_heap_size(size=None):
     """
diff --git a/python/cudf/cudf/utils/_numba_setup.py b/python/cudf/cudf/utils/_numba_setup.py
new file mode 100644
index 00000000000..4a67055d60a
--- /dev/null
+++ b/python/cudf/cudf/utils/_numba_setup.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+import glob
+import os
+
+from numba import config
+
+ANY_PTX_FILE = os.path.dirname(__file__) + "/../core/udf/shim_60.ptx"
+
+
+def _get_best_ptx_file(archs, max_compute_capability):
+    """
+    Determine of the available PTX files which one is
+    the most recent up to and including the device cc
+    """
+    filtered_archs = [x for x in archs if x[0] <= max_compute_capability]
+    if filtered_archs:
+        return max(filtered_archs, key=lambda y: y[0])
+    else:
+        return None
+
+
+def _get_ptx_file(path, prefix):
+    if "RAPIDS_NO_INITIALIZE" in os.environ:
+        # cc=60 ptx is always built
+        cc = int(os.environ.get("STRINGS_UDF_CC", "60"))
+    else:
+        from numba import cuda
+
+        dev = cuda.get_current_device()
+
+        # Load the highest compute capability file available that is less than
+        # the current device's.
+        cc = int("".join(str(x) for x in dev.compute_capability))
+    files = glob.glob(os.path.join(path, f"{prefix}*.ptx"))
+    if len(files) == 0:
+        raise RuntimeError(f"Missing PTX files for cc={cc}")
+    regular_sms = []
+
+    for f in files:
+        file_name = os.path.basename(f)
+        sm_number = file_name.rstrip(".ptx").lstrip(prefix)
+        if sm_number.endswith("a"):
+            processed_sm_number = int(sm_number.rstrip("a"))
+            if processed_sm_number == cc:
+                return f
+        else:
+            regular_sms.append((int(sm_number), f))
+
+    regular_result = None
+
+    if regular_sms:
+        regular_result = _get_best_ptx_file(regular_sms, cc)
+
+    if regular_result is None:
+        raise RuntimeError(
+            "This cuDF installation is missing the necessary PTX "
+            f"files that are <={cc}."
+        )
+    else:
+        return regular_result[1]
+
+
+def _setup_numba_linker(path):
+    from ptxcompiler.patch import NO_DRIVER, safe_get_versions
+
+    versions = safe_get_versions()
+    if versions != NO_DRIVER:
+        driver_version, runtime_version = versions
+        ptx_toolkit_version = _get_cuda_version_from_ptx_file(path)
+        maybe_patch_numba_linker(
+            driver_version, runtime_version, ptx_toolkit_version
+        )
+
+
+def maybe_patch_numba_linker(
+    driver_version, runtime_version, ptx_toolkit_version
+):
+    # Numba thinks cubinlinker is only needed if the driver is older than
+    # the ctk, but when PTX files are present, it might also need to patch
+    # because those PTX files may newer than the driver as well
+    if (driver_version < ptx_toolkit_version) or (
+        driver_version < runtime_version
+    ):
+        config.NUMBA_CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1
+
+
+def _get_cuda_version_from_ptx_file(path):
+    """
+    https://docs.nvidia.com/cuda/parallel-thread-execution/
+    Each PTX module must begin with a .version
+    directive specifying the PTX language version
+
+    example header:
+    //
+    // Generated by NVIDIA NVVM Compiler
+    //
+    // Compiler Build ID: CL-31057947
+    // Cuda compilation tools, release 11.6, V11.6.124
+    // Based on NVVM 7.0.1
+    //
+
+    .version 7.6
+    .target sm_52
+    .address_size 64
+
+    """
+    with open(path) as ptx_file:
+        for line in ptx_file:
+            if line.startswith(".version"):
+                ver_line = line
+                break
+        else:
+            raise ValueError("Could not read CUDA version from ptx file.")
+    version = ver_line.strip("\n").split(" ")[1]
+    # from ptx_docs/release_notes above:
+    ver_map = {
+        "7.5": (11, 5),
+        "7.6": (11, 6),
+        "7.7": (11, 7),
+        "7.8": (11, 8),
+        "8.0": (12, 0),
+    }
+
+    cuda_ver = ver_map.get(version)
+    if cuda_ver is None:
+        raise ValueError(
+            f"Could not map PTX version {version} to a CUDA version"
+        )
+
+    return cuda_ver

From de2b678944294dd8040b1a352da88a1ac2f5dd3f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 11 May 2023 04:53:14 -0700
Subject: [PATCH 02/37] move more of numba's setup to _numba_setup

---
 python/cudf/cudf/__init__.py           | 25 ++++---------------------
 python/cudf/cudf/utils/_numba_setup.py |  5 +++++
 2 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index dfcd9f11236..acc9e6e4b04 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -80,36 +80,19 @@
     read_text,
 )
 from cudf.options import describe_option, get_option, set_option
+from cudf.utils._numba_setup import _setup_numba
 from cudf.utils.dtypes import _NA_REP
 from cudf.utils.utils import clear_cache, set_allocator
 
-try:
-    from cubinlinker.patch import patch_numba_linker_if_needed
-except ImportError:
-    pass
-else:
-    # Patch Numba to support CUDA enhanced compatibility.
-    # cuDF requires a stronger set of conditions than what is
-    # checked by patch_numba_linker_if_needed due to the PTX
-    # files needed for JIT Groupby Apply and string UDFs
-    from cudf.utils._numba_setup import ANY_PTX_FILE, _setup_numba_linker
+_setup_numba()
 
-    _setup_numba_linker(ANY_PTX_FILE)
-
-    del patch_numba_linker_if_needed
+# This must be imported after _setup_numba_linker is called and the numba
+# config is modified otherwise the config option will have no effect
 from numba import cuda
 
 cuda.set_memory_manager(RMMNumbaManager)
 cupy.cuda.set_allocator(rmm_cupy_allocator)
 
-try:
-    # Numba 0.54: Disable low occupancy warnings
-    numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
-except AttributeError:
-    # Numba < 0.54: No occupancy warnings
-    pass
-del numba_config
-
 
 rmm.register_reinitialize_hook(clear_cache)
 
diff --git a/python/cudf/cudf/utils/_numba_setup.py b/python/cudf/cudf/utils/_numba_setup.py
index 4a67055d60a..8abb86bdce4 100644
--- a/python/cudf/cudf/utils/_numba_setup.py
+++ b/python/cudf/cudf/utils/_numba_setup.py
@@ -8,6 +8,11 @@
 ANY_PTX_FILE = os.path.dirname(__file__) + "/../core/udf/shim_60.ptx"
 
 
+def _setup_numba():
+    _setup_numba_linker(ANY_PTX_FILE)
+    config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
+
+
 def _get_best_ptx_file(archs, max_compute_capability):
     """
     Determine of the available PTX files which one is

From 442fefc537949f0fdf9b624e8e4ef0135f161566 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 11 May 2023 05:03:13 -0700
Subject: [PATCH 03/37] update comment in __init__

---
 python/cudf/cudf/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index acc9e6e4b04..c510ededeba 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -86,8 +86,8 @@
 
 _setup_numba()
 
-# This must be imported after _setup_numba_linker is called and the numba
-# config is modified otherwise the config option will have no effect
+# This must be imported after _setup_numba is called and the numba
+# config is modified otherwise the config options will have no effect
 from numba import cuda
 
 cuda.set_memory_manager(RMMNumbaManager)

From f5f915d86dabea2ad013c2416964ad8821a160f9 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 11 May 2023 05:22:14 -0700
Subject: [PATCH 04/37] add a few docs

---
 python/cudf/cudf/utils/_numba_setup.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/python/cudf/cudf/utils/_numba_setup.py b/python/cudf/cudf/utils/_numba_setup.py
index 8abb86bdce4..ddcada62eee 100644
--- a/python/cudf/cudf/utils/_numba_setup.py
+++ b/python/cudf/cudf/utils/_numba_setup.py
@@ -9,6 +9,14 @@
 
 
 def _setup_numba():
+    """
+    Configure numba for use with cuDF. This consists of potentially
+    putting numba into enhanced compatibility mode based on the user
+    driver and runtime versions as well as the version of the cuda
+    toolkit used to build the PTX files shipped with the user cuDF
+    package. It also sets any other config options within numba that
+    are desired for cuDF's operation.
+    """
     _setup_numba_linker(ANY_PTX_FILE)
     config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
 

From 19dd82c49874bcf0404eec9e92c719645a010e02 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 11 May 2023 09:30:25 -0700
Subject: [PATCH 05/37] add a debug statement for now

---
 python/cudf/cudf/utils/_numba_setup.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/utils/_numba_setup.py b/python/cudf/cudf/utils/_numba_setup.py
index ddcada62eee..d1d2ef6fd32 100644
--- a/python/cudf/cudf/utils/_numba_setup.py
+++ b/python/cudf/cudf/utils/_numba_setup.py
@@ -92,10 +92,11 @@ def maybe_patch_numba_linker(
     # Numba thinks cubinlinker is only needed if the driver is older than
     # the ctk, but when PTX files are present, it might also need to patch
     # because those PTX files may newer than the driver as well
-    if (driver_version < ptx_toolkit_version) or (
-        driver_version < runtime_version
-    ):
-        config.NUMBA_CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1
+    raise AssertionError(
+        f"driver_version={driver_version},"
+        f"runtime_version={runtime_version},"
+        f"ptx_toolkit_version={ptx_toolkit_version}"
+    )
 
 
 def _get_cuda_version_from_ptx_file(path):

From d360008c258007560fb5b3bcaa5b3a8bff16d8fd Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 11 May 2023 09:32:00 -0700
Subject: [PATCH 06/37] only raise in cec mode

---
 python/cudf/cudf/utils/_numba_setup.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/utils/_numba_setup.py b/python/cudf/cudf/utils/_numba_setup.py
index d1d2ef6fd32..434bdd66394 100644
--- a/python/cudf/cudf/utils/_numba_setup.py
+++ b/python/cudf/cudf/utils/_numba_setup.py
@@ -92,11 +92,15 @@ def maybe_patch_numba_linker(
     # Numba thinks cubinlinker is only needed if the driver is older than
     # the ctk, but when PTX files are present, it might also need to patch
     # because those PTX files may newer than the driver as well
-    raise AssertionError(
-        f"driver_version={driver_version},"
-        f"runtime_version={runtime_version},"
-        f"ptx_toolkit_version={ptx_toolkit_version}"
-    )
+    if (driver_version < ptx_toolkit_version) or (
+        driver_version < runtime_version
+    ):
+        config.NUMBA_CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1
+        raise AssertionError(
+            f"driver_version={driver_version},"
+            f"runtime_version={runtime_version},"
+            f"ptx_toolkit_version={ptx_toolkit_version}"
+        )
 
 
 def _get_cuda_version_from_ptx_file(path):

From 9c76c6138fe098be2ee9a3b6f9583bf6769cd33b Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 11 May 2023 11:23:11 -0700
Subject: [PATCH 07/37] try bumping to numba 0.57

---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/recipes/cudf/meta.yaml                     | 4 ++--
 dependencies.yaml                                | 2 +-
 python/cudf/cudf/utils/_numba_setup.py           | 5 -----
 python/cudf/pyproject.toml                       | 2 +-
 python/dask_cudf/pyproject.toml                  | 2 +-
 6 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 4031f1aa1c3..b6daea7c2bc 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -48,7 +48,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba>=0.56.4,<0.57
+- numba>=0.57
 - numpy>=1.21,<1.24
 - numpydoc
 - nvcc_linux-64=11.8
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index f8074711b88..97c438bec59 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -53,7 +53,7 @@ requirements:
     - cython >=0.29,<0.30
     - scikit-build >=0.13.1
     - setuptools
-    - numba >=0.56.4,<0.57
+    - numba >=0.57
     - dlpack >=0.5,<0.6.0a0
     - pyarrow =11
     - libcudf ={{ version }}
@@ -65,7 +65,7 @@ requirements:
     - typing_extensions
     - pandas >=1.3,<1.6.0dev0
     - cupy >=12.0.0
-    - numba >=0.56.4,<0.57
+    - numba >=0.57
     - numpy >=1.21,<1.24  # Temporarily upper bound numpy to avoid overflow deprecations
     - {{ pin_compatible('pyarrow', max_pin='x.x.x') }}
     - libcudf {{ version }}
diff --git a/dependencies.yaml b/dependencies.yaml
index 70d7f8c1ec8..e3fcbe69932 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -350,7 +350,7 @@ dependencies:
         packages:
           - cachetools
           - cuda-python>=11.7.1,<12.0
-          - &numba numba>=0.56.4,<0.57
+          - &numba numba>=0.57
           - nvtx>=0.2.1
           - packaging
           - rmm==23.6.*
diff --git a/python/cudf/cudf/utils/_numba_setup.py b/python/cudf/cudf/utils/_numba_setup.py
index 434bdd66394..ddcada62eee 100644
--- a/python/cudf/cudf/utils/_numba_setup.py
+++ b/python/cudf/cudf/utils/_numba_setup.py
@@ -96,11 +96,6 @@ def maybe_patch_numba_linker(
         driver_version < runtime_version
     ):
         config.NUMBA_CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1
-        raise AssertionError(
-            f"driver_version={driver_version},"
-            f"runtime_version={runtime_version},"
-            f"ptx_toolkit_version={ptx_toolkit_version}"
-        )
 
 
 def _get_cuda_version_from_ptx_file(path):
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index d13324a7404..b08dd92d52f 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
     "cuda-python>=11.7.1,<12.0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
-    "numba>=0.56.4,<0.57",
+    "numba>=0.57",
     "numpy>=1.21,<1.24",
     "nvtx>=0.2.1",
     "packaging",
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index ff2a3f2d095..42b6c26c002 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -40,7 +40,7 @@ dynamic = ["entry-points"]
 [project.optional-dependencies]
 test = [
     "dask-cuda==23.6.*",
-    "numba>=0.56.4,<0.57",
+    "numba>=0.57",
     "pytest",
     "pytest-cov",
     "pytest-xdist",

From 950f98f08d228f23a26d0a51709d2e316427d9c8 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 12 May 2023 09:56:53 -0700
Subject: [PATCH 08/37] conditionally import ptxcompiler

---
 python/cudf/cudf/utils/_numba_setup.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/utils/_numba_setup.py b/python/cudf/cudf/utils/_numba_setup.py
index ddcada62eee..02541d8a0cc 100644
--- a/python/cudf/cudf/utils/_numba_setup.py
+++ b/python/cudf/cudf/utils/_numba_setup.py
@@ -75,15 +75,19 @@ def _get_ptx_file(path, prefix):
 
 
 def _setup_numba_linker(path):
-    from ptxcompiler.patch import NO_DRIVER, safe_get_versions
-
-    versions = safe_get_versions()
-    if versions != NO_DRIVER:
-        driver_version, runtime_version = versions
-        ptx_toolkit_version = _get_cuda_version_from_ptx_file(path)
-        maybe_patch_numba_linker(
-            driver_version, runtime_version, ptx_toolkit_version
-        )
+    try:
+        # ptxcompiler will not be present for cuda 12+
+        from ptxcompiler.patch import NO_DRIVER, safe_get_versions
+
+        versions = safe_get_versions()
+        if versions != NO_DRIVER:
+            driver_version, runtime_version = versions
+            ptx_toolkit_version = _get_cuda_version_from_ptx_file(path)
+            maybe_patch_numba_linker(
+                driver_version, runtime_version, ptx_toolkit_version
+            )
+    except ImportError:
+        pass
 
 
 def maybe_patch_numba_linker(

From c8142ea542e5f1036ffafef3381d9471c728229b Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 15 May 2023 06:05:17 -0700
Subject: [PATCH 09/37] update comments a bit

---
 python/cudf/cudf/utils/_numba_setup.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/utils/_numba_setup.py b/python/cudf/cudf/utils/_numba_setup.py
index 02541d8a0cc..ddc6c38dc1d 100644
--- a/python/cudf/cudf/utils/_numba_setup.py
+++ b/python/cudf/cudf/utils/_numba_setup.py
@@ -76,16 +76,22 @@ def _get_ptx_file(path, prefix):
 
 def _setup_numba_linker(path):
     try:
-        # ptxcompiler will not be present for cuda 12+
+        # By default, ptxcompiler will not be installed with CUDA 12
+        # packages. This is ok, because in this situation putting
+        # numba in enhanced compatibility mode is not necessary.
         from ptxcompiler.patch import NO_DRIVER, safe_get_versions
 
         versions = safe_get_versions()
         if versions != NO_DRIVER:
             driver_version, runtime_version = versions
-            ptx_toolkit_version = _get_cuda_version_from_ptx_file(path)
-            maybe_patch_numba_linker(
-                driver_version, runtime_version, ptx_toolkit_version
-            )
+            # Don't check if CEC is necessary in the possible edge
+            # case where a user has a CUDA 12 package and ptxcompiler
+            # in their environment anyways, perhaps installed separately
+            if driver_version < (12, 0):
+                ptx_toolkit_version = _get_cuda_version_from_ptx_file(path)
+                maybe_patch_numba_linker(
+                    driver_version, runtime_version, ptx_toolkit_version
+                )
     except ImportError:
         pass
 

From 8c7bae8861a101334cd3733a90e6594eb1fc2533 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Mon, 15 May 2023 09:18:10 -0500
Subject: [PATCH 10/37] Apply suggestions from code review

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/cudf/utils/_numba_setup.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/utils/_numba_setup.py b/python/cudf/cudf/utils/_numba_setup.py
index 02541d8a0cc..109e6c6c4b9 100644
--- a/python/cudf/cudf/utils/_numba_setup.py
+++ b/python/cudf/cudf/utils/_numba_setup.py
@@ -12,8 +12,8 @@ def _setup_numba():
     """
     Configure numba for use with cuDF. This consists of potentially
     putting numba into enhanced compatibility mode based on the user
-    driver and runtime versions as well as the version of the cuda
-    toolkit used to build the PTX files shipped with the user cuDF
+    driver and runtime versions as well as the version of the CUDA
+    Toolkit used to build the PTX files shipped with the user cuDF
     package. It also sets any other config options within numba that
     are desired for cuDF's operation.
     """
@@ -24,11 +24,11 @@ def _setup_numba():
 def _get_best_ptx_file(archs, max_compute_capability):
     """
     Determine of the available PTX files which one is
-    the most recent up to and including the device cc
+    the most recent up to and including the device compute capability.
     """
     filtered_archs = [x for x in archs if x[0] <= max_compute_capability]
     if filtered_archs:
-        return max(filtered_archs, key=lambda y: y[0])
+        return max(filtered_archs, key=lambda x: x[0])
     else:
         return None
 
@@ -94,8 +94,8 @@ def maybe_patch_numba_linker(
     driver_version, runtime_version, ptx_toolkit_version
 ):
     # Numba thinks cubinlinker is only needed if the driver is older than
-    # the ctk, but when PTX files are present, it might also need to patch
-    # because those PTX files may newer than the driver as well
+    # the CUDA runtime, but when PTX files are present, it might also need to patch
+    # because those PTX files may be compiled by a CUDA version that is newer than the driver as well
     if (driver_version < ptx_toolkit_version) or (
         driver_version < runtime_version
     ):

From b8d290d34793681eef4c23a6b42f7d5215e70164 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 15 May 2023 07:20:54 -0700
Subject: [PATCH 11/37] _numba_setup -> _setup_numba

---
 python/cudf/cudf/__init__.py                                | 2 +-
 python/cudf/cudf/utils/{_numba_setup.py => _setup_numba.py} | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)
 rename python/cudf/cudf/utils/{_numba_setup.py => _setup_numba.py} (97%)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index c510ededeba..b4fcc4a63e0 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -80,7 +80,7 @@
     read_text,
 )
 from cudf.options import describe_option, get_option, set_option
-from cudf.utils._numba_setup import _setup_numba
+from cudf.utils._setup_numba import _setup_numba
 from cudf.utils.dtypes import _NA_REP
 from cudf.utils.utils import clear_cache, set_allocator
 
diff --git a/python/cudf/cudf/utils/_numba_setup.py b/python/cudf/cudf/utils/_setup_numba.py
similarity index 97%
rename from python/cudf/cudf/utils/_numba_setup.py
rename to python/cudf/cudf/utils/_setup_numba.py
index dca58fed77b..5db9af110e2 100644
--- a/python/cudf/cudf/utils/_numba_setup.py
+++ b/python/cudf/cudf/utils/_setup_numba.py
@@ -100,8 +100,9 @@ def maybe_patch_numba_linker(
     driver_version, runtime_version, ptx_toolkit_version
 ):
     # Numba thinks cubinlinker is only needed if the driver is older than
-    # the CUDA runtime, but when PTX files are present, it might also need to patch
-    # because those PTX files may be compiled by a CUDA version that is newer than the driver as well
+    # the CUDA runtime, but when PTX files are present, it might also need
+    # to patch because those PTX files may be compiled by a CUDA version
+    # that is newer than the driver as well
     if (driver_version < ptx_toolkit_version) or (
         driver_version < runtime_version
     ):

From a50c642025af900762bd403be68be8fbe32dc611 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 15 May 2023 07:49:36 -0700
Subject: [PATCH 12/37] address more reviews

---
 python/cudf/cudf/utils/_setup_numba.py | 32 ++++++++++++++------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/utils/_setup_numba.py b/python/cudf/cudf/utils/_setup_numba.py
index 5db9af110e2..762dee18dc1 100644
--- a/python/cudf/cudf/utils/_setup_numba.py
+++ b/python/cudf/cudf/utils/_setup_numba.py
@@ -5,7 +5,7 @@
 
 from numba import config
 
-ANY_PTX_FILE = os.path.dirname(__file__) + "/../core/udf/shim_60.ptx"
+CC_60_PTX_FILE = os.path.dirname(__file__) + "/../core/udf/shim_60.ptx"
 
 
 def _setup_numba():
@@ -17,7 +17,10 @@ def _setup_numba():
     package. It also sets any other config options within numba that
     are desired for cuDF's operation.
     """
-    _setup_numba_linker(ANY_PTX_FILE)
+    _setup_numba_linker(CC_60_PTX_FILE)
+
+    # disable low occupancy warnings for internal usages of numba,
+    # such as in our iloc implementation
     config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
 
 
@@ -80,20 +83,19 @@ def _setup_numba_linker(path):
         # packages. This is ok, because in this situation putting
         # numba in enhanced compatibility mode is not necessary.
         from ptxcompiler.patch import NO_DRIVER, safe_get_versions
-
-        versions = safe_get_versions()
-        if versions != NO_DRIVER:
-            driver_version, runtime_version = versions
-            # Don't check if CEC is necessary in the possible edge
-            # case where a user has a CUDA 12 package and ptxcompiler
-            # in their environment anyways, perhaps installed separately
-            if driver_version < (12, 0):
-                ptx_toolkit_version = _get_cuda_version_from_ptx_file(path)
-                maybe_patch_numba_linker(
-                    driver_version, runtime_version, ptx_toolkit_version
-                )
     except ImportError:
-        pass
+        return
+    versions = safe_get_versions()
+    if versions != NO_DRIVER:
+        driver_version, runtime_version = versions
+        # Don't check if CEC is necessary in the possible edge
+        # case where a user has a CUDA 12 package and ptxcompiler
+        # in their environment anyways, perhaps installed separately
+        if driver_version < (12, 0):
+            ptx_toolkit_version = _get_cuda_version_from_ptx_file(path)
+            maybe_patch_numba_linker(
+                driver_version, runtime_version, ptx_toolkit_version
+            )
 
 
 def maybe_patch_numba_linker(

From 96b6f01fda82f3dcc074959fc51975d0cdc3c881 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 16 May 2023 09:15:14 -0700
Subject: [PATCH 13/37] use a context manager to squash occupancy warnings for
 numba kernels

---
 python/cudf/cudf/core/indexed_frame.py        |  4 +-
 python/cudf/cudf/core/udf/groupby_utils.py    |  4 +-
 python/cudf/cudf/core/udf/utils.py            |  2 +-
 python/cudf/cudf/tests/test_dataframe_copy.py |  5 ++-
 .../cudf/tests/test_extension_compilation.py  | 15 ++++---
 python/cudf/cudf/tests/test_string_udfs.py    |  8 ++--
 python/cudf/cudf/utils/_setup_numba.py        | 13 ++++--
 python/cudf/cudf/utils/applyutils.py          | 10 +++--
 python/cudf/cudf/utils/cudautils.py           | 42 ++++++++++---------
 python/cudf/cudf/utils/queryutils.py          |  4 +-
 10 files changed, 67 insertions(+), 40 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 7141958f62d..c6b2c10b9da 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -68,6 +68,7 @@
     _return_arr_from_dtype,
 )
 from cudf.utils import docutils
+from cudf.utils._setup_numba import CUDFNumbaConfig
 from cudf.utils.utils import _cudf_nvtx_annotate
 
 doc_reset_index_template = """
@@ -2193,7 +2194,8 @@ def _apply(self, func, kernel_getter, *args, **kwargs):
         input_args = _get_input_args_from_frame(self)
         launch_args = output_args + input_args + list(args)
         try:
-            kernel.forall(len(self))(*launch_args)
+            with CUDFNumbaConfig():
+                kernel.forall(len(self))(*launch_args)
         except Exception as e:
             raise RuntimeError("UDF kernel execution failed.") from e
 
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index ae09dd1d704..30f27f1b654 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -27,6 +27,7 @@
     _supported_dtypes_from_frame,
     precompiled,
 )
+from cudf.utils._setup_numba import CUDFNumbaConfig
 from cudf.utils.utils import _cudf_nvtx_annotate
 
 
@@ -198,6 +199,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
     )
 
     # Launch kernel
-    specialized[ngroups, tpb](*launch_args)
+    with CUDFNumbaConfig():
+        specialized[ngroups, tpb](*launch_args)
 
     return output
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index a387880c55d..075d9677209 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -31,7 +31,7 @@
     udf_string,
 )
 from cudf.utils import cudautils
-from cudf.utils._numba_setup import _get_ptx_file
+from cudf.utils._setup_numba import _get_ptx_file
 from cudf.utils.dtypes import (
     BOOL_TYPES,
     DATETIME_TYPES,
diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py
index 85e994bd733..3f75a50fb40 100644
--- a/python/cudf/cudf/tests/test_dataframe_copy.py
+++ b/python/cudf/cudf/tests/test_dataframe_copy.py
@@ -8,6 +8,7 @@
 
 from cudf.core.dataframe import DataFrame
 from cudf.testing._utils import ALL_TYPES, assert_eq
+from cudf.utils._setup_numba import CUDFNumbaConfig
 
 """
 DataFrame copy expectations
@@ -159,8 +160,8 @@ def test_kernel_deep_copy():
     gdf = DataFrame.from_pandas(pdf)
     cdf = gdf.copy(deep=True)
     sr = gdf["b"]
-
-    add_one[1, len(sr)](sr._column.data_array_view(mode="write"))
+    with CUDFNumbaConfig():
+        add_one[1, len(sr)](sr._column.data_array_view(mode="write"))
     assert not gdf.to_string().split() == cdf.to_string().split()
 
 
diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
index f1ed17c5df5..525204bf002 100644
--- a/python/cudf/cudf/tests/test_extension_compilation.py
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 import operator
 
 import cupy as cp
@@ -12,6 +12,7 @@
 from cudf.core.udf.api import Masked
 from cudf.core.udf.masked_typing import MaskedType
 from cudf.testing._utils import parametrize_numeric_dtypes_pairwise
+from cudf.utils._setup_numba import CUDFNumbaConfig
 
 arith_ops = (
     operator.add,
@@ -106,7 +107,8 @@ def test_kernel(x, y, err):
             err[0] = 3
 
     err = cp.asarray([0], dtype="int8")
-    test_kernel[1, 1](1, 2, err)
+    with CUDFNumbaConfig():
+        test_kernel[1, 1](1, 2, err)
     assert err[0] == 0
 
 
@@ -214,7 +216,8 @@ def test_kernel(err):
             err[0] = 2
 
     err = cp.asarray([0], dtype="int8")
-    test_kernel[1, 1](err)
+    with CUDFNumbaConfig():
+        test_kernel[1, 1](err)
     assert err[0] == 0
 
 
@@ -304,7 +307,8 @@ def test_kernel(err):
             err[0] = 2
 
     err = cp.asarray([0], dtype="int8")
-    test_kernel[1, 1](err)
+    with CUDFNumbaConfig():
+        test_kernel[1, 1](err)
     assert err[0] == 0
 
 
@@ -326,5 +330,6 @@ def test_kernel(err):
             err[0] = 1
 
     err = cp.asarray([0], dtype="int8")
-    test_kernel[1, 1](err)
+    with CUDFNumbaConfig():
+        test_kernel[1, 1](err)
     assert err[0] == 0
diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py
index 049dfdc8e30..62bffad0a01 100644
--- a/python/cudf/cudf/tests/test_string_udfs.py
+++ b/python/cudf/cudf/tests/test_string_udfs.py
@@ -22,6 +22,7 @@
 )
 from cudf.core.udf.utils import _PTX_FILE, _get_extensionty_size
 from cudf.testing._utils import assert_eq, sv_to_udf_str
+from cudf.utils._setup_numba import CUDFNumbaConfig
 
 
 def get_kernels(func, dtype, size):
@@ -85,8 +86,8 @@ def run_udf_test(data, func, dtype):
     sv_kernel, udf_str_kernel = get_kernels(func, dtype, len(data))
 
     expect = pd.Series(data).apply(func)
-
-    sv_kernel.forall(len(data))(str_views, output)
+    with CUDFNumbaConfig():
+        sv_kernel.forall(len(data))(str_views, output)
     if dtype == "str":
         result = column_from_udf_string_array(output)
     else:
@@ -94,7 +95,8 @@ def run_udf_test(data, func, dtype):
 
     got = cudf.Series(result, dtype=dtype)
     assert_eq(expect, got, check_dtype=False)
-    udf_str_kernel.forall(len(data))(str_views, output)
+    with CUDFNumbaConfig():
+        udf_str_kernel.forall(len(data))(str_views, output)
     if dtype == "str":
         result = column_from_udf_string_array(output)
     else:
diff --git a/python/cudf/cudf/utils/_setup_numba.py b/python/cudf/cudf/utils/_setup_numba.py
index 762dee18dc1..287f5958fb6 100644
--- a/python/cudf/cudf/utils/_setup_numba.py
+++ b/python/cudf/cudf/utils/_setup_numba.py
@@ -19,10 +19,6 @@ def _setup_numba():
     """
     _setup_numba_linker(CC_60_PTX_FILE)
 
-    # disable low occupancy warnings for internal usages of numba,
-    # such as in our iloc implementation
-    config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
-
 
 def _get_best_ptx_file(archs, max_compute_capability):
     """
@@ -155,3 +151,12 @@ def _get_cuda_version_from_ptx_file(path):
         )
 
     return cuda_ver
+
+
+class CUDFNumbaConfig:
+    def __enter__(self):
+        self.enter_val = config.CUDA_LOW_OCCUPANCY_WARNINGS
+        config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        config.CUDA_LOW_OCCUPANCY_WARNINGS = self.enter_val
diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py
index 933b98367b6..14709bcbbfb 100644
--- a/python/cudf/cudf/utils/applyutils.py
+++ b/python/cudf/cudf/utils/applyutils.py
@@ -12,6 +12,7 @@
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import column
 from cudf.utils import utils
+from cudf.utils._setup_numba import CUDFNumbaConfig
 from cudf.utils.docutils import docfmt_partial
 
 _doc_applyparams = """
@@ -195,7 +196,8 @@ def compile(self, func, argnames, extra_argnames):
         return kernel
 
     def launch_kernel(self, df, args):
-        self.kernel.forall(len(df))(*args)
+        with CUDFNumbaConfig():
+            self.kernel.forall(len(df))(*args)
 
 
 class ApplyChunksCompiler(ApplyKernelCompilerBase):
@@ -209,12 +211,14 @@ def compile(self, func, argnames, extra_argnames):
     def launch_kernel(self, df, args, chunks, blkct=None, tpb=None):
         chunks = self.normalize_chunks(len(df), chunks)
         if blkct is None and tpb is None:
-            self.kernel.forall(len(df))(len(df), chunks, *args)
+            with CUDFNumbaConfig():
+                self.kernel.forall(len(df))(len(df), chunks, *args)
         else:
             assert tpb is not None
             if blkct is None:
                 blkct = chunks.size
-            self.kernel[blkct, tpb](len(df), chunks, *args)
+            with CUDFNumbaConfig():
+                self.kernel[blkct, tpb](len(df), chunks, *args)
 
     def normalize_chunks(self, size, chunks):
         if isinstance(chunks, int):
diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index e2bd4556ce8..4f047d3c125 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.
 
 from pickle import dumps
 
@@ -8,6 +8,7 @@
 from numba.np import numpy_support
 
 import cudf
+from cudf.utils._setup_numba import CUDFNumbaConfig
 
 #
 # Misc kernels
@@ -80,19 +81,20 @@ def find_index_of_val(arr, val, mask=None, compare="eq"):
     """
     found = cuda.device_array(shape=(arr.shape), dtype="int32")
     if found.size > 0:
-        if compare == "gt":
-            gpu_mark_gt.forall(found.size)(arr, val, found, arr.size)
-        elif compare == "lt":
-            gpu_mark_lt.forall(found.size)(arr, val, found, arr.size)
-        else:
-            if arr.dtype in ("float32", "float64"):
-                gpu_mark_found_float.forall(found.size)(
-                    arr, val, found, arr.size
-                )
+        with CUDFNumbaConfig():
+            if compare == "gt":
+                gpu_mark_gt.forall(found.size)(arr, val, found, arr.size)
+            elif compare == "lt":
+                gpu_mark_lt.forall(found.size)(arr, val, found, arr.size)
             else:
-                gpu_mark_found_int.forall(found.size)(
-                    arr, val, found, arr.size
-                )
+                if arr.dtype in ("float32", "float64"):
+                    gpu_mark_found_float.forall(found.size)(
+                        arr, val, found, arr.size
+                    )
+                else:
+                    gpu_mark_found_int.forall(found.size)(
+                        arr, val, found, arr.size
+                    )
 
     return cudf.core.column.column.as_column(found).set_mask(mask)
 
@@ -154,9 +156,10 @@ def gpu_window_sizes_from_offset(arr, window_sizes, offset):
 def window_sizes_from_offset(arr, offset):
     window_sizes = cuda.device_array(shape=(arr.shape), dtype="int32")
     if arr.size > 0:
-        gpu_window_sizes_from_offset.forall(arr.size)(
-            arr, window_sizes, offset
-        )
+        with CUDFNumbaConfig():
+            gpu_window_sizes_from_offset.forall(arr.size)(
+                arr, window_sizes, offset
+            )
     return window_sizes
 
 
@@ -177,9 +180,10 @@ def gpu_grouped_window_sizes_from_offset(
 def grouped_window_sizes_from_offset(arr, group_starts, offset):
     window_sizes = cuda.device_array(shape=(arr.shape), dtype="int32")
     if arr.size > 0:
-        gpu_grouped_window_sizes_from_offset.forall(arr.size)(
-            arr, window_sizes, group_starts, offset
-        )
+        with CUDFNumbaConfig():
+            gpu_grouped_window_sizes_from_offset.forall(arr.size)(
+                arr, window_sizes, group_starts, offset
+            )
     return window_sizes
 
 
diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py
index 4ce89b526d6..6cbe702db90 100644
--- a/python/cudf/cudf/utils/queryutils.py
+++ b/python/cudf/cudf/utils/queryutils.py
@@ -11,6 +11,7 @@
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import column_empty
 from cudf.utils import applyutils
+from cudf.utils._setup_numba import CUDFNumbaConfig
 from cudf.utils.dtypes import (
     BOOL_TYPES,
     DATETIME_TYPES,
@@ -247,6 +248,7 @@ def query_execute(df, expr, callenv):
     out = column_empty(nrows, dtype=np.bool_)
     # run kernel
     args = [out] + colarrays + envargs
-    kernel.forall(nrows)(*args)
+    with CUDFNumbaConfig():
+        kernel.forall(nrows)(*args)
     out_mask = applyutils.make_aggregate_nullmask(df, columns=columns)
     return out.set_mask(out_mask).fillna(False)

From 47d8a2e7149edd5071ffe0349d1cc393f9c88302 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 17 May 2023 09:43:07 -0700
Subject: [PATCH 14/37] revert numba upgrade

---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/recipes/cudf/meta.yaml                     | 4 ++--
 dependencies.yaml                                | 2 +-
 python/cudf/pyproject.toml                       | 2 +-
 python/dask_cudf/pyproject.toml                  | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index b6daea7c2bc..4031f1aa1c3 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -48,7 +48,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba>=0.57
+- numba>=0.56.4,<0.57
 - numpy>=1.21,<1.24
 - numpydoc
 - nvcc_linux-64=11.8
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 97c438bec59..f8074711b88 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -53,7 +53,7 @@ requirements:
     - cython >=0.29,<0.30
     - scikit-build >=0.13.1
     - setuptools
-    - numba >=0.57
+    - numba >=0.56.4,<0.57
     - dlpack >=0.5,<0.6.0a0
     - pyarrow =11
     - libcudf ={{ version }}
@@ -65,7 +65,7 @@ requirements:
     - typing_extensions
     - pandas >=1.3,<1.6.0dev0
     - cupy >=12.0.0
-    - numba >=0.57
+    - numba >=0.56.4,<0.57
     - numpy >=1.21,<1.24  # Temporarily upper bound numpy to avoid overflow deprecations
     - {{ pin_compatible('pyarrow', max_pin='x.x.x') }}
     - libcudf {{ version }}
diff --git a/dependencies.yaml b/dependencies.yaml
index e3fcbe69932..70d7f8c1ec8 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -350,7 +350,7 @@ dependencies:
         packages:
           - cachetools
           - cuda-python>=11.7.1,<12.0
-          - &numba numba>=0.57
+          - &numba numba>=0.56.4,<0.57
           - nvtx>=0.2.1
           - packaging
           - rmm==23.6.*
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index b08dd92d52f..d13324a7404 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
     "cuda-python>=11.7.1,<12.0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
-    "numba>=0.57",
+    "numba>=0.56.4,<0.57",
     "numpy>=1.21,<1.24",
     "nvtx>=0.2.1",
     "packaging",
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 42b6c26c002..ff2a3f2d095 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -40,7 +40,7 @@ dynamic = ["entry-points"]
 [project.optional-dependencies]
 test = [
     "dask-cuda==23.6.*",
-    "numba>=0.57",
+    "numba>=0.56.4,<0.57",
     "pytest",
     "pytest-cov",
     "pytest-xdist",

From b9634f9e240217f6f95a27fcfaf7c29a2a67f28f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 17 May 2023 11:34:00 -0700
Subject: [PATCH 15/37] adjust logic, introduce runtime check in apply/groupby
 udfs

---
 python/cudf/cudf/core/udf/utils.py     | 10 ++++++++++
 python/cudf/cudf/utils/_setup_numba.py | 27 +++++++++++---------------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index 218e3c3c294..4a3ed23b6bd 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -253,6 +253,16 @@ def _compile_or_get(
     we then obtain the return type from that separate compilation and
     use it to allocate an output column of the right dtype.
     """
+    # runtime check for CEC mode which is disabled for CUDA 12 for now
+    if cuda.cudadrv.driver.get_version() == (
+        12,
+        0,
+    ) and cuda.cudadrv.runtime.get_version() > (12, 0):
+        raise ValueError(
+            "Minor version compatibility not yet supported for "
+            "CUDA driver versions newer than 12.0"
+        )
+
     if not all(is_scalar(arg) for arg in args):
         raise TypeError("only scalar valued args are supported by apply")
 
diff --git a/python/cudf/cudf/utils/_setup_numba.py b/python/cudf/cudf/utils/_setup_numba.py
index 287f5958fb6..127c1e3e489 100644
--- a/python/cudf/cudf/utils/_setup_numba.py
+++ b/python/cudf/cudf/utils/_setup_numba.py
@@ -4,6 +4,7 @@
 import os
 
 from numba import config
+from numba.cuda.cudadrv.driver import Linker
 
 CC_60_PTX_FILE = os.path.dirname(__file__) + "/../core/udf/shim_60.ptx"
 
@@ -78,6 +79,7 @@ def _setup_numba_linker(path):
         # By default, ptxcompiler will not be installed with CUDA 12
         # packages. This is ok, because in this situation putting
         # numba in enhanced compatibility mode is not necessary.
+        from cubinlinker.patch import _numba_version_ok, new_patched_linker
         from ptxcompiler.patch import NO_DRIVER, safe_get_versions
     except ImportError:
         return
@@ -89,22 +91,15 @@ def _setup_numba_linker(path):
         # in their environment anyways, perhaps installed separately
         if driver_version < (12, 0):
             ptx_toolkit_version = _get_cuda_version_from_ptx_file(path)
-            maybe_patch_numba_linker(
-                driver_version, runtime_version, ptx_toolkit_version
-            )
-
-
-def maybe_patch_numba_linker(
-    driver_version, runtime_version, ptx_toolkit_version
-):
-    # Numba thinks cubinlinker is only needed if the driver is older than
-    # the CUDA runtime, but when PTX files are present, it might also need
-    # to patch because those PTX files may be compiled by a CUDA version
-    # that is newer than the driver as well
-    if (driver_version < ptx_toolkit_version) or (
-        driver_version < runtime_version
-    ):
-        config.NUMBA_CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1
+            # Numba thinks cubinlinker is only needed if the driver is older
+            # than the CUDA runtime, but when PTX files are present, it might
+            # also need to patch because those PTX files may be compiled by
+            # a CUDA version that is newer than the driver as well
+            if (driver_version < ptx_toolkit_version) or (
+                driver_version < runtime_version
+            ):
+                if _numba_version_ok:
+                    Linker.new = new_patched_linker
 
 
 def _get_cuda_version_from_ptx_file(path):

From 7a594b3c7da9064e0d9baa3c747c291c27c7fa76 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 17 May 2023 13:55:30 -0700
Subject: [PATCH 16/37] Address reviews

---
 python/cudf/cudf/__init__.py           |  2 --
 python/cudf/cudf/utils/_setup_numba.py | 15 +++++++++------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index b4fcc4a63e0..170b808b42c 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -86,8 +86,6 @@
 
 _setup_numba()
 
-# This must be imported after _setup_numba is called and the numba
-# config is modified otherwise the config options will have no effect
 from numba import cuda
 
 cuda.set_memory_manager(RMMNumbaManager)
diff --git a/python/cudf/cudf/utils/_setup_numba.py b/python/cudf/cudf/utils/_setup_numba.py
index 127c1e3e489..4fe7f29e939 100644
--- a/python/cudf/cudf/utils/_setup_numba.py
+++ b/python/cudf/cudf/utils/_setup_numba.py
@@ -11,12 +11,8 @@
 
 def _setup_numba():
     """
-    Configure numba for use with cuDF. This consists of potentially
-    putting numba into enhanced compatibility mode based on the user
-    driver and runtime versions as well as the version of the CUDA
-    Toolkit used to build the PTX files shipped with the user cuDF
-    package. It also sets any other config options within numba that
-    are desired for cuDF's operation.
+    Perform any numba patching or configuration desired upon
+    cuDF import.
     """
     _setup_numba_linker(CC_60_PTX_FILE)
 
@@ -75,6 +71,13 @@ def _get_ptx_file(path, prefix):
 
 
 def _setup_numba_linker(path):
+    """
+    Configure the numba linker for use with cuDF. This consists of
+    potentially putting numba into enhanced compatibility mode
+    based on the user driver and runtime versions as well as the
+    version of the CUDA Toolkit used to build the PTX files shipped
+    with the user cuDF package.
+    """
     try:
         # By default, ptxcompiler will not be installed with CUDA 12
         # packages. This is ok, because in this situation putting

From cf642d0bde1377cc2809c89de6ef125892d17e50 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 18 May 2023 11:08:17 -0700
Subject: [PATCH 17/37] partially address reviews

---
 python/cudf/cudf/utils/_setup_numba.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/utils/_setup_numba.py b/python/cudf/cudf/utils/_setup_numba.py
index 4fe7f29e939..b0d863be1c1 100644
--- a/python/cudf/cudf/utils/_setup_numba.py
+++ b/python/cudf/cudf/utils/_setup_numba.py
@@ -9,14 +9,6 @@
 CC_60_PTX_FILE = os.path.dirname(__file__) + "/../core/udf/shim_60.ptx"
 
 
-def _setup_numba():
-    """
-    Perform any numba patching or configuration desired upon
-    cuDF import.
-    """
-    _setup_numba_linker(CC_60_PTX_FILE)
-
-
 def _get_best_ptx_file(archs, max_compute_capability):
     """
     Determine of the available PTX files which one is
@@ -70,7 +62,7 @@ def _get_ptx_file(path, prefix):
         return regular_result[1]
 
 
-def _setup_numba_linker(path):
+def _setup_numba():
     """
     Configure the numba linker for use with cuDF. This consists of
     potentially putting numba into enhanced compatibility mode
@@ -93,7 +85,9 @@ def _setup_numba_linker(path):
         # case where a user has a CUDA 12 package and ptxcompiler
         # in their environment anyways, perhaps installed separately
         if driver_version < (12, 0):
-            ptx_toolkit_version = _get_cuda_version_from_ptx_file(path)
+            ptx_toolkit_version = _get_cuda_version_from_ptx_file(
+                CC_60_PTX_FILE
+            )
             # Numba thinks cubinlinker is only needed if the driver is older
             # than the CUDA runtime, but when PTX files are present, it might
             # also need to patch because those PTX files may be compiled by
@@ -103,6 +97,8 @@ def _setup_numba_linker(path):
             ):
                 if _numba_version_ok:
                     Linker.new = new_patched_linker
+                else:
+                    config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1
 
 
 def _get_cuda_version_from_ptx_file(path):

From cb5a75604363390f48847f2b788cb662f5438ad3 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 19 May 2023 06:23:08 -0700
Subject: [PATCH 18/37] Revert "revert numba upgrade"

This reverts commit 47d8a2e7149edd5071ffe0349d1cc393f9c88302.
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/recipes/cudf/meta.yaml                     | 4 ++--
 dependencies.yaml                                | 2 +-
 python/cudf/pyproject.toml                       | 2 +-
 python/dask_cudf/pyproject.toml                  | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 4031f1aa1c3..b6daea7c2bc 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -48,7 +48,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba>=0.56.4,<0.57
+- numba>=0.57
 - numpy>=1.21,<1.24
 - numpydoc
 - nvcc_linux-64=11.8
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index f8074711b88..97c438bec59 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -53,7 +53,7 @@ requirements:
     - cython >=0.29,<0.30
     - scikit-build >=0.13.1
     - setuptools
-    - numba >=0.56.4,<0.57
+    - numba >=0.57
     - dlpack >=0.5,<0.6.0a0
     - pyarrow =11
     - libcudf ={{ version }}
@@ -65,7 +65,7 @@ requirements:
     - typing_extensions
     - pandas >=1.3,<1.6.0dev0
     - cupy >=12.0.0
-    - numba >=0.56.4,<0.57
+    - numba >=0.57
     - numpy >=1.21,<1.24  # Temporarily upper bound numpy to avoid overflow deprecations
     - {{ pin_compatible('pyarrow', max_pin='x.x.x') }}
     - libcudf {{ version }}
diff --git a/dependencies.yaml b/dependencies.yaml
index 70d7f8c1ec8..e3fcbe69932 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -350,7 +350,7 @@ dependencies:
         packages:
           - cachetools
           - cuda-python>=11.7.1,<12.0
-          - &numba numba>=0.56.4,<0.57
+          - &numba numba>=0.57
           - nvtx>=0.2.1
           - packaging
           - rmm==23.6.*
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index d13324a7404..b08dd92d52f 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
     "cuda-python>=11.7.1,<12.0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
-    "numba>=0.56.4,<0.57",
+    "numba>=0.57",
     "numpy>=1.21,<1.24",
     "nvtx>=0.2.1",
     "packaging",
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index ff2a3f2d095..42b6c26c002 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -40,7 +40,7 @@ dynamic = ["entry-points"]
 [project.optional-dependencies]
 test = [
     "dask-cuda==23.6.*",
-    "numba>=0.56.4,<0.57",
+    "numba>=0.57",
     "pytest",
     "pytest-cov",
     "pytest-xdist",

From dcc73e10a0704182d9ff84eaa352018f9ed37172 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 19 May 2023 07:21:21 -0700
Subject: [PATCH 19/37] _setup_numba.py -> _numba.py, CUDFNumbaConfig ->
 _CUDFNumbaConfig

---
 python/cudf/cudf/core/indexed_frame.py                |  4 ++--
 python/cudf/cudf/core/udf/groupby_utils.py            |  4 ++--
 python/cudf/cudf/tests/test_extension_compilation.py  | 10 +++++-----
 python/cudf/cudf/tests/test_string_udfs.py            |  6 +++---
 python/cudf/cudf/utils/{_setup_numba.py => _numba.py} |  9 ++-------
 python/cudf/cudf/utils/applyutils.py                  |  8 ++++----
 python/cudf/cudf/utils/cudautils.py                   |  8 ++++----
 python/cudf/cudf/utils/queryutils.py                  |  4 ++--
 8 files changed, 24 insertions(+), 29 deletions(-)
 rename python/cudf/cudf/utils/{_setup_numba.py => _numba.py} (93%)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index c6b2c10b9da..abd8ad162c7 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -68,7 +68,7 @@
     _return_arr_from_dtype,
 )
 from cudf.utils import docutils
-from cudf.utils._setup_numba import CUDFNumbaConfig
+from cudf.utils._numba import _CUDFNumbaConfig
 from cudf.utils.utils import _cudf_nvtx_annotate
 
 doc_reset_index_template = """
@@ -2194,7 +2194,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs):
         input_args = _get_input_args_from_frame(self)
         launch_args = output_args + input_args + list(args)
         try:
-            with CUDFNumbaConfig():
+            with _CUDFNumbaConfig():
                 kernel.forall(len(self))(*launch_args)
         except Exception as e:
             raise RuntimeError("UDF kernel execution failed.") from e
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index e0d41515f9b..60eba7eb37b 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -27,7 +27,7 @@
     _supported_cols_from_frame,
     _supported_dtypes_from_frame,
 )
-from cudf.utils._setup_numba import CUDFNumbaConfig
+from cudf.utils._numba import _CUDFNumbaConfig
 from cudf.utils.utils import _cudf_nvtx_annotate
 
 
@@ -197,7 +197,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
     )
 
     # Launch kernel
-    with CUDFNumbaConfig():
+    with _CUDFNumbaConfig():
         specialized[ngroups, tpb](*launch_args)
 
     return output
diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
index 525204bf002..857cc114ffa 100644
--- a/python/cudf/cudf/tests/test_extension_compilation.py
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -12,7 +12,7 @@
 from cudf.core.udf.api import Masked
 from cudf.core.udf.masked_typing import MaskedType
 from cudf.testing._utils import parametrize_numeric_dtypes_pairwise
-from cudf.utils._setup_numba import CUDFNumbaConfig
+from cudf.utils._numba import _CUDFNumbaConfig
 
 arith_ops = (
     operator.add,
@@ -107,7 +107,7 @@ def test_kernel(x, y, err):
             err[0] = 3
 
     err = cp.asarray([0], dtype="int8")
-    with CUDFNumbaConfig():
+    with _CUDFNumbaConfig():
         test_kernel[1, 1](1, 2, err)
     assert err[0] == 0
 
@@ -216,7 +216,7 @@ def test_kernel(err):
             err[0] = 2
 
     err = cp.asarray([0], dtype="int8")
-    with CUDFNumbaConfig():
+    with _CUDFNumbaConfig():
         test_kernel[1, 1](err)
     assert err[0] == 0
 
@@ -307,7 +307,7 @@ def test_kernel(err):
             err[0] = 2
 
     err = cp.asarray([0], dtype="int8")
-    with CUDFNumbaConfig():
+    with _CUDFNumbaConfig():
         test_kernel[1, 1](err)
     assert err[0] == 0
 
@@ -330,6 +330,6 @@ def test_kernel(err):
             err[0] = 1
 
     err = cp.asarray([0], dtype="int8")
-    with CUDFNumbaConfig():
+    with _CUDFNumbaConfig():
         test_kernel[1, 1](err)
     assert err[0] == 0
diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py
index 62bffad0a01..88c73ccf964 100644
--- a/python/cudf/cudf/tests/test_string_udfs.py
+++ b/python/cudf/cudf/tests/test_string_udfs.py
@@ -22,7 +22,7 @@
 )
 from cudf.core.udf.utils import _PTX_FILE, _get_extensionty_size
 from cudf.testing._utils import assert_eq, sv_to_udf_str
-from cudf.utils._setup_numba import CUDFNumbaConfig
+from cudf.utils._numba import _CUDFNumbaConfig
 
 
 def get_kernels(func, dtype, size):
@@ -86,7 +86,7 @@ def run_udf_test(data, func, dtype):
     sv_kernel, udf_str_kernel = get_kernels(func, dtype, len(data))
 
     expect = pd.Series(data).apply(func)
-    with CUDFNumbaConfig():
+    with _CUDFNumbaConfig():
         sv_kernel.forall(len(data))(str_views, output)
     if dtype == "str":
         result = column_from_udf_string_array(output)
@@ -95,7 +95,7 @@ def run_udf_test(data, func, dtype):
 
     got = cudf.Series(result, dtype=dtype)
     assert_eq(expect, got, check_dtype=False)
-    with CUDFNumbaConfig():
+    with _CUDFNumbaConfig():
         udf_str_kernel.forall(len(data))(str_views, output)
     if dtype == "str":
         result = column_from_udf_string_array(output)
diff --git a/python/cudf/cudf/utils/_setup_numba.py b/python/cudf/cudf/utils/_numba.py
similarity index 93%
rename from python/cudf/cudf/utils/_setup_numba.py
rename to python/cudf/cudf/utils/_numba.py
index b0d863be1c1..053d7a0e1cb 100644
--- a/python/cudf/cudf/utils/_setup_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -4,7 +4,6 @@
 import os
 
 from numba import config
-from numba.cuda.cudadrv.driver import Linker
 
 CC_60_PTX_FILE = os.path.dirname(__file__) + "/../core/udf/shim_60.ptx"
 
@@ -74,7 +73,6 @@ def _setup_numba():
         # By default, ptxcompiler will not be installed with CUDA 12
         # packages. This is ok, because in this situation putting
         # numba in enhanced compatibility mode is not necessary.
-        from cubinlinker.patch import _numba_version_ok, new_patched_linker
         from ptxcompiler.patch import NO_DRIVER, safe_get_versions
     except ImportError:
         return
@@ -95,10 +93,7 @@ def _setup_numba():
             if (driver_version < ptx_toolkit_version) or (
                 driver_version < runtime_version
             ):
-                if _numba_version_ok:
-                    Linker.new = new_patched_linker
-                else:
-                    config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1
+                config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1
 
 
 def _get_cuda_version_from_ptx_file(path):
@@ -147,7 +142,7 @@ def _get_cuda_version_from_ptx_file(path):
     return cuda_ver
 
 
-class CUDFNumbaConfig:
+class _CUDFNumbaConfig:
     def __enter__(self):
         self.enter_val = config.CUDA_LOW_OCCUPANCY_WARNINGS
         config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py
index 14709bcbbfb..b8cf6910402 100644
--- a/python/cudf/cudf/utils/applyutils.py
+++ b/python/cudf/cudf/utils/applyutils.py
@@ -12,7 +12,7 @@
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import column
 from cudf.utils import utils
-from cudf.utils._setup_numba import CUDFNumbaConfig
+from cudf.utils._numba import _CUDFNumbaConfig
 from cudf.utils.docutils import docfmt_partial
 
 _doc_applyparams = """
@@ -196,7 +196,7 @@ def compile(self, func, argnames, extra_argnames):
         return kernel
 
     def launch_kernel(self, df, args):
-        with CUDFNumbaConfig():
+        with _CUDFNumbaConfig():
             self.kernel.forall(len(df))(*args)
 
 
@@ -211,13 +211,13 @@ def compile(self, func, argnames, extra_argnames):
     def launch_kernel(self, df, args, chunks, blkct=None, tpb=None):
         chunks = self.normalize_chunks(len(df), chunks)
         if blkct is None and tpb is None:
-            with CUDFNumbaConfig():
+            with _CUDFNumbaConfig():
                 self.kernel.forall(len(df))(len(df), chunks, *args)
         else:
             assert tpb is not None
             if blkct is None:
                 blkct = chunks.size
-            with CUDFNumbaConfig():
+            with _CUDFNumbaConfig():
                 self.kernel[blkct, tpb](len(df), chunks, *args)
 
     def normalize_chunks(self, size, chunks):
diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index 4f047d3c125..a10eaab0bff 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -8,7 +8,7 @@
 from numba.np import numpy_support
 
 import cudf
-from cudf.utils._setup_numba import CUDFNumbaConfig
+from cudf.utils._numba import _CUDFNumbaConfig
 
 #
 # Misc kernels
@@ -81,7 +81,7 @@ def find_index_of_val(arr, val, mask=None, compare="eq"):
     """
     found = cuda.device_array(shape=(arr.shape), dtype="int32")
     if found.size > 0:
-        with CUDFNumbaConfig():
+        with _CUDFNumbaConfig():
             if compare == "gt":
                 gpu_mark_gt.forall(found.size)(arr, val, found, arr.size)
             elif compare == "lt":
@@ -156,7 +156,7 @@ def gpu_window_sizes_from_offset(arr, window_sizes, offset):
 def window_sizes_from_offset(arr, offset):
     window_sizes = cuda.device_array(shape=(arr.shape), dtype="int32")
     if arr.size > 0:
-        with CUDFNumbaConfig():
+        with _CUDFNumbaConfig():
             gpu_window_sizes_from_offset.forall(arr.size)(
                 arr, window_sizes, offset
             )
@@ -180,7 +180,7 @@ def gpu_grouped_window_sizes_from_offset(
 def grouped_window_sizes_from_offset(arr, group_starts, offset):
     window_sizes = cuda.device_array(shape=(arr.shape), dtype="int32")
     if arr.size > 0:
-        with CUDFNumbaConfig():
+        with _CUDFNumbaConfig():
             gpu_grouped_window_sizes_from_offset.forall(arr.size)(
                 arr, window_sizes, group_starts, offset
             )
diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py
index 6cbe702db90..51093375eda 100644
--- a/python/cudf/cudf/utils/queryutils.py
+++ b/python/cudf/cudf/utils/queryutils.py
@@ -11,7 +11,7 @@
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import column_empty
 from cudf.utils import applyutils
-from cudf.utils._setup_numba import CUDFNumbaConfig
+from cudf.utils._numba import _CUDFNumbaConfig
 from cudf.utils.dtypes import (
     BOOL_TYPES,
     DATETIME_TYPES,
@@ -248,7 +248,7 @@ def query_execute(df, expr, callenv):
     out = column_empty(nrows, dtype=np.bool_)
     # run kernel
     args = [out] + colarrays + envargs
-    with CUDFNumbaConfig():
+    with _CUDFNumbaConfig():
         kernel.forall(nrows)(*args)
     out_mask = applyutils.make_aggregate_nullmask(df, columns=columns)
     return out.set_mask(out_mask).fillna(False)

From 053193a8add7d015e984562c714faf40807a06d5 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 19 May 2023 07:55:28 -0700
Subject: [PATCH 20/37] try vendoring some ptxcompiler code

---
 python/cudf/cudf/utils/_numba.py | 46 +++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index 053d7a0e1cb..06aa51aa7fb 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -1,11 +1,43 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
 import glob
+import math
 import os
+import subprocess
+import sys
+import warnings
 
 from numba import config
 
 CC_60_PTX_FILE = os.path.dirname(__file__) + "/../core/udf/shim_60.ptx"
+NO_DRIVER = (math.inf, math.inf)
+
+CMD = """\
+from ctypes import c_int, byref
+from numba import cuda
+dv = c_int(0)
+cuda.cudadrv.driver.driver.cuDriverGetVersion(byref(dv))
+drv_major = dv.value // 1000
+drv_minor = (dv.value - (drv_major * 1000)) // 10
+run_major, run_minor = cuda.runtime.get_version()
+print(f'{drv_major} {drv_minor} {run_major} {run_minor}')
+"""
+
+
+def _get_versions():
+    """
+    This function is mostly vendored from ptxcompiler and is used
+    to check the system CUDA driver and runtime versions in its absence.
+    """
+    cp = subprocess.run([sys.executable, "-c", CMD], capture_output=True)
+    if cp.returncode:
+        return NO_DRIVER
+
+    versions = [int(s) for s in cp.stdout.strip().split()]
+    driver_version = tuple(versions[:2])
+    runtime_version = tuple(versions[2:])
+
+    return driver_version, runtime_version
 
 
 def _get_best_ptx_file(archs, max_compute_capability):
@@ -75,7 +107,19 @@ def _setup_numba():
         # numba in enhanced compatibility mode is not necessary.
         from ptxcompiler.patch import NO_DRIVER, safe_get_versions
     except ImportError:
-        return
+        versions = _get_versions()
+        if versions != NO_DRIVER:
+            driver_version, runtime_version = versions
+            if runtime_version > driver_version:
+                warnings.warn(
+                    f"Using CUDA toolkit version {runtime_version} with CUDA "
+                    f"driver version {driver_version} requires minor version "
+                    "compatibility, which is not yet supported for CUDA "
+                    "driver versions newer than 12.0. It is likely that many "
+                    "cuDF operations will not work in this state. Please "
+                    f"install CUDA toolkit version {driver_version} to "
+                    "continue using cuDF."
+                )
     versions = safe_get_versions()
     if versions != NO_DRIVER:
         driver_version, runtime_version = versions

From c2285fa67adec6d5f836fe6903c5ec685964a559 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 19 May 2023 07:59:22 -0700
Subject: [PATCH 21/37] add the comment about the MVC config option and
 numba.cuda imports back in

---
 python/cudf/cudf/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 170b808b42c..4ea19d543ec 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -84,6 +84,9 @@
 from cudf.utils.dtypes import _NA_REP
 from cudf.utils.utils import clear_cache, set_allocator
 
+# this must be called before numba.cuda is imported, because
+# it sets the numba config variable responsible for enabling
+# MVC. Setting it after importing cuda has no effect.
 _setup_numba()
 
 from numba import cuda

From b72eef004618e209bc69c3b82749e566ba1709f6 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 19 May 2023 08:24:16 -0700
Subject: [PATCH 22/37] fix imports

---
 python/cudf/cudf/__init__.py       | 2 +-
 python/cudf/cudf/core/udf/utils.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 4ea19d543ec..d44d622bdb7 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -80,7 +80,7 @@
     read_text,
 )
 from cudf.options import describe_option, get_option, set_option
-from cudf.utils._setup_numba import _setup_numba
+from cudf.utils._numba import _setup_numba
 from cudf.utils.dtypes import _NA_REP
 from cudf.utils.utils import clear_cache, set_allocator
 
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index 4a3ed23b6bd..fbddd75344f 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -31,7 +31,7 @@
     udf_string,
 )
 from cudf.utils import cudautils
-from cudf.utils._setup_numba import _get_ptx_file
+from cudf.utils._numba import _get_ptx_file
 from cudf.utils.dtypes import (
     BOOL_TYPES,
     DATETIME_TYPES,

From bd27a2f2bd8e09fa6d7d179b163636e5c7453503 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 19 May 2023 09:04:13 -0700
Subject: [PATCH 23/37] switch error

---
 python/cudf/cudf/utils/_numba.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index 06aa51aa7fb..e275c3fa881 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -106,7 +106,7 @@ def _setup_numba():
         # packages. This is ok, because in this situation putting
         # numba in enhanced compatibility mode is not necessary.
         from ptxcompiler.patch import NO_DRIVER, safe_get_versions
-    except ImportError:
+    except ModuleNotFoundError:
         versions = _get_versions()
         if versions != NO_DRIVER:
             driver_version, runtime_version = versions

From 8c9c070927901e6f45f613929f0332ad1dda5f6d Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 19 May 2023 09:29:20 -0700
Subject: [PATCH 24/37] slightly adjust logic

---
 python/cudf/cudf/utils/_numba.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index e275c3fa881..ec6421886c4 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -10,7 +10,7 @@
 from numba import config
 
 CC_60_PTX_FILE = os.path.dirname(__file__) + "/../core/udf/shim_60.ptx"
-NO_DRIVER = (math.inf, math.inf)
+_NO_DRIVER = (math.inf, math.inf)
 
 CMD = """\
 from ctypes import c_int, byref
@@ -31,7 +31,7 @@ def _get_versions():
     """
     cp = subprocess.run([sys.executable, "-c", CMD], capture_output=True)
     if cp.returncode:
-        return NO_DRIVER
+        return _NO_DRIVER
 
     versions = [int(s) for s in cp.stdout.strip().split()]
     driver_version = tuple(versions[:2])
@@ -108,7 +108,7 @@ def _setup_numba():
         from ptxcompiler.patch import NO_DRIVER, safe_get_versions
     except ModuleNotFoundError:
         versions = _get_versions()
-        if versions != NO_DRIVER:
+        if versions != _NO_DRIVER:
             driver_version, runtime_version = versions
             if runtime_version > driver_version:
                 warnings.warn(

From 662b30bba001beea5dff19621c1dd14d5ff9c4bd Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sun, 21 May 2023 19:13:27 -0700
Subject: [PATCH 25/37] add missing return

---
 python/cudf/cudf/utils/_numba.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index ec6421886c4..fd59244985f 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -120,6 +120,7 @@ def _setup_numba():
                     f"install CUDA toolkit version {driver_version} to "
                     "continue using cuDF."
                 )
+        return
     versions = safe_get_versions()
     if versions != NO_DRIVER:
         driver_version, runtime_version = versions

From 93af61398b8d2fd50a51b0fa2d8ace1ceda49ce9 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 22 May 2023 04:05:02 -0700
Subject: [PATCH 26/37] shuffle imports

---
 python/cudf/cudf/__init__.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index d44d622bdb7..96aee9f065e 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -1,11 +1,16 @@
 # Copyright (c) 2018-2023, NVIDIA CORPORATION.
 
+# this must be called before numba.cuda is imported, because
+# it sets the numba config variable responsible for enabling
+# MVC. Setting it after importing cuda has no effect.
+from cudf.utils._numba import _setup_numba
 from cudf.utils.gpu_utils import validate_setup
 
+_setup_numba()
 validate_setup()
 
 import cupy
-from numba import config as numba_config
+from numba import config as numba_config, cuda
 
 import rmm
 from rmm.allocators.cupy import rmm_cupy_allocator
@@ -80,17 +85,9 @@
     read_text,
 )
 from cudf.options import describe_option, get_option, set_option
-from cudf.utils._numba import _setup_numba
 from cudf.utils.dtypes import _NA_REP
 from cudf.utils.utils import clear_cache, set_allocator
 
-# this must be called before numba.cuda is imported, because
-# it sets the numba config variable responsible for enabling
-# MVC. Setting it after importing cuda has no effect.
-_setup_numba()
-
-from numba import cuda
-
 cuda.set_memory_manager(RMMNumbaManager)
 cupy.cuda.set_allocator(rmm_cupy_allocator)
 

From 2ff5c5d8f31196bd2de158f89e964ec305e8d711 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 22 May 2023 06:40:31 -0700
Subject: [PATCH 27/37] delete explicit runtime check for MVC in cuda 12+ as
 it's needed more globally now

---
 python/cudf/cudf/core/udf/utils.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index fbddd75344f..944e5a05842 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -254,14 +254,6 @@ def _compile_or_get(
     use it to allocate an output column of the right dtype.
     """
     # runtime check for CEC mode which is disabled for CUDA 12 for now
-    if cuda.cudadrv.driver.get_version() == (
-        12,
-        0,
-    ) and cuda.cudadrv.runtime.get_version() > (12, 0):
-        raise ValueError(
-            "Minor version compatibility not yet supported for "
-            "CUDA driver versions newer than 12.0"
-        )
 
     if not all(is_scalar(arg) for arg in args):
         raise TypeError("only scalar valued args are supported by apply")

From 5cb0ce6d67b2f2c8813f3140cbbf3311ca4c7743 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 22 May 2023 17:06:43 -0700
Subject: [PATCH 28/37] attempt a simplifying change

---
 python/cudf/cudf/__init__.py           |   3 +
 python/cudf/cudf/utils/_numba.py       |  64 ++++----------
 python/cudf/cudf/utils/_ptxcompiler.py | 118 +++++++++++++++++++++++++
 3 files changed, 136 insertions(+), 49 deletions(-)
 create mode 100644 python/cudf/cudf/utils/_ptxcompiler.py

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 96aee9f065e..ad99b583045 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -1,5 +1,8 @@
 # Copyright (c) 2018-2023, NVIDIA CORPORATION.
+from cuda import cuda, cudart
 
+cuda.cuDriverGetVersion()
+cudart.cudaRuntimeGetVersion()
 # this must be called before numba.cuda is imported, because
 # it sets the numba config variable responsible for enabling
 # MVC. Setting it after importing cuda has no effect.
diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index fd59244985f..6daf41a1f73 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -1,43 +1,12 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
 import glob
-import math
 import os
-import subprocess
-import sys
 import warnings
 
 from numba import config
 
 CC_60_PTX_FILE = os.path.dirname(__file__) + "/../core/udf/shim_60.ptx"
-_NO_DRIVER = (math.inf, math.inf)
-
-CMD = """\
-from ctypes import c_int, byref
-from numba import cuda
-dv = c_int(0)
-cuda.cudadrv.driver.driver.cuDriverGetVersion(byref(dv))
-drv_major = dv.value // 1000
-drv_minor = (dv.value - (drv_major * 1000)) // 10
-run_major, run_minor = cuda.runtime.get_version()
-print(f'{drv_major} {drv_minor} {run_major} {run_minor}')
-"""
-
-
-def _get_versions():
-    """
-    This function is mostly vendored from ptxcompiler and is used
-    to check the system CUDA driver and runtime versions in its absence.
-    """
-    cp = subprocess.run([sys.executable, "-c", CMD], capture_output=True)
-    if cp.returncode:
-        return _NO_DRIVER
-
-    versions = [int(s) for s in cp.stdout.strip().split()]
-    driver_version = tuple(versions[:2])
-    runtime_version = tuple(versions[2:])
-
-    return driver_version, runtime_version
 
 
 def _get_best_ptx_file(archs, max_compute_capability):
@@ -107,27 +76,24 @@ def _setup_numba():
         # numba in enhanced compatibility mode is not necessary.
         from ptxcompiler.patch import NO_DRIVER, safe_get_versions
     except ModuleNotFoundError:
-        versions = _get_versions()
-        if versions != _NO_DRIVER:
-            driver_version, runtime_version = versions
-            if runtime_version > driver_version:
-                warnings.warn(
-                    f"Using CUDA toolkit version {runtime_version} with CUDA "
-                    f"driver version {driver_version} requires minor version "
-                    "compatibility, which is not yet supported for CUDA "
-                    "driver versions newer than 12.0. It is likely that many "
-                    "cuDF operations will not work in this state. Please "
-                    f"install CUDA toolkit version {driver_version} to "
-                    "continue using cuDF."
-                )
-        return
+        # use vendored version
+        from _ptxcompiler import NO_DRIVER, safe_get_versions
+
     versions = safe_get_versions()
     if versions != NO_DRIVER:
         driver_version, runtime_version = versions
-        # Don't check if CEC is necessary in the possible edge
-        # case where a user has a CUDA 12 package and ptxcompiler
-        # in their environment anyways, perhaps installed separately
-        if driver_version < (12, 0):
+        if driver_version >= (12, 0) and runtime_version > driver_version:
+            warnings.warn(
+                f"Using CUDA toolkit version {runtime_version} with CUDA "
+                f"driver version {driver_version} requires minor version "
+                "compatibility, which is not yet supported for CUDA "
+                "driver versions newer than 12.0. It is likely that many "
+                "cuDF operations will not work in this state. Please "
+                f"install CUDA toolkit version {driver_version} to "
+                "continue using cuDF."
+            )
+        else:
+            # Support MVC for all CUDA versions in the 11.x range
             ptx_toolkit_version = _get_cuda_version_from_ptx_file(
                 CC_60_PTX_FILE
             )
diff --git a/python/cudf/cudf/utils/_ptxcompiler.py b/python/cudf/cudf/utils/_ptxcompiler.py
new file mode 100644
index 00000000000..c4b8eda2337
--- /dev/null
+++ b/python/cudf/cudf/utils/_ptxcompiler.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import os
+import subprocess
+import sys
+import warnings
+
+NO_DRIVER = (math.inf, math.inf)
+
+CMD = """\
+from ctypes import c_int, byref
+from numba import cuda
+dv = c_int(0)
+cuda.cudadrv.driver.driver.cuDriverGetVersion(byref(dv))
+drv_major = dv.value // 1000
+drv_minor = (dv.value - (drv_major * 1000)) // 10
+run_major, run_minor = cuda.runtime.get_version()
+print(f'{drv_major} {drv_minor} {run_major} {run_minor}')
+"""
+
+
+def patch_forced_by_user():
+    # The patch is needed if the user explicitly
+    # forced it with an environment variable.
+    apply = os.getenv("PTXCOMPILER_APPLY_NUMBA_CODEGEN_PATCH")
+    if apply is not None:
+        try:
+            apply = int(apply)
+        except ValueError:
+            apply = False
+
+    return bool(apply)
+
+
+def check_disabled_in_env():
+    # We should avoid checking whether the patch is
+    # needed if the user requested that we don't check
+    # (e.g. in a non-fork-safe environment)
+    check = os.getenv("PTXCOMPILER_CHECK_NUMBA_CODEGEN_PATCH_NEEDED")
+    if check is not None:
+        try:
+            check = int(check)
+        except ValueError:
+            check = False
+    else:
+        check = True
+
+    return not check
+
+
+def get_versions():
+    cp = subprocess.run([sys.executable, "-c", CMD], capture_output=True)
+    if cp.returncode:
+        msg = (
+            f"Error getting driver and runtime versions:\n\nstdout:\n\n"
+            f"{cp.stdout.decode()}\n\nstderr:\n\n{cp.stderr.decode()}\n\n"
+            "Not patching Numba"
+        )
+        warnings.warn(msg, UserWarning)
+        return NO_DRIVER
+
+    versions = [int(s) for s in cp.stdout.strip().split()]
+    driver_version = tuple(versions[:2])
+    runtime_version = tuple(versions[2:])
+
+    return driver_version, runtime_version
+
+
+def safe_get_versions():
+    """
+    Return a 2-tuple of deduced driver and runtime versions.
+
+    To ensure that this function does not initialize a CUDA context,
+    calls to the runtime and driver are made in a subprocess.
+
+    If PTXCOMPILER_CHECK_NUMBA_CODEGEN_PATCH_NEEDED is set
+    in the environment, then this subprocess call is not launched.
+    To specify the driver and runtime versions of the environment
+    in this case, set PTXCOMPILER_KNOWN_DRIVER_VERSION and
+    PTXCOMPILER_KNOWN_RUNTIME_VERSION appropriately.
+    """
+    if check_disabled_in_env():
+        try:
+            # allow user to specify driver/runtime
+            # versions manually, if necessary
+            driver_version = os.environ[
+                "PTXCOMPILER_KNOWN_DRIVER_VERSION"
+            ].split(".")
+            runtime_version = os.environ[
+                "PTXCOMPILER_KNOWN_RUNTIME_VERSION"
+            ].split(".")
+            driver_version, runtime_version = (
+                tuple(map(int, driver_version)),
+                tuple(map(int, runtime_version)),
+            )
+        except (KeyError, ValueError):
+            warnings.warn(
+                "No way to determine driver and runtime versions for "
+                "patching, set PTXCOMPILER_KNOWN_DRIVER_VERSION and "
+                "PTXCOMPILER_KNOWN_RUNTIME_VERSION"
+            )
+            return NO_DRIVER
+    else:
+        driver_version, runtime_version = get_versions()
+    return driver_version, runtime_version

From fc69663fc8ed27b4817c104a2f238dcc5d4bbe5e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 22 May 2023 17:35:44 -0700
Subject: [PATCH 29/37] update ptx/ctk version mapping table

---
 python/cudf/cudf/utils/_numba.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index 6daf41a1f73..71c2a2932bc 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -137,11 +137,17 @@ def _get_cuda_version_from_ptx_file(path):
     version = ver_line.strip("\n").split(" ")[1]
     # from ptx_docs/release_notes above:
     ver_map = {
+        "7.0": (11, 0),
+        "7.1": (11, 1),
+        "7.2": (11, 2),
+        "7.3": (11, 3),
+        "7.4": (11, 4),
         "7.5": (11, 5),
         "7.6": (11, 6),
         "7.7": (11, 7),
         "7.8": (11, 8),
         "8.0": (12, 0),
+        "8.1": (12, 1),
     }
 
     cuda_ver = ver_map.get(version)

From 0797cdeab35128eba56e3419522d9c1ac78542cd Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 22 May 2023 18:22:32 -0700
Subject: [PATCH 30/37] fix local imports

---
 python/cudf/cudf/utils/_numba.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index 71c2a2932bc..f691553e9d2 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -77,7 +77,7 @@ def _setup_numba():
         from ptxcompiler.patch import NO_DRIVER, safe_get_versions
     except ModuleNotFoundError:
         # use vendored version
-        from _ptxcompiler import NO_DRIVER, safe_get_versions
+        from cudf.utils._ptxcompiler import NO_DRIVER, safe_get_versions
 
     versions = safe_get_versions()
     if versions != NO_DRIVER:

From e7999927f17cfb5adbec5b52f2aa5ebf801897e7 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 22 May 2023 20:29:40 -0700
Subject: [PATCH 31/37] remove extraneous testing code

---
 python/cudf/cudf/__init__.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index ad99b583045..96aee9f065e 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -1,8 +1,5 @@
 # Copyright (c) 2018-2023, NVIDIA CORPORATION.
-from cuda import cuda, cudart
 
-cuda.cuDriverGetVersion()
-cudart.cudaRuntimeGetVersion()
 # this must be called before numba.cuda is imported, because
 # it sets the numba config variable responsible for enabling
 # MVC. Setting it after importing cuda has no effect.

From 41e92a9d90a32740733bd37d13ffb71b8884fbb3 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 23 May 2023 07:17:35 -0500
Subject: [PATCH 32/37] Apply suggestions from code review

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
Co-authored-by: Graham Markall <535640+gmarkall@users.noreply.github.com>
---
 python/cudf/cudf/__init__.py     | 4 ++--
 python/cudf/cudf/utils/_numba.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 96aee9f065e..de0f2d67add 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -1,8 +1,8 @@
 # Copyright (c) 2018-2023, NVIDIA CORPORATION.
 
-# this must be called before numba.cuda is imported, because
+# _setup_numba must be called before numba.cuda is imported, because
 # it sets the numba config variable responsible for enabling
-# MVC. Setting it after importing cuda has no effect.
+# Minor Version Compatibility. Setting it after importing numba.cuda has no effect.
 from cudf.utils._numba import _setup_numba
 from cudf.utils.gpu_utils import validate_setup
 
diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index f691553e9d2..49d9ea679fd 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -87,7 +87,7 @@ def _setup_numba():
                 f"Using CUDA toolkit version {runtime_version} with CUDA "
                 f"driver version {driver_version} requires minor version "
                 "compatibility, which is not yet supported for CUDA "
-                "driver versions newer than 12.0. It is likely that many "
+                "driver versions 12.0 and above. It is likely that many "
                 "cuDF operations will not work in this state. Please "
                 f"install CUDA toolkit version {driver_version} to "
                 "continue using cuDF."

From 8839f8c7157d3f3c8bddef39cbf5cc376087701e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 23 May 2023 05:19:48 -0700
Subject: [PATCH 33/37] cleanup

---
 python/cudf/cudf/core/udf/utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index 944e5a05842..35a3f6c1ffd 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -253,8 +253,6 @@ def _compile_or_get(
     we then obtain the return type from that separate compilation and
     use it to allocate an output column of the right dtype.
     """
-    # runtime check for CEC mode which is disabled for CUDA 12 for now
-
     if not all(is_scalar(arg) for arg in args):
         raise TypeError("only scalar valued args are supported by apply")
 

From c27a4b1312bbb23f5e52d6d548659e2f2cd1351e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 23 May 2023 05:26:08 -0700
Subject: [PATCH 34/37] clarify cuda 12 comments

---
 python/cudf/cudf/utils/_numba.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index 49d9ea679fd..b381ad93ec5 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -70,10 +70,12 @@ def _setup_numba():
     version of the CUDA Toolkit used to build the PTX files shipped
     with the user cuDF package.
     """
+    # ptxcompiler is a requirement for cuda 11.x packages but not
+    # cuda 12.x packages. However its version checking machinery
+    # is still necessary. If a user happens to have ptxcompiler
+    # in a cuda 12 environment, its use for the purposes of
+    # checking the driver and runtime versions is harmless
     try:
-        # By default, ptxcompiler will not be installed with CUDA 12
-        # packages. This is ok, because in this situation putting
-        # numba in enhanced compatibility mode is not necessary.
         from ptxcompiler.patch import NO_DRIVER, safe_get_versions
     except ModuleNotFoundError:
         # use vendored version

From 6925612438d6efb7c6c1f8104ed8dd861419d80e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 23 May 2023 05:28:31 -0700
Subject: [PATCH 35/37] version map changes

---
 python/cudf/cudf/utils/_numba.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index b381ad93ec5..4e9ef65b918 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -137,13 +137,12 @@ def _get_cuda_version_from_ptx_file(path):
         else:
             raise ValueError("Could not read CUDA version from ptx file.")
     version = ver_line.strip("\n").split(" ")[1]
-    # from ptx_docs/release_notes above:
+    # This dictionary maps each PTX version (as read from the PTX
+    # file) to the CUDA version that produces it. The lowest entry
+    # should be the minimum CUDA version required to compile the
+    # library. Currently CUDA 11.5 or higher is required to build cudf.
+    # New CUDA versions should be added here when officially supported.
     ver_map = {
-        "7.0": (11, 0),
-        "7.1": (11, 1),
-        "7.2": (11, 2),
-        "7.3": (11, 3),
-        "7.4": (11, 4),
         "7.5": (11, 5),
         "7.6": (11, 6),
         "7.7": (11, 7),

From 439a667966d25056932010a05d92e327c789a831 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 23 May 2023 05:42:11 -0700
Subject: [PATCH 36/37] remove function from ptxcompiler that is not used

---
 python/cudf/cudf/utils/_ptxcompiler.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/python/cudf/cudf/utils/_ptxcompiler.py b/python/cudf/cudf/utils/_ptxcompiler.py
index c4b8eda2337..5ef27916da7 100644
--- a/python/cudf/cudf/utils/_ptxcompiler.py
+++ b/python/cudf/cudf/utils/_ptxcompiler.py
@@ -32,19 +32,6 @@
 """
 
 
-def patch_forced_by_user():
-    # The patch is needed if the user explicitly
-    # forced it with an environment variable.
-    apply = os.getenv("PTXCOMPILER_APPLY_NUMBA_CODEGEN_PATCH")
-    if apply is not None:
-        try:
-            apply = int(apply)
-        except ValueError:
-            apply = False
-
-    return bool(apply)
-
-
 def check_disabled_in_env():
     # We should avoid checking whether the patch is
     # needed if the user requested that we don't check

From 1bfb3827f3ac14b7ea9717944aee67e4ec72a4d4 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 23 May 2023 09:05:44 -0700
Subject: [PATCH 37/37] address remaining reviews

---
 python/cudf/cudf/utils/_numba.py       | 4 +++-
 python/cudf/cudf/utils/_ptxcompiler.py | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index 4e9ef65b918..194db9c90a6 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -6,7 +6,9 @@
 
 from numba import config
 
-CC_60_PTX_FILE = os.path.dirname(__file__) + "/../core/udf/shim_60.ptx"
+CC_60_PTX_FILE = os.path.join(
+    os.path.dirname(__file__), "../core/udf/shim_60.ptx"
+)
 
 
 def _get_best_ptx_file(archs, max_compute_capability):
diff --git a/python/cudf/cudf/utils/_ptxcompiler.py b/python/cudf/cudf/utils/_ptxcompiler.py
index 5ef27916da7..54f5ea08ee1 100644
--- a/python/cudf/cudf/utils/_ptxcompiler.py
+++ b/python/cudf/cudf/utils/_ptxcompiler.py
@@ -20,7 +20,7 @@
 
 NO_DRIVER = (math.inf, math.inf)
 
-CMD = """\
+NUMBA_CHECK_VERSION_CMD = """\
 from ctypes import c_int, byref
 from numba import cuda
 dv = c_int(0)
@@ -49,7 +49,9 @@ def check_disabled_in_env():
 
 
 def get_versions():
-    cp = subprocess.run([sys.executable, "-c", CMD], capture_output=True)
+    cp = subprocess.run(
+        [sys.executable, "-c", NUMBA_CHECK_VERSION_CMD], capture_output=True
+    )
     if cp.returncode:
         msg = (
             f"Error getting driver and runtime versions:\n\nstdout:\n\n"