diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 4031f1aa1c3..b6daea7c2bc 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -48,7 +48,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.56.4,<0.57 +- numba>=0.57 - numpy>=1.21,<1.24 - numpydoc - nvcc_linux-64=11.8 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 005792d187f..6a0faa0ebbc 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -64,7 +64,7 @@ requirements: - typing_extensions - pandas >=1.3,<1.6.0dev0 - cupy >=12.0.0 - - numba >=0.56.4,<0.57 + - numba >=0.57 - numpy >=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} - libcudf {{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index 70d7f8c1ec8..e3fcbe69932 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -350,7 +350,7 @@ dependencies: packages: - cachetools - cuda-python>=11.7.1,<12.0 - - &numba numba>=0.56.4,<0.57 + - &numba numba>=0.57 - nvtx>=0.2.1 - packaging - rmm==23.6.* diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 06310e278a2..de0f2d67add 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -1,7 +1,12 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. +# _setup_numba must be called before numba.cuda is imported, because +# it sets the numba config variable responsible for enabling +# Minor Version Compatibility. Setting it after importing numba.cuda has no effect. 
+from cudf.utils._numba import _setup_numba from cudf.utils.gpu_utils import validate_setup +_setup_numba() validate_setup() import cupy @@ -83,32 +88,9 @@ from cudf.utils.dtypes import _NA_REP from cudf.utils.utils import clear_cache, set_allocator -try: - from cubinlinker.patch import patch_numba_linker_if_needed -except ImportError: - pass -else: - # Patch Numba to support CUDA enhanced compatibility. - # cuDF requires a stronger set of conditions than what is - # checked by patch_numba_linker_if_needed due to the PTX - # files needed for JIT Groupby Apply and string UDFs - from cudf.core.udf.utils import _PTX_FILE, _setup_numba_linker - - _setup_numba_linker(_PTX_FILE) - - del patch_numba_linker_if_needed - cuda.set_memory_manager(RMMNumbaManager) cupy.cuda.set_allocator(rmm_cupy_allocator) -try: - # Numba 0.54: Disable low occupancy warnings - numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 -except AttributeError: - # Numba < 0.54: No occupancy warnings - pass -del numba_config - rmm.register_reinitialize_hook(clear_cache) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7141958f62d..abd8ad162c7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -68,6 +68,7 @@ _return_arr_from_dtype, ) from cudf.utils import docutils +from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.utils import _cudf_nvtx_annotate doc_reset_index_template = """ @@ -2193,7 +2194,8 @@ def _apply(self, func, kernel_getter, *args, **kwargs): input_args = _get_input_args_from_frame(self) launch_args = output_args + input_args + list(args) try: - kernel.forall(len(self))(*launch_args) + with _CUDFNumbaConfig(): + kernel.forall(len(self))(*launch_args) except Exception as e: raise RuntimeError("UDF kernel execution failed.") from e diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index f0d168fb733..60eba7eb37b 100644 --- 
a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -27,6 +27,7 @@ _supported_cols_from_frame, _supported_dtypes_from_frame, ) +from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.utils import _cudf_nvtx_annotate @@ -196,7 +197,8 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): ) # Launch kernel - specialized[ngroups, tpb](*launch_args) + with _CUDFNumbaConfig(): + specialized[ngroups, tpb](*launch_args) return output diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 9d7df530ccc..35a3f6c1ffd 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -1,6 +1,5 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -import glob import os from typing import Any, Callable, Dict @@ -13,7 +12,6 @@ from numba.core.datamodel import default_manager, models from numba.core.errors import TypingError from numba.core.extending import register_model -from numba.cuda.cudadrv.driver import Linker from numba.np import numpy_support from numba.types import CPointer, Poison, Record, Tuple, boolean, int64, void @@ -33,6 +31,7 @@ udf_string, ) from cudf.utils import cudautils +from cudf.utils._numba import _get_ptx_file from cudf.utils.dtypes import ( BOOL_TYPES, DATETIME_TYPES, @@ -63,58 +62,6 @@ precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) launch_arg_getters: Dict[Any, Any] = {} - -def _get_best_ptx_file(archs, max_compute_capability): - """ - Determine of the available PTX files which one is - the most recent up to and including the device cc - """ - filtered_archs = [x for x in archs if x[0] <= max_compute_capability] - if filtered_archs: - return max(filtered_archs, key=lambda y: y[0]) - else: - return None - - -def _get_ptx_file(path, prefix): - if "RAPIDS_NO_INITIALIZE" in os.environ: - # cc=60 ptx is always built - cc = int(os.environ.get("STRINGS_UDF_CC", "60")) - else: - dev = cuda.get_current_device() - - # Load 
the highest compute capability file available that is less than - # the current device's. - cc = int("".join(str(x) for x in dev.compute_capability)) - files = glob.glob(os.path.join(path, f"{prefix}*.ptx")) - if len(files) == 0: - raise RuntimeError(f"Missing PTX files for cc={cc}") - regular_sms = [] - - for f in files: - file_name = os.path.basename(f) - sm_number = file_name.rstrip(".ptx").lstrip(prefix) - if sm_number.endswith("a"): - processed_sm_number = int(sm_number.rstrip("a")) - if processed_sm_number == cc: - return f - else: - regular_sms.append((int(sm_number), f)) - - regular_result = None - - if regular_sms: - regular_result = _get_best_ptx_file(regular_sms, cc) - - if regular_result is None: - raise RuntimeError( - "This cuDF installation is missing the necessary PTX " - f"files that are <={cc}." - ) - else: - return regular_result[1] - - _PTX_FILE = _get_ptx_file(os.path.dirname(__file__), "shim_") @@ -392,97 +339,6 @@ def _get_extensionty_size(ty): return llty.get_abi_size(target_data) -def _get_cuda_version_from_ptx_file(path): - """ - https://docs.nvidia.com/cuda/parallel-thread-execution/ - Each PTX module must begin with a .version - directive specifying the PTX language version - - example header: - // - // Generated by NVIDIA NVVM Compiler - // - // Compiler Build ID: CL-31057947 - // Cuda compilation tools, release 11.6, V11.6.124 - // Based on NVVM 7.0.1 - // - - .version 7.6 - .target sm_52 - .address_size 64 - - """ - with open(path) as ptx_file: - for line in ptx_file: - if line.startswith(".version"): - ver_line = line - break - else: - raise ValueError("Could not read CUDA version from ptx file.") - version = ver_line.strip("\n").split(" ")[1] - # from ptx_docs/release_notes above: - ver_map = { - "7.5": (11, 5), - "7.6": (11, 6), - "7.7": (11, 7), - "7.8": (11, 8), - "8.0": (12, 0), - } - - cuda_ver = ver_map.get(version) - if cuda_ver is None: - raise ValueError( - f"Could not map PTX version {version} to a CUDA version" - ) - - 
return cuda_ver - - -def _setup_numba_linker(path): - from ptxcompiler.patch import NO_DRIVER, safe_get_versions - - from cudf.core.udf.utils import ( - _get_cuda_version_from_ptx_file, - maybe_patch_numba_linker, - ) - - versions = safe_get_versions() - if versions != NO_DRIVER: - driver_version, runtime_version = versions - ptx_toolkit_version = _get_cuda_version_from_ptx_file(path) - maybe_patch_numba_linker( - driver_version, runtime_version, ptx_toolkit_version - ) - - -def maybe_patch_numba_linker( - driver_version, runtime_version, ptx_toolkit_version -): - from cubinlinker.patch import ( - _numba_version_ok, - get_logger, - new_patched_linker, - ) - - # Numba thinks cubinlinker is only needed if the driver is older than - # the ctk, but when PTX files are present, it might also need to patch - # because those PTX files may newer than the driver as well - logger = get_logger() - - if (driver_version < ptx_toolkit_version) or ( - driver_version < runtime_version - ): - logger.debug( - "Driver version %s.%s needs patching due to PTX files" - % driver_version - ) - if _numba_version_ok: - logger.debug("Patching Numba Linker") - Linker.new = new_patched_linker - else: - logger.debug("Cannot patch Numba Linker - unsupported version") - - @initfunc def set_malloc_heap_size(size=None): """ diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index f1ed17c5df5..857cc114ffa 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
import operator import cupy as cp @@ -12,6 +12,7 @@ from cudf.core.udf.api import Masked from cudf.core.udf.masked_typing import MaskedType from cudf.testing._utils import parametrize_numeric_dtypes_pairwise +from cudf.utils._numba import _CUDFNumbaConfig arith_ops = ( operator.add, @@ -106,7 +107,8 @@ def test_kernel(x, y, err): err[0] = 3 err = cp.asarray([0], dtype="int8") - test_kernel[1, 1](1, 2, err) + with _CUDFNumbaConfig(): + test_kernel[1, 1](1, 2, err) assert err[0] == 0 @@ -214,7 +216,8 @@ def test_kernel(err): err[0] = 2 err = cp.asarray([0], dtype="int8") - test_kernel[1, 1](err) + with _CUDFNumbaConfig(): + test_kernel[1, 1](err) assert err[0] == 0 @@ -304,7 +307,8 @@ def test_kernel(err): err[0] = 2 err = cp.asarray([0], dtype="int8") - test_kernel[1, 1](err) + with _CUDFNumbaConfig(): + test_kernel[1, 1](err) assert err[0] == 0 @@ -326,5 +330,6 @@ def test_kernel(err): err[0] = 1 err = cp.asarray([0], dtype="int8") - test_kernel[1, 1](err) + with _CUDFNumbaConfig(): + test_kernel[1, 1](err) assert err[0] == 0 diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index 049dfdc8e30..88c73ccf964 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -22,6 +22,7 @@ ) from cudf.core.udf.utils import _PTX_FILE, _get_extensionty_size from cudf.testing._utils import assert_eq, sv_to_udf_str +from cudf.utils._numba import _CUDFNumbaConfig def get_kernels(func, dtype, size): @@ -85,8 +86,8 @@ def run_udf_test(data, func, dtype): sv_kernel, udf_str_kernel = get_kernels(func, dtype, len(data)) expect = pd.Series(data).apply(func) - - sv_kernel.forall(len(data))(str_views, output) + with _CUDFNumbaConfig(): + sv_kernel.forall(len(data))(str_views, output) if dtype == "str": result = column_from_udf_string_array(output) else: @@ -94,7 +95,8 @@ def run_udf_test(data, func, dtype): got = cudf.Series(result, dtype=dtype) assert_eq(expect, got, check_dtype=False) - 
udf_str_kernel.forall(len(data))(str_views, output) + with _CUDFNumbaConfig(): + udf_str_kernel.forall(len(data))(str_views, output) if dtype == "str": result = column_from_udf_string_array(output) else: diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py new file mode 100644 index 00000000000..194db9c90a6 --- /dev/null +++ b/python/cudf/cudf/utils/_numba.py @@ -0,0 +1,171 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +import glob +import os +import warnings + +from numba import config + +CC_60_PTX_FILE = os.path.join( + os.path.dirname(__file__), "../core/udf/shim_60.ptx" +) + + +def _get_best_ptx_file(archs, max_compute_capability): + """ + Determine of the available PTX files which one is + the most recent up to and including the device compute capability. + """ + filtered_archs = [x for x in archs if x[0] <= max_compute_capability] + if filtered_archs: + return max(filtered_archs, key=lambda x: x[0]) + else: + return None + + +def _get_ptx_file(path, prefix): + if "RAPIDS_NO_INITIALIZE" in os.environ: + # cc=60 ptx is always built + cc = int(os.environ.get("STRINGS_UDF_CC", "60")) + else: + from numba import cuda + + dev = cuda.get_current_device() + + # Load the highest compute capability file available that is less than + # the current device's. 
+ cc = int("".join(str(x) for x in dev.compute_capability)) + files = glob.glob(os.path.join(path, f"{prefix}*.ptx")) + if len(files) == 0: + raise RuntimeError(f"Missing PTX files for cc={cc}") + regular_sms = [] + + for f in files: + file_name = os.path.basename(f) + sm_number = file_name.rstrip(".ptx").lstrip(prefix) + if sm_number.endswith("a"): + processed_sm_number = int(sm_number.rstrip("a")) + if processed_sm_number == cc: + return f + else: + regular_sms.append((int(sm_number), f)) + + regular_result = None + + if regular_sms: + regular_result = _get_best_ptx_file(regular_sms, cc) + + if regular_result is None: + raise RuntimeError( + "This cuDF installation is missing the necessary PTX " + f"files that are <={cc}." + ) + else: + return regular_result[1] + + +def _setup_numba(): + """ + Configure the numba linker for use with cuDF. This consists of + potentially putting numba into enhanced compatibility mode + based on the user driver and runtime versions as well as the + version of the CUDA Toolkit used to build the PTX files shipped + with the user cuDF package. + """ + # ptxcompiler is a requirement for cuda 11.x packages but not + # cuda 12.x packages. However its version checking machinery + # is still necessary. If a user happens to have ptxcompiler + # in a cuda 12 environment, its use for the purposes of + # checking the driver and runtime versions is harmless + try: + from ptxcompiler.patch import NO_DRIVER, safe_get_versions + except ModuleNotFoundError: + # use vendored version + from cudf.utils._ptxcompiler import NO_DRIVER, safe_get_versions + + versions = safe_get_versions() + if versions != NO_DRIVER: + driver_version, runtime_version = versions + if driver_version >= (12, 0) and runtime_version > driver_version: + warnings.warn( + f"Using CUDA toolkit version {runtime_version} with CUDA " + f"driver version {driver_version} requires minor version " + "compatibility, which is not yet supported for CUDA " + "driver versions 12.0 and above. 
It is likely that many " + "cuDF operations will not work in this state. Please " + f"install CUDA toolkit version {driver_version} to " + "continue using cuDF." + ) + else: + # Support MVC for all CUDA versions in the 11.x range + ptx_toolkit_version = _get_cuda_version_from_ptx_file( + CC_60_PTX_FILE + ) + # Numba thinks cubinlinker is only needed if the driver is older + # than the CUDA runtime, but when PTX files are present, it might + # also need to patch because those PTX files may be compiled by + # a CUDA version that is newer than the driver as well + if (driver_version < ptx_toolkit_version) or ( + driver_version < runtime_version + ): + config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1 + + +def _get_cuda_version_from_ptx_file(path): + """ + https://docs.nvidia.com/cuda/parallel-thread-execution/ + Each PTX module must begin with a .version + directive specifying the PTX language version + + example header: + // + // Generated by NVIDIA NVVM Compiler + // + // Compiler Build ID: CL-31057947 + // Cuda compilation tools, release 11.6, V11.6.124 + // Based on NVVM 7.0.1 + // + + .version 7.6 + .target sm_52 + .address_size 64 + + """ + with open(path) as ptx_file: + for line in ptx_file: + if line.startswith(".version"): + ver_line = line + break + else: + raise ValueError("Could not read CUDA version from ptx file.") + version = ver_line.strip("\n").split(" ")[1] + # This dictionary maps from supported versions of NVVM to the + # PTX version it produces. The lowest value should be the minimum + # CUDA version required to compile the library. Currently CUDA 11.5 + # or higher is required to build cudf. New CUDA versions should + # be added to this dictionary when officially supported. 
+ ver_map = { + "7.5": (11, 5), + "7.6": (11, 6), + "7.7": (11, 7), + "7.8": (11, 8), + "8.0": (12, 0), + "8.1": (12, 1), + } + + cuda_ver = ver_map.get(version) + if cuda_ver is None: + raise ValueError( + f"Could not map PTX version {version} to a CUDA version" + ) + + return cuda_ver + + +class _CUDFNumbaConfig: + def __enter__(self): + self.enter_val = config.CUDA_LOW_OCCUPANCY_WARNINGS + config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + + def __exit__(self, exc_type, exc_value, traceback): + config.CUDA_LOW_OCCUPANCY_WARNINGS = self.enter_val diff --git a/python/cudf/cudf/utils/_ptxcompiler.py b/python/cudf/cudf/utils/_ptxcompiler.py new file mode 100644 index 00000000000..54f5ea08ee1 --- /dev/null +++ b/python/cudf/cudf/utils/_ptxcompiler.py @@ -0,0 +1,107 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import os +import subprocess +import sys +import warnings + +NO_DRIVER = (math.inf, math.inf) + +NUMBA_CHECK_VERSION_CMD = """\ +from ctypes import c_int, byref +from numba import cuda +dv = c_int(0) +cuda.cudadrv.driver.driver.cuDriverGetVersion(byref(dv)) +drv_major = dv.value // 1000 +drv_minor = (dv.value - (drv_major * 1000)) // 10 +run_major, run_minor = cuda.runtime.get_version() +print(f'{drv_major} {drv_minor} {run_major} {run_minor}') +""" + + +def check_disabled_in_env(): + # We should avoid checking whether the patch is + # needed if the user requested that we don't check + # (e.g. 
in a non-fork-safe environment) + check = os.getenv("PTXCOMPILER_CHECK_NUMBA_CODEGEN_PATCH_NEEDED") + if check is not None: + try: + check = int(check) + except ValueError: + check = False + else: + check = True + + return not check + + +def get_versions(): + cp = subprocess.run( + [sys.executable, "-c", NUMBA_CHECK_VERSION_CMD], capture_output=True + ) + if cp.returncode: + msg = ( + f"Error getting driver and runtime versions:\n\nstdout:\n\n" + f"{cp.stdout.decode()}\n\nstderr:\n\n{cp.stderr.decode()}\n\n" + "Not patching Numba" + ) + warnings.warn(msg, UserWarning) + return NO_DRIVER + + versions = [int(s) for s in cp.stdout.strip().split()] + driver_version = tuple(versions[:2]) + runtime_version = tuple(versions[2:]) + + return driver_version, runtime_version + + +def safe_get_versions(): + """ + Return a 2-tuple of deduced driver and runtime versions. + + To ensure that this function does not initialize a CUDA context, + calls to the runtime and driver are made in a subprocess. + + If PTXCOMPILER_CHECK_NUMBA_CODEGEN_PATCH_NEEDED is set + in the environment, then this subprocess call is not launched. + To specify the driver and runtime versions of the environment + in this case, set PTXCOMPILER_KNOWN_DRIVER_VERSION and + PTXCOMPILER_KNOWN_RUNTIME_VERSION appropriately. 
+ """ + if check_disabled_in_env(): + try: + # allow user to specify driver/runtime + # versions manually, if necessary + driver_version = os.environ[ + "PTXCOMPILER_KNOWN_DRIVER_VERSION" + ].split(".") + runtime_version = os.environ[ + "PTXCOMPILER_KNOWN_RUNTIME_VERSION" + ].split(".") + driver_version, runtime_version = ( + tuple(map(int, driver_version)), + tuple(map(int, runtime_version)), + ) + except (KeyError, ValueError): + warnings.warn( + "No way to determine driver and runtime versions for " + "patching, set PTXCOMPILER_KNOWN_DRIVER_VERSION and " + "PTXCOMPILER_KNOWN_RUNTIME_VERSION" + ) + return NO_DRIVER + else: + driver_version, runtime_version = get_versions() + return driver_version, runtime_version diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index 933b98367b6..b8cf6910402 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -12,6 +12,7 @@ from cudf.core.buffer import acquire_spill_lock from cudf.core.column import column from cudf.utils import utils +from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import docfmt_partial _doc_applyparams = """ @@ -195,7 +196,8 @@ def compile(self, func, argnames, extra_argnames): return kernel def launch_kernel(self, df, args): - self.kernel.forall(len(df))(*args) + with _CUDFNumbaConfig(): + self.kernel.forall(len(df))(*args) class ApplyChunksCompiler(ApplyKernelCompilerBase): @@ -209,12 +211,14 @@ def compile(self, func, argnames, extra_argnames): def launch_kernel(self, df, args, chunks, blkct=None, tpb=None): chunks = self.normalize_chunks(len(df), chunks) if blkct is None and tpb is None: - self.kernel.forall(len(df))(len(df), chunks, *args) + with _CUDFNumbaConfig(): + self.kernel.forall(len(df))(len(df), chunks, *args) else: assert tpb is not None if blkct is None: blkct = chunks.size - self.kernel[blkct, tpb](len(df), chunks, *args) + with _CUDFNumbaConfig(): + self.kernel[blkct, tpb](len(df), 
chunks, *args) def normalize_chunks(self, size, chunks): if isinstance(chunks, int): diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index e2bd4556ce8..a10eaab0bff 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from pickle import dumps @@ -8,6 +8,7 @@ from numba.np import numpy_support import cudf +from cudf.utils._numba import _CUDFNumbaConfig # # Misc kernels @@ -80,19 +81,20 @@ def find_index_of_val(arr, val, mask=None, compare="eq"): """ found = cuda.device_array(shape=(arr.shape), dtype="int32") if found.size > 0: - if compare == "gt": - gpu_mark_gt.forall(found.size)(arr, val, found, arr.size) - elif compare == "lt": - gpu_mark_lt.forall(found.size)(arr, val, found, arr.size) - else: - if arr.dtype in ("float32", "float64"): - gpu_mark_found_float.forall(found.size)( - arr, val, found, arr.size - ) + with _CUDFNumbaConfig(): + if compare == "gt": + gpu_mark_gt.forall(found.size)(arr, val, found, arr.size) + elif compare == "lt": + gpu_mark_lt.forall(found.size)(arr, val, found, arr.size) else: - gpu_mark_found_int.forall(found.size)( - arr, val, found, arr.size - ) + if arr.dtype in ("float32", "float64"): + gpu_mark_found_float.forall(found.size)( + arr, val, found, arr.size + ) + else: + gpu_mark_found_int.forall(found.size)( + arr, val, found, arr.size + ) return cudf.core.column.column.as_column(found).set_mask(mask) @@ -154,9 +156,10 @@ def gpu_window_sizes_from_offset(arr, window_sizes, offset): def window_sizes_from_offset(arr, offset): window_sizes = cuda.device_array(shape=(arr.shape), dtype="int32") if arr.size > 0: - gpu_window_sizes_from_offset.forall(arr.size)( - arr, window_sizes, offset - ) + with _CUDFNumbaConfig(): + gpu_window_sizes_from_offset.forall(arr.size)( + arr, window_sizes, offset + ) return window_sizes @@ -177,9 +180,10 @@ def 
gpu_grouped_window_sizes_from_offset( def grouped_window_sizes_from_offset(arr, group_starts, offset): window_sizes = cuda.device_array(shape=(arr.shape), dtype="int32") if arr.size > 0: - gpu_grouped_window_sizes_from_offset.forall(arr.size)( - arr, window_sizes, group_starts, offset - ) + with _CUDFNumbaConfig(): + gpu_grouped_window_sizes_from_offset.forall(arr.size)( + arr, window_sizes, group_starts, offset + ) return window_sizes diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 4ce89b526d6..51093375eda 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -11,6 +11,7 @@ from cudf.core.buffer import acquire_spill_lock from cudf.core.column import column_empty from cudf.utils import applyutils +from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.dtypes import ( BOOL_TYPES, DATETIME_TYPES, @@ -247,6 +248,7 @@ def query_execute(df, expr, callenv): out = column_empty(nrows, dtype=np.bool_) # run kernel args = [out] + colarrays + envargs - kernel.forall(nrows)(*args) + with _CUDFNumbaConfig(): + kernel.forall(nrows)(*args) out_mask = applyutils.make_aggregate_nullmask(df, columns=columns) return out.set_mask(out_mask).fillna(False) diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index d13324a7404..b08dd92d52f 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "cuda-python>=11.7.1,<12.0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "numba>=0.56.4,<0.57", + "numba>=0.57", "numpy>=1.21,<1.24", "nvtx>=0.2.1", "packaging", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index ff2a3f2d095..42b6c26c002 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -40,7 +40,7 @@ dynamic = ["entry-points"] [project.optional-dependencies] test = [ "dask-cuda==23.6.*", - "numba>=0.56.4,<0.57", + "numba>=0.57", "pytest", "pytest-cov", "pytest-xdist",