Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Forward-merge branch-23.06 to branch-23.08 #13420

Merged
merged 1 commit into from
May 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ dependencies:
- nbsphinx
- ninja
- notebook
- numba>=0.56.4,<0.57
- numba>=0.57
- numpy>=1.21,<1.24
- numpydoc
- nvcc_linux-64=11.8
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ requirements:
- typing_extensions
- pandas >=1.3,<1.6.0dev0
- cupy >=12.0.0
- numba >=0.56.4,<0.57
- numba >=0.57
- numpy >=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations
- {{ pin_compatible('pyarrow', max_pin='x.x.x') }}
- libcudf {{ version }}
Expand Down
2 changes: 1 addition & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ dependencies:
packages:
- cachetools
- cuda-python>=11.7.1,<12.0
- &numba numba>=0.56.4,<0.57
- &numba numba>=0.57
- nvtx>=0.2.1
- packaging
- rmm==23.6.*
Expand Down
28 changes: 5 additions & 23 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.

# _setup_numba must be called before numba.cuda is imported, because
# it sets the numba config variable responsible for enabling
# Minor Version Compatibility. Setting it after importing numba.cuda has no effect.
from cudf.utils._numba import _setup_numba
from cudf.utils.gpu_utils import validate_setup

_setup_numba()
validate_setup()

import cupy
Expand Down Expand Up @@ -83,32 +88,9 @@
from cudf.utils.dtypes import _NA_REP
from cudf.utils.utils import clear_cache, set_allocator

try:
from cubinlinker.patch import patch_numba_linker_if_needed
except ImportError:
pass
else:
# Patch Numba to support CUDA enhanced compatibility.
# cuDF requires a stronger set of conditions than what is
# checked by patch_numba_linker_if_needed due to the PTX
# files needed for JIT Groupby Apply and string UDFs
from cudf.core.udf.utils import _PTX_FILE, _setup_numba_linker

_setup_numba_linker(_PTX_FILE)

del patch_numba_linker_if_needed

cuda.set_memory_manager(RMMNumbaManager)
cupy.cuda.set_allocator(rmm_cupy_allocator)

try:
# Numba 0.54: Disable low occupancy warnings
numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
except AttributeError:
# Numba < 0.54: No occupancy warnings
pass
del numba_config


rmm.register_reinitialize_hook(clear_cache)

Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
_return_arr_from_dtype,
)
from cudf.utils import docutils
from cudf.utils._numba import _CUDFNumbaConfig
from cudf.utils.utils import _cudf_nvtx_annotate

doc_reset_index_template = """
Expand Down Expand Up @@ -2193,7 +2194,8 @@ def _apply(self, func, kernel_getter, *args, **kwargs):
input_args = _get_input_args_from_frame(self)
launch_args = output_args + input_args + list(args)
try:
kernel.forall(len(self))(*launch_args)
with _CUDFNumbaConfig():
kernel.forall(len(self))(*launch_args)
except Exception as e:
raise RuntimeError("UDF kernel execution failed.") from e

Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/udf/groupby_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
_supported_cols_from_frame,
_supported_dtypes_from_frame,
)
from cudf.utils._numba import _CUDFNumbaConfig
from cudf.utils.utils import _cudf_nvtx_annotate


Expand Down Expand Up @@ -196,7 +197,8 @@ def jit_groupby_apply(offsets, grouped_values, function, *args):
)

# Launch kernel
specialized[ngroups, tpb](*launch_args)
with _CUDFNumbaConfig():
specialized[ngroups, tpb](*launch_args)

return output

Expand Down
146 changes: 1 addition & 145 deletions python/cudf/cudf/core/udf/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

import glob
import os
from typing import Any, Callable, Dict

Expand All @@ -13,7 +12,6 @@
from numba.core.datamodel import default_manager, models
from numba.core.errors import TypingError
from numba.core.extending import register_model
from numba.cuda.cudadrv.driver import Linker
from numba.np import numpy_support
from numba.types import CPointer, Poison, Record, Tuple, boolean, int64, void

Expand All @@ -33,6 +31,7 @@
udf_string,
)
from cudf.utils import cudautils
from cudf.utils._numba import _get_ptx_file
from cudf.utils.dtypes import (
BOOL_TYPES,
DATETIME_TYPES,
Expand Down Expand Up @@ -63,58 +62,6 @@
precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32)
launch_arg_getters: Dict[Any, Any] = {}


def _get_best_ptx_file(archs, max_compute_capability):
"""
Determine of the available PTX files which one is
the most recent up to and including the device cc
"""
filtered_archs = [x for x in archs if x[0] <= max_compute_capability]
if filtered_archs:
return max(filtered_archs, key=lambda y: y[0])
else:
return None


def _get_ptx_file(path, prefix):
if "RAPIDS_NO_INITIALIZE" in os.environ:
# cc=60 ptx is always built
cc = int(os.environ.get("STRINGS_UDF_CC", "60"))
else:
dev = cuda.get_current_device()

# Load the highest compute capability file available that is less than
# the current device's.
cc = int("".join(str(x) for x in dev.compute_capability))
files = glob.glob(os.path.join(path, f"{prefix}*.ptx"))
if len(files) == 0:
raise RuntimeError(f"Missing PTX files for cc={cc}")
regular_sms = []

for f in files:
file_name = os.path.basename(f)
sm_number = file_name.rstrip(".ptx").lstrip(prefix)
if sm_number.endswith("a"):
processed_sm_number = int(sm_number.rstrip("a"))
if processed_sm_number == cc:
return f
else:
regular_sms.append((int(sm_number), f))

regular_result = None

if regular_sms:
regular_result = _get_best_ptx_file(regular_sms, cc)

if regular_result is None:
raise RuntimeError(
"This cuDF installation is missing the necessary PTX "
f"files that are <={cc}."
)
else:
return regular_result[1]


_PTX_FILE = _get_ptx_file(os.path.dirname(__file__), "shim_")


Expand Down Expand Up @@ -392,97 +339,6 @@ def _get_extensionty_size(ty):
return llty.get_abi_size(target_data)


def _get_cuda_version_from_ptx_file(path):
"""
https://docs.nvidia.com/cuda/parallel-thread-execution/
Each PTX module must begin with a .version
directive specifying the PTX language version

example header:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31057947
// Cuda compilation tools, release 11.6, V11.6.124
// Based on NVVM 7.0.1
//

.version 7.6
.target sm_52
.address_size 64

"""
with open(path) as ptx_file:
for line in ptx_file:
if line.startswith(".version"):
ver_line = line
break
else:
raise ValueError("Could not read CUDA version from ptx file.")
version = ver_line.strip("\n").split(" ")[1]
# from ptx_docs/release_notes above:
ver_map = {
"7.5": (11, 5),
"7.6": (11, 6),
"7.7": (11, 7),
"7.8": (11, 8),
"8.0": (12, 0),
}

cuda_ver = ver_map.get(version)
if cuda_ver is None:
raise ValueError(
f"Could not map PTX version {version} to a CUDA version"
)

return cuda_ver


def _setup_numba_linker(path):
    """
    Decide whether the Numba linker needs patching for the PTX file at
    ``path``.

    Queries the driver and runtime versions through ptxcompiler's
    ``safe_get_versions``. When it reports ``NO_DRIVER`` nothing is
    done; otherwise the CUDA version read from the PTX file is handed,
    together with those two versions, to ``maybe_patch_numba_linker``.
    """
    from ptxcompiler.patch import NO_DRIVER, safe_get_versions

    from cudf.core.udf.utils import (
        _get_cuda_version_from_ptx_file,
        maybe_patch_numba_linker,
    )

    versions = safe_get_versions()
    if versions == NO_DRIVER:
        # No driver reported: nothing to compare against.
        return

    driver_version, runtime_version = versions
    maybe_patch_numba_linker(
        driver_version,
        runtime_version,
        _get_cuda_version_from_ptx_file(path),
    )


def maybe_patch_numba_linker(
    driver_version, runtime_version, ptx_toolkit_version
):
    """
    Install cubinlinker's patched linker as ``Linker.new`` when the
    driver is older than the PTX toolkit version or the runtime version.

    The patch is applied only if cubinlinker reports the installed
    Numba version as supported (``_numba_version_ok``); each decision
    is logged at debug level through cubinlinker's logger.
    """
    from cubinlinker.patch import (
        _numba_version_ok,
        get_logger,
        new_patched_linker,
    )

    # Numba thinks cubinlinker is only needed if the driver is older than
    # the ctk, but when PTX files are present, it might also need to patch
    # because those PTX files may be newer than the driver as well
    logger = get_logger()

    driver_is_older = (driver_version < ptx_toolkit_version) or (
        driver_version < runtime_version
    )
    if not driver_is_older:
        return

    logger.debug(
        "Driver version %s.%s needs patching due to PTX files"
        % driver_version
    )
    if not _numba_version_ok:
        logger.debug("Cannot patch Numba Linker - unsupported version")
        return

    logger.debug("Patching Numba Linker")
    Linker.new = new_patched_linker


@initfunc
def set_malloc_heap_size(size=None):
"""
Expand Down
15 changes: 10 additions & 5 deletions python/cudf/cudf/tests/test_extension_compilation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
import operator

import cupy as cp
Expand All @@ -12,6 +12,7 @@
from cudf.core.udf.api import Masked
from cudf.core.udf.masked_typing import MaskedType
from cudf.testing._utils import parametrize_numeric_dtypes_pairwise
from cudf.utils._numba import _CUDFNumbaConfig

arith_ops = (
operator.add,
Expand Down Expand Up @@ -106,7 +107,8 @@ def test_kernel(x, y, err):
err[0] = 3

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](1, 2, err)
with _CUDFNumbaConfig():
test_kernel[1, 1](1, 2, err)
assert err[0] == 0


Expand Down Expand Up @@ -214,7 +216,8 @@ def test_kernel(err):
err[0] = 2

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](err)
with _CUDFNumbaConfig():
test_kernel[1, 1](err)
assert err[0] == 0


Expand Down Expand Up @@ -304,7 +307,8 @@ def test_kernel(err):
err[0] = 2

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](err)
with _CUDFNumbaConfig():
test_kernel[1, 1](err)
assert err[0] == 0


Expand All @@ -326,5 +330,6 @@ def test_kernel(err):
err[0] = 1

err = cp.asarray([0], dtype="int8")
test_kernel[1, 1](err)
with _CUDFNumbaConfig():
test_kernel[1, 1](err)
assert err[0] == 0
8 changes: 5 additions & 3 deletions python/cudf/cudf/tests/test_string_udfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
)
from cudf.core.udf.utils import _PTX_FILE, _get_extensionty_size
from cudf.testing._utils import assert_eq, sv_to_udf_str
from cudf.utils._numba import _CUDFNumbaConfig


def get_kernels(func, dtype, size):
Expand Down Expand Up @@ -85,16 +86,17 @@ def run_udf_test(data, func, dtype):
sv_kernel, udf_str_kernel = get_kernels(func, dtype, len(data))

expect = pd.Series(data).apply(func)

sv_kernel.forall(len(data))(str_views, output)
with _CUDFNumbaConfig():
sv_kernel.forall(len(data))(str_views, output)
if dtype == "str":
result = column_from_udf_string_array(output)
else:
result = output

got = cudf.Series(result, dtype=dtype)
assert_eq(expect, got, check_dtype=False)
udf_str_kernel.forall(len(data))(str_views, output)
with _CUDFNumbaConfig():
udf_str_kernel.forall(len(data))(str_views, output)
if dtype == "str":
result = column_from_udf_string_array(output)
else:
Expand Down
Loading