Add read-only functions on string dtypes to `DataFrame.apply` and `Series.apply` (#11319)

This PR provides initial support for string data inside UDFs passed to `DataFrame.apply` and `Series.apply`. The allowed APIs are based on Python's `str` class. It aims to implement Python string semantics as closely as possible, starting with APIs that ***return numeric data only.*** These are the following 21 functions: - `str.count` - `str.startswith` - `str.endswith` - `str.find` - `str.rfind` - `str.isalnum` - `str.isdecimal` - `str.isdigit` - `str.islower` - `str.isupper` - `str.isalpha` - `str.istitle` - `str.isspace` - `==`, `!=`, `>=`, `<=`, `>`, `<` (between two strings) - `len` - `__contains__` The following 3 functions are not included because no equivalent libcudf code is available to back them (they refer to Python-specific concepts): - `str.isascii` - `str.isidentifier` - `str.isprintable` This works by creating a library of `__device__` functions based on libcudf, each of which performs one of the above functions for a single string. The rest of the code is a library of numba extensions that replace a Python UDF with a chain of those `__device__` functions and create a kernel that calls the result across a grid of threads, taking a full column of strings as input. cc @davidwendt @gmarkall Authors: - https://github.com/brandon-b-miller - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) - David Wendt (https://github.com/davidwendt) URL: #11319
rapidsai · Sep 20, 2022 · 0528b38 · 0528b38
1 parent d10406f
commit 0528b38
Show file tree

Hide file tree

Showing 45 changed files with 5,629 additions and 92 deletions.
diff --git a/cpp/.clang-format → .clang-format b/cpp/.clang-format → .clang-format
diff --git a/.gitattributes b/.gitattributes
@@ -1,4 +1,5 @@
 python/cudf/cudf/_version.py export-subst
+python/strings_udf/strings_udf/_version.py export-subst
 python/cudf_kafka/cudf_kafka/_version.py export-subst
 python/custreamz/custreamz/_version.py export-subst
 python/dask_cudf/dask_cudf/_version.py export-subst
diff --git a/.gitignore b/.gitignore
@@ -35,6 +35,8 @@ python/cudf_kafka/*/_lib/**/*.cpp
 python/cudf_kafka/*/_lib/**/*.h
 python/custreamz/*/_lib/**/*.cpp
 python/custreamz/*/_lib/**/*.h
+python/strings_udf/strings_udf/_lib/*.cpp
+python/strings_udf/strings_udf/*.ptx
 .Python
 env/
 develop-eggs/

diff --git a/build.sh b/build.sh
@@ -17,7 +17,7 @@ ARGS=$*
 # script, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats"
+VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz strings_udf -v -g -n -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats"
 HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
    clean                         - remove all existing build artifacts and configuration (start
                                    over)
@@ -335,6 +335,15 @@ if buildAll || hasArg cudf; then
     fi
 fi
 
+if buildAll || hasArg strings_udf; then
+
+    cd ${REPODIR}/python/strings_udf
+    python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1}
+    if [[ ${INSTALL_TARGET} != "" ]]; then
+        python setup.py install --single-version-externally-managed --record=record.txt  -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1}
+    fi
+fi
+
 # Build and install the dask_cudf Python package
 if buildAll || hasArg dask_cudf; then
 

diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
@@ -80,6 +80,14 @@ fi
 if [ "$BUILD_LIBCUDF" == '1' ]; then
   gpuci_logger "Build conda pkg for libcudf"
   gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcudf $CONDA_BUILD_ARGS
+
+  # BUILD_LIBCUDF == 1 means this script is running as one of the cpu_build jobs,
+  # which is where the strings_udf package must also be built
+  gpuci_logger "Build conda pkg for strings_udf (python 3.8)"
+  gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS --python=3.8
+  gpuci_logger "Build conda pkg for strings_udf (python 3.9)"
+  gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS --python=3.9
+
   mkdir -p ${CONDA_BLD_DIR}/libcudf/work
   cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcudf/work
   gpuci_logger "sccache stats"
@@ -108,6 +116,10 @@ if [ "$BUILD_CUDF" == '1' ]; then
 
   gpuci_logger "Build conda pkg for custreamz"
   gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL
+
+  gpuci_logger "Build conda pkg for strings_udf"
+  gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL
+
 fi
 ################################################################################
 # UPLOAD - Conda packages

diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh
@@ -33,6 +33,12 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then
   export LIBCUDF_FILES=$(conda build --no-build-id --croot "${CONDA_BLD_DIR}" conda/recipes/libcudf --output)
   LIBCUDF_FILES=$(echo "$LIBCUDF_FILES" | sed 's/.*libcudf-example.*//') # skip libcudf-example pkg upload
   gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing --no-progress $LIBCUDF_FILES
+
+  # also upload the strings_udf package that was built on the cpu machines
+  export STRINGS_UDF_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/strings_udf --python=$PYTHON --output)
+  test -e ${STRINGS_UDF_FILE}
+  echo "Upload strings_udf: ${STRINGS_UDF_FILE}"
+  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${STRINGS_UDF_FILE} --no-progress
 fi
 
 if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -121,11 +121,11 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
     install_dask
 
     ################################################################################
-    # BUILD - Build libcudf, cuDF, libcudf_kafka, and dask_cudf from source
+    # BUILD - Build libcudf, cuDF, libcudf_kafka, dask_cudf, and strings_udf from source
     ################################################################################
 
     gpuci_logger "Build from source"
-    "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds
+    "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka strings_udf benchmarks tests --ptds
 
     ################################################################################
     # TEST - Run GoogleTest
@@ -183,7 +183,11 @@ else
     gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON -c ${CONDA_ARTIFACT_PATH}
     gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON -c ${CONDA_ARTIFACT_PATH}
 
-    gpuci_logger "Installing cudf, dask-cudf, cudf_kafka and custreamz"
+    # The CUDA component of strings_udf must be built with CUDA 11.5, just like libcudf.
+    # Because it is not split into a separate Python package, the Python component must also be
+    # built on the 11.5 jobs. This means that at this point (on the GPU test jobs) the whole package
+    # has already been built and copied by CI from the upstream 11.5 jobs into $CONDA_ARTIFACT_PATH.
+    gpuci_logger "Installing cudf, dask-cudf, cudf_kafka, and custreamz"
     gpuci_mamba_retry install cudf dask-cudf cudf_kafka custreamz -c "${CONDA_BLD_DIR}" -c "${CONDA_ARTIFACT_PATH}"
 
     gpuci_logger "GoogleTests"
@@ -258,6 +262,31 @@ cd "$WORKSPACE/python/custreamz"
 gpuci_logger "Python py.test for cuStreamz"
 py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term custreamz
 
+gpuci_logger "Installing strings_udf"
+gpuci_mamba_retry install strings_udf -c "${CONDA_BLD_DIR}" -c "${CONDA_ARTIFACT_PATH}"
+
+cd "$WORKSPACE/python/strings_udf/strings_udf"
+gpuci_logger "Python py.test for strings_udf"
+
+# We do not want to exit with a nonzero exit code in the case where no
+# strings_udf tests are run because that will always happen when the local CUDA
+# version is not 11.5. We need to suppress the exit code because this script is
+# run with set -e and we're already setting a trap that we don't want to
+# override here.
+
+STRINGS_UDF_PYTEST_RETCODE=0
+py.test -n 8 --cache-clear --basetemp="$WORKSPACE/strings-udf-cuda-tmp" --junitxml="$WORKSPACE/junit-strings-udf.xml" -v --cov-config=.coveragerc --cov=strings_udf --cov-report=xml:"$WORKSPACE/python/strings_udf/strings-udf-coverage.xml" --cov-report term tests || STRINGS_UDF_PYTEST_RETCODE=$?
+
+if [ ${STRINGS_UDF_PYTEST_RETCODE} -eq 5 ]; then
+    echo "No strings UDF tests were run, but this script will continue to execute."
+elif [ ${STRINGS_UDF_PYTEST_RETCODE} -ne 0 ]; then
+    exit ${STRINGS_UDF_PYTEST_RETCODE}
+else
+    cd "$WORKSPACE/python/cudf/cudf"
+    gpuci_logger "Python py.test retest cuDF UDFs"
+    py.test tests/test_udf_masked_ops.py -n 8 --cache-clear
+fi
+
 # Run benchmarks with both cudf and pandas to ensure compatibility is maintained.
 # Benchmarks are run in DEBUG_ONLY mode, meaning that only small data sizes are used.
 # Therefore, these runs only verify that benchmarks are valid.

diff --git a/conda/recipes/strings_udf/build.sh b/conda/recipes/strings_udf/build.sh
@@ -0,0 +1,4 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+# This assumes the script is executed from the root of the repo directory
+./build.sh strings_udf
diff --git a/conda/recipes/strings_udf/conda_build_config.yaml b/conda/recipes/strings_udf/conda_build_config.yaml
@@ -0,0 +1,14 @@
+c_compiler_version:
+  - 9
+
+cxx_compiler_version:
+  - 9
+
+sysroot_version:
+  - "2.17"
+
+cmake_version:
+  - ">=3.20.1,!=3.23.0"
+
+cuda_compiler:
+  - nvcc
diff --git a/conda/recipes/strings_udf/meta.yaml b/conda/recipes/strings_udf/meta.yaml
@@ -0,0 +1,65 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
+{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
+{% set py_version=environ.get('CONDA_PY', 38) %}
+{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %}
+{% set cuda_major=cuda_version.split('.')[0] %}
+
+package:
+  name: strings_udf
+  version: {{ version }}
+
+source:
+  git_url: ../../..
+
+build:
+  number: {{ GIT_DESCRIBE_NUMBER }}
+  string: cuda_{{ cuda_major }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+  script_env:
+    - VERSION_SUFFIX
+    - PARALLEL_LEVEL
+  # libcudf's run_exports pinning is looser than we would like
+  ignore_run_exports:
+    - libcudf
+  ignore_run_exports_from:
+    - {{ compiler('cuda') }}
+
+requirements:
+  build:
+    - cmake {{ cmake_version }}
+    - {{ compiler('c') }}
+    - {{ compiler('cxx') }}
+    - {{ compiler('cuda') }} {{ cuda_version }}
+    - sysroot_{{ target_platform }} {{ sysroot_version }}
+  host:
+    - python
+    - cython >=0.29,<0.30
+    - scikit-build>=0.13.1
+    - setuptools
+    - numba >=0.54
+    - libcudf ={{ version }}
+    - cudf ={{ version }}
+    - cudatoolkit ={{ cuda_version }}
+  run:
+    - python
+    - typing_extensions
+    - numba >=0.54
+    - numpy
+    - libcudf ={{ version }}
+    - cudf ={{ version }}
+    - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
+    - cachetools
+    - ptxcompiler  # [linux64]  # CUDA enhanced compatibility. See https://github.com/rapidsai/ptxcompiler
+test:                                   # [linux64]
+  requires:                             # [linux64]
+    - cudatoolkit {{ cuda_version }}.*  # [linux64]
+  imports:                              # [linux64]
+    - strings_udf                       # [linux64]
+
+about:
+  home: https://rapids.ai/
+  license: Apache-2.0
+  license_family: APACHE
+  license_file: LICENSE
+  summary: strings_udf library
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -57,7 +57,12 @@
 from cudf.core.missing import NA
 from cudf.core.multiindex import MultiIndex
 from cudf.core.resample import _Resampler
-from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame
+from cudf.core.udf.utils import (
+    _compile_or_get,
+    _get_input_args_from_frame,
+    _post_process_output_col,
+    _return_arr_from_dtype,
+)
 from cudf.utils import docutils
 from cudf.utils.utils import _cudf_nvtx_annotate
 
@@ -1819,30 +1824,19 @@ def _apply(self, func, kernel_getter, *args, **kwargs):
             ) from e
 
         # Mask and data column preallocated
-        ans_col = cp.empty(len(self), dtype=retty)
+        ans_col = _return_arr_from_dtype(retty, len(self))
         ans_mask = cudf.core.column.column_empty(len(self), dtype="bool")
-        launch_args = [(ans_col, ans_mask), len(self)]
-        offsets = []
-
-        # if _compile_or_get succeeds, it is safe to create a kernel that only
-        # consumes the columns that are of supported dtype
-        for col in _supported_cols_from_frame(self).values():
-            data = col.data
-            mask = col.mask
-            if mask is None:
-                launch_args.append(data)
-            else:
-                launch_args.append((data, mask))
-            offsets.append(col.offset)
-        launch_args += offsets
-        launch_args += list(args)
+        output_args = [(ans_col, ans_mask), len(self)]
+        input_args = _get_input_args_from_frame(self)
+        launch_args = output_args + input_args + list(args)
 
         try:
             kernel.forall(len(self))(*launch_args)
         except Exception as e:
             raise RuntimeError("UDF kernel execution failed.") from e
 
-        col = cudf.core.column.as_column(ans_col)
+        col = _post_process_output_col(ans_col, retty)
+
         col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask))
         result = cudf.Series._from_data({None: col}, self._index)
 

diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py
@@ -1,3 +1,65 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2022, NVIDIA CORPORATION.
+import numpy as np
+from numba import cuda, types
+from numba.cuda.cudaimpl import (
+    lower as cuda_lower,
+    registry as cuda_lowering_registry,
+)
 
-from . import lowering, typing
+from cudf.core.dtypes import dtype
+from cudf.core.udf import api, row_function, utils
+from cudf.utils.dtypes import STRING_TYPES
+
+from . import masked_lowering, masked_typing
+
+_units = ["ns", "ms", "us", "s"]
+_datetime_cases = {types.NPDatetime(u) for u in _units}
+_timedelta_cases = {types.NPTimedelta(u) for u in _units}
+
+
+_supported_masked_types = (
+    types.integer_domain
+    | types.real_domain
+    | _datetime_cases
+    | _timedelta_cases
+    | {types.boolean}
+)
+
+_STRING_UDFS_ENABLED = False
+try:
+    import strings_udf
+
+    if strings_udf.ENABLED:
+        from . import strings_typing  # isort: skip
+        from . import strings_lowering  # isort: skip
+        from strings_udf import ptxpath
+        from strings_udf._lib.cudf_jit_udf import to_string_view_array
+        from strings_udf._typing import str_view_arg_handler, string_view
+
+        # add an overload of MaskedType.__init__(string_view, bool)
+        cuda_lower(api.Masked, strings_typing.string_view, types.boolean)(
+            masked_lowering.masked_constructor
+        )
+
+        # add an overload of pack_return(string_view)
+        cuda_lower(api.pack_return, strings_typing.string_view)(
+            masked_lowering.pack_return_scalar_impl
+        )
+
+        _supported_masked_types |= {strings_typing.string_view}
+        utils.launch_arg_getters[dtype("O")] = to_string_view_array
+        utils.masked_array_types[dtype("O")] = string_view
+        utils.JIT_SUPPORTED_TYPES |= STRING_TYPES
+        utils.ptx_files.append(ptxpath)
+        utils.arg_handlers.append(str_view_arg_handler)
+        row_function.itemsizes[dtype("O")] = string_view.size_bytes
+
+        _STRING_UDFS_ENABLED = True
+    else:
+        del strings_udf
+
+except ImportError as e:
+    # allow cuDF to work without strings_udf
+    pass
+
+masked_typing.register_masked_constructor(_supported_masked_types)
diff --git a/python/cudf/cudf/core/udf/lowering.py → python/cudf/cudf/core/udf/masked_lowering.py b/python/cudf/cudf/core/udf/lowering.py → python/cudf/cudf/core/udf/masked_lowering.py
@@ -18,7 +18,7 @@
     comparison_ops,
     unary_ops,
 )
-from cudf.core.udf.typing import MaskedType, NAType
+from cudf.core.udf.masked_typing import MaskedType, NAType
 
 
 @cuda_lowering_registry.lower_constant(NAType)
@@ -62,7 +62,6 @@ def masked_scalar_op_impl(context, builder, sig, args):
         result = cgutils.create_struct_proxy(masked_return_type)(
             context, builder
         )
-
         # compute output validity
         valid = builder.and_(m1.valid, m2.valid)
         result.valid = valid