Commit

Merge pull request #458 from NVIDIA-Merlin/sync_from_gitlab

Sync from gitlab

EmmaQiaoCh committed Sep 18, 2024
2 parents d81ac0e + 3bc63b8 commit af90256
Showing 20 changed files with 159 additions and 53 deletions.
6 changes: 0 additions & 6 deletions .gitmodules
@@ -32,12 +32,6 @@
[submodule "third_party/librdkafka"]
path = third_party/librdkafka
url = https://github.com/edenhill/librdkafka.git
[submodule "third_party/protobuf"]
path = third_party/protobuf
url = https://github.com/protocolbuffers/protobuf.git
[submodule "third_party/hadoop"]
path = third_party/hadoop
url = https://github.com/apache/hadoop.git
[submodule "third_party/HierarchicalKV"]
path = third_party/HierarchicalKV
url = https://github.com/NVIDIA-Merlin/HierarchicalKV.git
58 changes: 54 additions & 4 deletions .nspect-vuln-allowlist.toml
@@ -1,12 +1,57 @@
version = "4.3.0"
version = "24.06"

[oss]

[oss.excluded]


[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-mapreduce-project']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-tools/hadoop-azure']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-yarn-project/hadoop-yarn']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['hadoop/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-tools']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-common']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/*']
comment = 'We do not use and are not planning on using the Hadoop Yarn Web UI'
paths = ['third_party/hadoop/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/*']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
@@ -18,3 +63,8 @@ nspect_ids = ['NSPECT-OZP9-WUQA']
paths = ['third_party/protobuf/*']
comment = 'We never use csharp, java, php, the third-party googletest, etc., inside protobuf'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop', 'third_party/hadoop/*']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']
57 changes: 49 additions & 8 deletions CMakeLists.txt
@@ -14,6 +14,7 @@
#

cmake_minimum_required(VERSION 3.17)

project(HugeCTR LANGUAGES CXX CUDA)

list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
@@ -351,17 +352,57 @@ add_subdirectory(gpu_cache/src)

option(ENABLE_HDFS "Enable HDFS" OFF)
if(ENABLE_HDFS)
if(ENABLE_HDFS STREQUAL "MINIMAL")
message("HDFS build mode: Client only")
else()
message("HDFS build mode: Full")
endif()
message(STATUS "HDFS build mode: Client only")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_HDFS")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_HDFS")

set(FETCHCONTENT_QUIET OFF)

# Java.
if (NOT EXISTS /usr/bin/mvn)
execute_process(WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
COMMAND /bin/bash ${PROJECT_SOURCE_DIR}/sbin/install-jdk-and-maven.sh
COMMAND_ERROR_IS_FATAL ANY
)
endif()

# Build and Install Hadoop
include(SetupHadoop)
hadoop_setup(${ENABLE_HDFS})
# Hadoop.
# sudo apt install libboost-date-time-dev
# sudo apt install libboost-program-options-dev
# sudo apt install libprotobuf-dev
# sudo apt install libfuse-dev
# sudo apt install libprotoc-dev
FetchContent_Declare(hadoop
DOWNLOAD_COMMAND git clone
--branch rel/release-3.4.0
--depth 1
--progress https://github.com/apache/hadoop.git
"${CMAKE_BINARY_DIR}/_deps/hadoop-src"
)
FetchContent_Populate(hadoop)
set(hadoop_SOURCE_DIR "${hadoop_SOURCE_DIR}/hadoop-hdfs-project/hadoop-hdfs-native-client")
set(hadoop_BINARY_DIR "${hadoop_SOURCE_DIR}/target/hadoop-hdfs-native-client-3.4.0")
if(EXISTS ${hadoop_BINARY_DIR}/include/hdfs.h AND EXISTS ${hadoop_BINARY_DIR}/lib/native/libhdfs.a)
message(STATUS "Found hdfs library in ${hadoop_BINARY_DIR}")
else()
execute_process(WORKING_DIRECTORY "${hadoop_SOURCE_DIR}"
COMMAND mvn clean package
-Pdist,native
-DskipTests
-Dtar
-Dmaven.javadoc.skip=true
-Drequire.snappy
-Drequire.zstd
-Drequire.openssl
-Drequire.pmdk
COMMAND_ERROR_IS_FATAL ANY
)
endif()
set(FETCHCONTENT_QUIET ON)

include_directories("${hadoop_BINARY_DIR}/include")
link_directories("${hadoop_BINARY_DIR}/lib/native")

set(ENABLE_HDFS ON)
endif()

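For orientation: the hunk above fetches Hadoop 3.4.0, builds the native HDFS client with Maven, and puts its `include/` and `lib/native/` directories on the header and linker search paths, so HugeCTR can link `hdfs` directly instead of a vendored `libhdfs.so`. The sketch below is a minimal, illustrative libhdfs consumer showing the C API those paths provide; it is not part of this commit, and the connection target, file path, and buffer size are placeholder assumptions.

```cpp
// Illustrative only: a tiny libhdfs read using the hdfs.h API that the
// ENABLE_HDFS build wires up. Assumes a namenode reachable through the local
// Hadoop configuration ("default") and an existing file at the given path.
#include <hdfs.h>

#include <fcntl.h>

#include <cstdio>
#include <vector>

int main() {
  hdfsFS fs = hdfsConnect("default", 0);  // resolve namenode from local config
  if (!fs) {
    std::fprintf(stderr, "hdfsConnect failed\n");
    return 1;
  }

  hdfsFile file = hdfsOpenFile(fs, "/tmp/example.bin", O_RDONLY, 0, 0, 0);
  if (!file) {
    std::fprintf(stderr, "hdfsOpenFile failed\n");
    hdfsDisconnect(fs);
    return 1;
  }

  std::vector<char> buffer(1 << 20);
  tSize n = hdfsRead(fs, file, buffer.data(), static_cast<tSize>(buffer.size()));
  std::printf("read %d bytes\n", static_cast<int>(n));

  hdfsCloseFile(fs, file);
  hdfsDisconnect(fs);
  return 0;
}
```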
8 changes: 4 additions & 4 deletions HugeCTR/embedding/all2all_embedding_collection.cu
@@ -380,7 +380,7 @@ __global__ void cal_lookup_idx(size_t lookup_num, offset_t *bucket_after_filter,
}

template <typename offset_t>
__global__ void count_ratio_filter(size_t bucket_num, char *filterd, const offset_t *bucket_range,
__global__ void count_ratio_filter(size_t bucket_num, char *filtered, const offset_t *bucket_range,
offset_t *bucket_after_filter) {
int32_t i = blockIdx.x * blockDim.x + threadIdx.x;
int32_t step = blockDim.x * gridDim.x;
@@ -389,7 +389,7 @@ __global__ void count_ratio_filter(size_t bucket_num, char *filterd, const offse
offset_t end = bucket_range[i + 1];
bucket_after_filter[i + 1] = 0;
for (offset_t idx = start; idx < end; idx++) {
if (filterd[idx] == 1) {
if (filtered[idx] == 1) {
bucket_after_filter[i + 1]++;
}
}
@@ -400,7 +400,7 @@ __global__ void count_ratio_filter(size_t bucket_num, char *filterd, const offse
}

void filter(std::shared_ptr<CoreResourceManager> core,
const UniformModelParallelEmbeddingMeta &meta, const core23::Tensor &filterd,
const UniformModelParallelEmbeddingMeta &meta, const core23::Tensor &filtered,
core23::Tensor &bucket_range, core23::Tensor &bucket_after_filter,
core23::TensorParams &params, EmbeddingInput &emb_input, core23::Tensor &lookup_offset,
core23::Tensor &temp_scan_storage, core23::Tensor &temp_select_storage,
@@ -416,7 +416,7 @@ void filter(std::shared_ptr<CoreResourceManager> core,
DISPATCH_INTEGRAL_FUNCTION_CORE23(keys_after_filter.data_type().type(), key_t, [&] {
offset_t *bucket_after_filter_ptr = bucket_after_filter.data<offset_t>();
const offset_t *bucket_range_ptr = bucket_range.data<offset_t>();
char *filterd_ptr = filterd.data<char>();
char *filterd_ptr = filtered.data<char>();
count_ratio_filter<<<grid_size, block_size, 0, stream>>>(
bucket_num, filterd_ptr, bucket_range_ptr, bucket_after_filter_ptr);
cub::DeviceScan::InclusiveSum(
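In the `filter` path above, `count_ratio_filter` writes the number of surviving keys per bucket and `cub::DeviceScan::InclusiveSum` then turns those counts into bucket offsets, which is why a preallocated `temp_scan_storage` tensor is threaded through. As a reminder of CUB's two-phase calling convention (size query first, then the actual scan), here is a generic, hedged sketch; it is not HugeCTR code, and the function and buffer names are placeholders.

```cpp
// Illustrative CUB usage only (not from this commit): an inclusive prefix sum
// over per-bucket counts, using the standard two-phase temp-storage protocol.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

#include <cstdint>

void inclusive_sum_counts(const uint32_t* d_counts, uint32_t* d_offsets,
                          int num_buckets, cudaStream_t stream) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;

  // Phase 1: with d_temp == nullptr, CUB only reports the required byte count
  // (a temp buffer such as temp_scan_storage can be sized the same way).
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_offsets,
                                num_buckets, stream);

  cudaMallocAsync(&d_temp, temp_bytes, stream);

  // Phase 2: the actual scan, ordered on the caller's stream.
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_offsets,
                                num_buckets, stream);

  cudaFreeAsync(d_temp, stream);
}
```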
@@ -36,10 +36,11 @@ class AIOContext : public IOContext {

size_t io_depth_ = 0;
size_t num_inflight_ = 0;
size_t alignment_ = 0;
io_context_t ctx_ = 0;
std::vector<IOEvent> tmp_events_; // prevent dynamic memory allocation
std::vector<iocb> iocb_buffer_;
std::queue<iocb*> free_cbs_;
};

} // namespace HugeCTR
} // namespace HugeCTR
2 changes: 1 addition & 1 deletion HugeCTR/src/CMakeLists.txt
@@ -67,7 +67,7 @@ target_link_libraries(huge_ctr_shared PRIVATE nlohmann_json::nlohmann_json)
target_link_libraries(huge_ctr_shared PUBLIC gpu_cache)

if(ENABLE_HDFS)
target_link_libraries(huge_ctr_shared PUBLIC ${DB_LIB_PATHS}/libhdfs.so)
target_link_libraries(huge_ctr_shared PUBLIC hdfs)
endif()

if(ENABLE_S3)
12 changes: 8 additions & 4 deletions HugeCTR/src/data_readers/multi_hot/detail/aio_context.cpp
@@ -35,6 +35,12 @@ AIOContext::AIOContext(size_t io_depth) : io_depth_(io_depth), iocb_buffer_(io_d
if (io_queue_init(io_depth, &ctx_) < 0) {
throw std::runtime_error("io_queue_init failed");
}

long page_size = sysconf(_SC_PAGESIZE);
if (page_size == -1) {
throw std::runtime_error("sysconf failed to return page size.");
}
alignment_ = static_cast<size_t>(page_size);
}

AIOContext::~AIOContext() {
@@ -118,8 +124,6 @@ IOError AIOContext::errno_to_enum(int err) {
}
}

size_t AIOContext::get_alignment() const {
return 4096; // O_DIRECT requirement
}
size_t AIOContext::get_alignment() const { return alignment_; }

} // namespace HugeCTR
} // namespace HugeCTR
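The change above replaces the hard-coded 4096-byte value with the page size reported by `sysconf(_SC_PAGESIZE)`, cached once in `alignment_`. The sketch below is illustrative, not HugeCTR code (the file path and transfer size are assumptions); it shows why the reported alignment matters for this reader: with `O_DIRECT`, the buffer address, the transfer size, and the file offset generally all have to be multiples of that alignment.

```cpp
// Illustrative only: a page-aligned O_DIRECT read, mirroring the alignment
// that AIOContext::get_alignment() now reports at runtime.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE  // exposes O_DIRECT in <fcntl.h> on glibc
#endif
#include <fcntl.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>

int main() {
  const long page_size = sysconf(_SC_PAGESIZE);
  if (page_size <= 0) return 1;

  const size_t alignment = static_cast<size_t>(page_size);
  const size_t io_bytes = alignment * 16;  // transfer size: multiple of alignment

  void* buffer = nullptr;
  if (posix_memalign(&buffer, alignment, io_bytes) != 0) return 1;

  // Hypothetical file; offset 0 is trivially aligned.
  int fd = open("/tmp/example.bin", O_RDONLY | O_DIRECT);
  if (fd < 0) { std::perror("open"); std::free(buffer); return 1; }

  ssize_t n = read(fd, buffer, io_bytes);  // buffer, size, and offset all aligned
  std::printf("read %zd bytes\n", n);

  close(fd);
  std::free(buffer);
  return 0;
}
```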
6 changes: 1 addition & 5 deletions HugeCTR/src/hps/CMakeLists.txt
@@ -36,11 +36,7 @@ add_compile_definitions(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
add_library(huge_ctr_hps SHARED ${huge_ctr_hps_src})

if(ENABLE_HDFS)
target_link_libraries(
huge_ctr_hps
PUBLIC
${DB_LIB_PATHS}/libhdfs.so # from Hugectr
)
target_link_libraries(huge_ctr_hps PUBLIC hdfs)
endif()

if(ENABLE_S3)
6 changes: 1 addition & 5 deletions HugeCTR/src/inference_benchmark/CMakeLists.txt
@@ -20,11 +20,7 @@ file(GLOB hps_benchmark_src
)

if(ENABLE_HDFS)
target_link_libraries(
huge_ctr_inference
PUBLIC
${DB_LIB_PATHS}/libhdfs.so # from Hugectr
)
target_link_libraries(huge_ctr_inference PUBLIC hdfs)
endif()

if(ENABLE_S3)
6 changes: 3 additions & 3 deletions docs/source/hugectr_contributor_guide.md
@@ -104,10 +104,10 @@ To build HugeCTR Training Container from source, do the following:
- **ENABLE_INFERENCE**: You can use this option to build HugeCTR in inference mode, which was designed for the inference framework. In this mode, an inference shared library
will be built for the HugeCTR Backend. Only interfaces that support the HugeCTR Backend can be used. Therefore, you can’t train models in this mode. This option is set to
OFF by default. For building inference container, please refer to [Build HugeCTR Inference Container from Source](#build-hugectr-inference-container-from-source)
- **ENABLE_HDFS**: You can use this option to build HugeCTR together with HDFS to enable HDFS related functions. Permissible values are `ON`, `MINIMAL` and `OFF` *(default)*. Setting this option to `ON` leads to building all necessary Hadoop modules that are required for building AND running both HugeCTR and HDFS. In contrast, `MINIMAL` restricts building only the minimum necessary set of components for building HugeCTR.
- **ENABLE_HDFS**: You can use this option to build HugeCTR together with HDFS to enable HDFS related functions. Permissible values are `ON` and `OFF` *(default)*. Setting this option to `ON` leads to building all necessary Hadoop modules that are required for building HugeCTR so that it can connect to HDFS deployments.
- **ENABLE_S3**: You can use this option to build HugeCTR together with Amazon AWS S3 SDK to enable S3 related functions. Permissible values are `ON` and `OFF` *(default)*. Setting this option to `ON` leads to building all necessary AWS SDKs and dependencies that are required for building AND running both HugeCTR and S3.

**Please note that setting DENABLE_HDFS=ON/MINIMAL or DENABLE_S3=ON requires root permission. So before using these two options to do the customized building, make sure you use `-u root` when you run the docker container.**
**Please note that setting DENABLE_HDFS=ON or DENABLE_S3=ON requires root permission. So before using these two options to do the customized building, make sure you use `-u root` when you run the docker container.**

Here are some examples of how you can build HugeCTR using these build options:
```shell
@@ -124,7 +124,7 @@ To build HugeCTR Training Container from source, do the following:

```shell
$ mkdir -p build && cd build
$ cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;80" -DENABLE_HDFS=MINIMAL .. # Target is NVIDIA V100 / A100 with only minimum HDFS components mode on.
$ cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;80" -DENABLE_HDFS=ON .. # Target is NVIDIA V100 / A100 with HDFS components mode on.
$ make -j && make install
```

2 changes: 1 addition & 1 deletion sbin/install-hadoop.sh
@@ -40,7 +40,7 @@ if [[ ! -f "${HADOOP_HOME}/include/hdfs.h" ]]; then
cp hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h ${HADOOP_HOME}/include
fi

# Cleanup reundant files.
# Cleanup redundant files.
for f in $(find ${HADOOP_HOME} -name *.cmd); do
rm -rf $f
done
7 changes: 7 additions & 0 deletions sparse_operation_kit/CMakeLists.txt
@@ -32,6 +32,8 @@ if (NOT TF_RESULT)
list(GET TF_VERSION_LIST 0 TF_VERSION_MAJOR)
list(GET TF_VERSION_LIST 1 TF_VERSION_MINOR)
list(GET TF_VERSION_LIST 2 TF_VERSION_PATCH)
message(STATUS "TF_VERSION_MAJOR = ${TF_VERSION_MAJOR}")
message(STATUS "TF_VERSION_MINOR = ${TF_VERSION_MINOR}")
if(${TF_VERSION_MAJOR} GREATER 1 AND ${TF_VERSION_MINOR} GREATER 9)
add_definitions(-DTF_GE_210)
set_property(GLOBAL PROPERTY SOK_CXX_STANDARD_PROPERTY cxx_std_17)
@@ -51,6 +53,11 @@ if (NOT TF_RESULT)
if(${TF_VERSION_MAJOR} GREATER 1 AND ${TF_VERSION_MINOR} GREATER 11)
add_definitions(-DTF_GE_212)
endif()


if(${TF_VERSION_MAJOR} GREATER 1 AND ${TF_VERSION_MINOR} GREATER 15)
add_definitions(-DTF_GE_216)
endif()
else()
message(FATAL_ERROR "Can not detect tensorflow in your environment,please install tensorflow(tf1 support version 1.15, for tf2 support version 2.60~latest) ")
endif()
2 changes: 1 addition & 1 deletion sparse_operation_kit/ReadMe.md
@@ -87,7 +87,7 @@ You can also build the SOK module from source code. Here are the steps to follow
### Pre-requisites ###
CUDA Version:>= 11.2

TF2 Version:2.6.0~2.14.0
TF2 Version:2.6.0~2.16.0

TF1 Version:1.15

@@ -27,6 +27,10 @@
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#endif

#ifdef TF_GE_216
#include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h"
#endif

#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/platform/stream_executor.h"
@@ -49,6 +53,16 @@ class GPUResource final : public GPUResourceBase {
LOG(FATAL) << "Get DeviceContext fail! please check OpKernel running on GPU.";
}
const GPUDeviceContext *gpu_dc = static_cast<GPUDeviceContext *>(dc);

#ifdef TF_GE_216
cudaStream_t stream =
reinterpret_cast<cudaStream_t>(gpu_dc->stream()->platform_specific_handle().stream);

if (!stream) {
LOG(FATAL) << "Get default CUDA stream fail!";
}
stream_map_[current_stream_name_] = stream;
#else
cudaStream_t *stream =
reinterpret_cast<cudaStream_t *>(gpu_dc->stream()->implementation()->GpuStreamMemberHack());

@@ -62,6 +76,8 @@ class GPUResource final : public GPUResourceBase {
LOG(FATAL) << "Get default CUDA stream fail!";
}
stream_map_[current_stream_name_] = *stream;

#endif
}

void set_stream(const std::string &name) override { current_stream_name_ = name; }
@@ -84,4 +100,4 @@ class GPUResource final : public GPUResourceBase {
std::string current_stream_name_;
std::unordered_map<std::string, cudaStream_t> stream_map_;
};
} // namespace tf_internal
} // namespace tf_internal
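The `TF_GE_216` branch above exists because TensorFlow 2.16 removed the `GpuStreamMemberHack()` escape hatch; the raw `cudaStream_t` is instead taken from `platform_specific_handle().stream`. A hedged note on why SOK wants that handle at all: once it is cached in `stream_map_`, SOK can issue its own CUDA work on TensorFlow's compute stream so it stays ordered with the framework's GPU work without extra synchronization. The kernel below is a placeholder used for illustration, not an SOK API.

```cpp
// Illustrative only: launching work on a cudaStream_t obtained from the
// framework (e.g. the stream GPUResource caches above). Running on the same
// stream keeps this kernel ordered with TensorFlow's other GPU operations.
#include <cuda_runtime.h>

__global__ void scale_kernel(float* data, float alpha, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= alpha;
}

void scale_on_stream(float* d_data, float alpha, int n, cudaStream_t stream) {
  const int block = 256;
  const int grid = (n + block - 1) / block;
  scale_kernel<<<grid, block, 0, stream>>>(d_data, alpha, n);
  // No cudaStreamSynchronize here: downstream TF ops on the same stream will
  // observe the result in order.
}
```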
1 change: 0 additions & 1 deletion third_party/hadoop
Submodule hadoop deleted from a585a7
1 change: 0 additions & 1 deletion third_party/protobuf
Submodule protobuf deleted from 22d0e2
4 changes: 2 additions & 2 deletions tools/dlrm_script/dlrm_raw.cu
@@ -156,7 +156,7 @@ void process_kaggle_dataset(const std::string &input_dir_path, const std::string
if (col.type().id() == cudf::type_id::STRING) {
auto str_col = cudf::strings_column_view(col.view());
int64_t num_strings = str_col.size();
char *char_array = const_cast<char *>(str_col.chars().data<char>());
char *char_array = const_cast<char *>(str_col.chars_begin(cudf::get_default_stream()));
int32_t *offsets = const_cast<int32_t *>(str_col.offsets().data<int32_t>());

build_categorical_index<key_type, value_type><<<grid, block>>>(
@@ -517,7 +517,7 @@ void process_terabyte_dataset(const std::string &input_dir_path, const std::stri
if (col.type().id() == cudf::type_id::STRING) {
auto str_col = cudf::strings_column_view(col.view());
int64_t num_strings = str_col.size();
char *char_array = const_cast<char *>(str_col.chars().data<char>());
char *char_array = const_cast<char *>(str_col.chars_begin(cudf::get_default_stream()));
int32_t *offsets = const_cast<int32_t *>(str_col.offsets().data<int32_t>());

build_categorical_index<key_type, value_type><<<grid, block>>>(
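The two hunks above track a libcudf API change: `strings_column_view::chars()` is no longer available, and the contiguous character buffer is reached through `chars_begin(stream)` instead. Below is a small, hedged sketch of the resulting access pattern; the helper type and function are illustrative only and assume a libcudf version in which `chars_begin()` takes a stream, as used in the diff.

```cpp
// Illustrative only: gathering the device pointers of a cudf strings column
// with the post-chars() API that dlrm_raw.cu now uses.
#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <cstdint>

struct StringColumnSpan {
  const char* chars;       // contiguous UTF-8 bytes of all rows (device memory)
  const int32_t* offsets;  // row i spans [offsets[i], offsets[i + 1])
  int64_t num_rows;
};

StringColumnSpan view_strings(const cudf::column_view& col) {
  cudf::strings_column_view str_col(col);
  return StringColumnSpan{
      str_col.chars_begin(cudf::get_default_stream()),  // replaces chars().data<char>()
      str_col.offsets().data<int32_t>(),
      static_cast<int64_t>(str_col.size())};
}
```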