Commit

Merge pull request #458 from NVIDIA-Merlin/sync_from_gitlab

Sync from gitlab

EmmaQiaoCh committed Sep 18, 2024
2 parents d81ac0e + 3bc63b8 commit af90256
Showing 20 changed files with 159 additions and 53 deletions.
6 changes: 0 additions & 6 deletions .gitmodules
@@ -32,12 +32,6 @@
[submodule "third_party/librdkafka"]
path = third_party/librdkafka
url = https://github.com/edenhill/librdkafka.git
[submodule "third_party/protobuf"]
path = third_party/protobuf
url = https://github.com/protocolbuffers/protobuf.git
[submodule "third_party/hadoop"]
path = third_party/hadoop
url = https://github.com/apache/hadoop.git
[submodule "third_party/HierarchicalKV"]
path = third_party/HierarchicalKV
url = https://github.com/NVIDIA-Merlin/HierarchicalKV.git
58 changes: 54 additions & 4 deletions .nspect-vuln-allowlist.toml
@@ -1,12 +1,57 @@
version = "4.3.0"
version = "24.06"

[oss]

[oss.excluded]


[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-mapreduce-project']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-tools/hadoop-azure']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-yarn-project/hadoop-yarn']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['hadoop/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-tools']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/hadoop-common']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop/*']
comment = 'We do not use and are not planning on using the Hadoop Yarn Web UI'
paths = ['third_party/hadoop/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/*']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
@@ -18,3 +63,8 @@ nspect_ids = ['NSPECT-OZP9-WUQA']
paths = ['third_party/protobuf/*']
comment = 'We never use csharp, java, php, the third-party googletest, etc., inside protobuf'
nspect_ids = ['NSPECT-OZP9-WUQA']

[[oss.excluded.directories]]
paths = ['third_party/hadoop', 'third_party/hadoop/*']
comment = 'No Use'
nspect_ids = ['NSPECT-OZP9-WUQA']
57 changes: 49 additions & 8 deletions CMakeLists.txt
@@ -14,6 +14,7 @@
#

cmake_minimum_required(VERSION 3.17)

project(HugeCTR LANGUAGES CXX CUDA)

list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
@@ -351,17 +352,57 @@ add_subdirectory(gpu_cache/src)

option(ENABLE_HDFS "Enable HDFS" OFF)
if(ENABLE_HDFS)
if(ENABLE_HDFS STREQUAL "MINIMAL")
message("HDFS build mode: Client only")
else()
message("HDFS build mode: Full")
endif()
message(STATUS "HDFS build mode: Client only")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_HDFS")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_HDFS")

set(FETCHCONTENT_QUIET OFF)

# Java.
if (NOT EXISTS /usr/bin/mvn)
execute_process(WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
COMMAND /bin/bash ${PROJECT_SOURCE_DIR}/sbin/install-jdk-and-maven.sh
COMMAND_ERROR_IS_FATAL ANY
)
endif()

# Build and Install Hadoop
include(SetupHadoop)
hadoop_setup(${ENABLE_HDFS})
# Hadoop.
# sudo apt install libboost-date-time-dev
# sudo apt install libboost-program-options-dev
# sudo apt install libprotobuf-dev
# sudo apt install libfuse-dev
# sudo apt install libprotoc-dev
FetchContent_Declare(hadoop
DOWNLOAD_COMMAND git clone
--branch rel/release-3.4.0
--depth 1
--progress https://github.com/apache/hadoop.git
"${CMAKE_BINARY_DIR}/_deps/hadoop-src"
)
FetchContent_Populate(hadoop)
set(hadoop_SOURCE_DIR "${hadoop_SOURCE_DIR}/hadoop-hdfs-project/hadoop-hdfs-native-client")
set(hadoop_BINARY_DIR "${hadoop_SOURCE_DIR}/target/hadoop-hdfs-native-client-3.4.0")
if(EXISTS ${hadoop_BINARY_DIR}/include/hdfs.h AND EXISTS ${hadoop_BINARY_DIR}/lib/native/libhdfs.a)
message(STATUS "Found hdfs library in ${hadoop_BINARY_DIR}")
else()
execute_process(WORKING_DIRECTORY "${hadoop_SOURCE_DIR}"
COMMAND mvn clean package
-Pdist,native
-DskipTests
-Dtar
-Dmaven.javadoc.skip=true
-Drequire.snappy
-Drequire.zstd
-Drequire.openssl
-Drequire.pmdk
COMMAND_ERROR_IS_FATAL ANY
)
endif()
set(FETCHCONTENT_QUIET ON)

include_directories("${hadoop_BINARY_DIR}/include")
link_directories("${hadoop_BINARY_DIR}/lib/native")

set(ENABLE_HDFS ON)
endif()

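For orientation: the hunk above fetches Hadoop 3.4.0, builds the native HDFS client with Maven, and puts its `include/` and `lib/native/` directories on the header and linker search paths, so HugeCTR can link `hdfs` directly instead of a vendored `libhdfs.so`. The sketch below is a minimal, illustrative libhdfs consumer showing the C API those paths provide; it is not part of this commit, and the connection target, file path, and buffer size are placeholder assumptions.

```cpp
// Illustrative only: a tiny libhdfs read using the hdfs.h API that the
// ENABLE_HDFS build wires up. Assumes a namenode reachable through the local
// Hadoop configuration ("default") and an existing file at the given path.
#include <hdfs.h>

#include <fcntl.h>

#include <cstdio>
#include <vector>

int main() {
  hdfsFS fs = hdfsConnect("default", 0);  // resolve namenode from local config
  if (!fs) {
    std::fprintf(stderr, "hdfsConnect failed\n");
    return 1;
  }

  hdfsFile file = hdfsOpenFile(fs, "/tmp/example.bin", O_RDONLY, 0, 0, 0);
  if (!file) {
    std::fprintf(stderr, "hdfsOpenFile failed\n");
    hdfsDisconnect(fs);
    return 1;
  }

  std::vector<char> buffer(1 << 20);
  tSize n = hdfsRead(fs, file, buffer.data(), static_cast<tSize>(buffer.size()));
  std::printf("read %d bytes\n", static_cast<int>(n));

  hdfsCloseFile(fs, file);
  hdfsDisconnect(fs);
  return 0;
}
```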
8 changes: 4 additions & 4 deletions HugeCTR/embedding/all2all_embedding_collection.cu
@@ -380,7 +380,7 @@ __global__ void cal_lookup_idx(size_t lookup_num, offset_t *bucket_after_filter,
}

template <typename offset_t>
__global__ void count_ratio_filter(size_t bucket_num, char *filterd, const offset_t *bucket_range,
__global__ void count_ratio_filter(size_t bucket_num, char *filtered, const offset_t *bucket_range,
offset_t *bucket_after_filter) {
int32_t i = blockIdx.x * blockDim.x + threadIdx.x;
int32_t step = blockDim.x * gridDim.x;
@@ -389,7 +389,7 @@ __global__ void count_ratio_filter(size_t bucket_num, char *filterd, const offse
offset_t end = bucket_range[i + 1];
bucket_after_filter[i + 1] = 0;
for (offset_t idx = start; idx < end; idx++) {
if (filterd[idx] == 1) {
if (filtered[idx] == 1) {
bucket_after_filter[i + 1]++;
}
}
@@ -400,7 +400,7 @@ __global__ void count_ratio_filter(size_t bucket_num, char *filterd, const offse
}

void filter(std::shared_ptr<CoreResourceManager> core,
const UniformModelParallelEmbeddingMeta &meta, const core23::Tensor &filterd,
const UniformModelParallelEmbeddingMeta &meta, const core23::Tensor &filtered,
core23::Tensor &bucket_range, core23::Tensor &bucket_after_filter,
core23::TensorParams &params, EmbeddingInput &emb_input, core23::Tensor &lookup_offset,
core23::Tensor &temp_scan_storage, core23::Tensor &temp_select_storage,
@@ -416,7 +416,7 @@ void filter(std::shared_ptr<CoreResourceManager> core,
DISPATCH_INTEGRAL_FUNCTION_CORE23(keys_after_filter.data_type().type(), key_t, [&] {
offset_t *bucket_after_filter_ptr = bucket_after_filter.data<offset_t>();
const offset_t *bucket_range_ptr = bucket_range.data<offset_t>();
char *filterd_ptr = filterd.data<char>();
char *filterd_ptr = filtered.data<char>();
count_ratio_filter<<<grid_size, block_size, 0, stream>>>(
bucket_num, filterd_ptr, bucket_range_ptr, bucket_after_filter_ptr);
cub::DeviceScan::InclusiveSum(
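In the `filter` path above, `count_ratio_filter` writes the number of surviving keys per bucket and `cub::DeviceScan::InclusiveSum` then turns those counts into bucket offsets, which is why a preallocated `temp_scan_storage` tensor is threaded through. As a reminder of CUB's two-phase calling convention (size query first, then the actual scan), here is a generic, hedged sketch; it is not HugeCTR code, and the function and buffer names are placeholders.

```cpp
// Illustrative CUB usage only (not from this commit): an inclusive prefix sum
// over per-bucket counts, using the standard two-phase temp-storage protocol.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

#include <cstdint>

void inclusive_sum_counts(const uint32_t* d_counts, uint32_t* d_offsets,
                          int num_buckets, cudaStream_t stream) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;

  // Phase 1: with d_temp == nullptr, CUB only reports the required byte count
  // (a temp buffer such as temp_scan_storage can be sized the same way).
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_offsets,
                                num_buckets, stream);

  cudaMallocAsync(&d_temp, temp_bytes, stream);

  // Phase 2: the actual scan, ordered on the caller's stream.
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_offsets,
                                num_buckets, stream);

  cudaFreeAsync(d_temp, stream);
}
```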
@@ -36,10 +36,11 @@ class AIOContext : public IOContext {

size_t io_depth_ = 0;
size_t num_inflight_ = 0;
size_t alignment_ = 0;
io_context_t ctx_ = 0;
std::vector<IOEvent> tmp_events_; // prevent dynamic memory allocation
std::vector<iocb> iocb_buffer_;
std::queue<iocb*> free_cbs_;
};

} // namespace HugeCTR
} // namespace HugeCTR
2 changes: 1 addition & 1 deletion HugeCTR/src/CMakeLists.txt
@@ -67,7 +67,7 @@ target_link_libraries(huge_ctr_shared PRIVATE nlohmann_json::nlohmann_json)
target_link_libraries(huge_ctr_shared PUBLIC gpu_cache)

if(ENABLE_HDFS)
target_link_libraries(huge_ctr_shared PUBLIC ${DB_LIB_PATHS}/libhdfs.so)
target_link_libraries(huge_ctr_shared PUBLIC hdfs)
endif()

if(ENABLE_S3)
12 changes: 8 additions & 4 deletions HugeCTR/src/data_readers/multi_hot/detail/aio_context.cpp
@@ -35,6 +35,12 @@ AIOContext::AIOContext(size_t io_depth) : io_depth_(io_depth), iocb_buffer_(io_d
if (io_queue_init(io_depth, &ctx_) < 0) {
throw std::runtime_error("io_queue_init failed");
}

long page_size = sysconf(_SC_PAGESIZE);
if (page_size == -1) {
throw std::runtime_error("sysconf failed to return page size.");
}
alignment_ = static_cast<size_t>(page_size);
}

AIOContext::~AIOContext() {
@@ -118,8 +124,6 @@ IOError AIOContext::errno_to_enum(int err) {
}
}

size_t AIOContext::get_alignment() const {
return 4096; // O_DIRECT requirement
}
size_t AIOContext::get_alignment() const { return alignment_; }

} // namespace HugeCTR
} // namespace HugeCTR
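The change above replaces the hard-coded 4096-byte value with the page size reported by `sysconf(_SC_PAGESIZE)`, cached once in `alignment_`. The sketch below is illustrative, not HugeCTR code (the file path and transfer size are assumptions); it shows why the reported alignment matters for this reader: with `O_DIRECT`, the buffer address, the transfer size, and the file offset generally all have to be multiples of that alignment.

```cpp
// Illustrative only: a page-aligned O_DIRECT read, mirroring the alignment
// that AIOContext::get_alignment() now reports at runtime.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE  // exposes O_DIRECT in <fcntl.h> on glibc
#endif
#include <fcntl.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>

int main() {
  const long page_size = sysconf(_SC_PAGESIZE);
  if (page_size <= 0) return 1;

  const size_t alignment = static_cast<size_t>(page_size);
  const size_t io_bytes = alignment * 16;  // transfer size: multiple of alignment

  void* buffer = nullptr;
  if (posix_memalign(&buffer, alignment, io_bytes) != 0) return 1;

  // Hypothetical file; offset 0 is trivially aligned.
  int fd = open("/tmp/example.bin", O_RDONLY | O_DIRECT);
  if (fd < 0) { std::perror("open"); std::free(buffer); return 1; }

  ssize_t n = read(fd, buffer, io_bytes);  // buffer, size, and offset all aligned
  std::printf("read %zd bytes\n", n);

  close(fd);
  std::free(buffer);
  return 0;
}
```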
6 changes: 1 addition & 5 deletions HugeCTR/src/hps/CMakeLists.txt
@@ -36,11 +36,7 @@ add_compile_definitions(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
add_library(huge_ctr_hps SHARED ${huge_ctr_hps_src})

if(ENABLE_HDFS)
target_link_libraries(
huge_ctr_hps
PUBLIC
${DB_LIB_PATHS}/libhdfs.so # from Hugectr
)
target_link_libraries(huge_ctr_hps PUBLIC hdfs)
endif()

if(ENABLE_S3)
6 changes: 1 addition & 5 deletions HugeCTR/src/inference_benchmark/CMakeLists.txt
@@ -20,11 +20,7 @@ file(GLOB hps_benchmark_src
)

if(ENABLE_HDFS)
target_link_libraries(
huge_ctr_inference
PUBLIC
${DB_LIB_PATHS}/libhdfs.so # from Hugectr
)
target_link_libraries(huge_ctr_inference PUBLIC hdfs)
endif()

if(ENABLE_S3)
6 changes: 3 additions & 3 deletions docs/source/hugectr_contributor_guide.md
@@ -104,10 +104,10 @@ To build HugeCTR Training Container from source, do the following:
- **ENABLE_INFERENCE**: You can use this option to build HugeCTR in inference mode, which was designed for the inference framework. In this mode, an inference shared library
will be built for the HugeCTR Backend. Only interfaces that support the HugeCTR Backend can be used. Therefore, you can’t train models in this mode. This option is set to
OFF by default. For building inference container, please refer to [Build HugeCTR Inference Container from Source](#build-hugectr-inference-container-from-source)
- **ENABLE_HDFS**: You can use this option to build HugeCTR together with HDFS to enable HDFS related functions. Permissible values are `ON`, `MINIMAL` and `OFF` *(default)*. Setting this option to `ON` leads to building all necessary Hadoop modules that are required for building AND running both HugeCTR and HDFS. In contrast, `MINIMAL` restricts building only the minimum necessary set of components for building HugeCTR.
- **ENABLE_HDFS**: You can use this option to build HugeCTR together with HDFS to enable HDFS related functions. Permissible values are `ON` and `OFF` *(default)*. Setting this option to `ON` leads to building all necessary Hadoop modules that are required for building HugeCTR so that it can connect to HDFS deployments.
- **ENABLE_S3**: You can use this option to build HugeCTR together with Amazon AWS S3 SDK to enable S3 related functions. Permissible values are `ON` and `OFF` *(default)*. Setting this option to `ON` leads to building all necessary AWS SDKs and dependencies that are required for building AND running both HugeCTR and S3.

**Please note that setting DENABLE_HDFS=ON/MINIMAL or DENABLE_S3=ON requires root permission. So before using these two options to do the customized building, make sure you use `-u root` when you run the docker container.**
**Please note that setting DENABLE_HDFS=ON or DENABLE_S3=ON requires root permission. So before using these two options to do the customized building, make sure you use `-u root` when you run the docker container.**

Here are some examples of how you can build HugeCTR using these build options:
```shell
@@ -124,7 +124,7 @@ To build HugeCTR Training Container from source, do the following:

```shell
$ mkdir -p build && cd build
$ cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;80" -DENABLE_HDFS=MINIMAL .. # Target is NVIDIA V100 / A100 with only minimum HDFS components mode on.
$ cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;80" -DENABLE_HDFS=ON .. # Target is NVIDIA V100 / A100 with HDFS components mode on.
$ make -j && make install
```

2 changes: 1 addition & 1 deletion sbin/install-hadoop.sh
@@ -40,7 +40,7 @@ if [[ ! -f "${HADOOP_HOME}/include/hdfs.h" ]]; then
cp hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h ${HADOOP_HOME}/include
fi

# Cleanup reundant files.
# Cleanup redundant files.
for f in $(find ${HADOOP_HOME} -name *.cmd); do
rm -rf $f
done
7 changes: 7 additions & 0 deletions sparse_operation_kit/CMakeLists.txt
@@ -32,6 +32,8 @@ if (NOT TF_RESULT)
list(GET TF_VERSION_LIST 0 TF_VERSION_MAJOR)
list(GET TF_VERSION_LIST 1 TF_VERSION_MINOR)
list(GET TF_VERSION_LIST 2 TF_VERSION_PATCH)
message(STATUS "TF_VERSION_MAJOR = ${TF_VERSION_MAJOR}")
message(STATUS "TF_VERSION_MINOR = ${TF_VERSION_MINOR}")
if(${TF_VERSION_MAJOR} GREATER 1 AND ${TF_VERSION_MINOR} GREATER 9)
add_definitions(-DTF_GE_210)
set_property(GLOBAL PROPERTY SOK_CXX_STANDARD_PROPERTY cxx_std_17)
@@ -51,6 +53,11 @@ if (NOT TF_RESULT)
if(${TF_VERSION_MAJOR} GREATER 1 AND ${TF_VERSION_MINOR} GREATER 11)
add_definitions(-DTF_GE_212)
endif()


if(${TF_VERSION_MAJOR} GREATER 1 AND ${TF_VERSION_MINOR} GREATER 15)
add_definitions(-DTF_GE_216)
endif()
else()
message(FATAL_ERROR "Can not detect tensorflow in your environment,please install tensorflow(tf1 support version 1.15, for tf2 support version 2.60~latest) ")
endif()
2 changes: 1 addition & 1 deletion sparse_operation_kit/ReadMe.md
@@ -87,7 +87,7 @@ You can also build the SOK module from source code. Here are the steps to follow
### Pre-requisites ###
CUDA Version:>= 11.2

TF2 Version:2.6.0~2.14.0
TF2 Version:2.6.0~2.16.0

TF1 Version:1.15

@@ -27,6 +27,10 @@
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#endif

#ifdef TF_GE_216
#include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h"
#endif

#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/platform/stream_executor.h"
@@ -49,6 +53,16 @@ class GPUResource final : public GPUResourceBase {
LOG(FATAL) << "Get DeviceContext fail! please check OpKernel running on GPU.";
}
const GPUDeviceContext *gpu_dc = static_cast<GPUDeviceContext *>(dc);

#ifdef TF_GE_216
cudaStream_t stream =
reinterpret_cast<cudaStream_t>(gpu_dc->stream()->platform_specific_handle().stream);

if (!stream) {
LOG(FATAL) << "Get default CUDA stream fail!";
}
stream_map_[current_stream_name_] = stream;
#else
cudaStream_t *stream =
reinterpret_cast<cudaStream_t *>(gpu_dc->stream()->implementation()->GpuStreamMemberHack());

@@ -62,6 +76,8 @@ class GPUResource final : public GPUResourceBase {
LOG(FATAL) << "Get default CUDA stream fail!";
}
stream_map_[current_stream_name_] = *stream;

#endif
}

void set_stream(const std::string &name) override { current_stream_name_ = name; }
@@ -84,4 +100,4 @@ class GPUResource final : public GPUResourceBase {
std::string current_stream_name_;
std::unordered_map<std::string, cudaStream_t> stream_map_;
};
} // namespace tf_internal
} // namespace tf_internal
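The `TF_GE_216` branch above exists because TensorFlow 2.16 removed the `GpuStreamMemberHack()` escape hatch; the raw `cudaStream_t` is instead taken from `platform_specific_handle().stream`. A hedged note on why SOK wants that handle at all: once it is cached in `stream_map_`, SOK can issue its own CUDA work on TensorFlow's compute stream so it stays ordered with the framework's GPU work without extra synchronization. The kernel below is a placeholder used for illustration, not an SOK API.

```cpp
// Illustrative only: launching work on a cudaStream_t obtained from the
// framework (e.g. the stream GPUResource caches above). Running on the same
// stream keeps this kernel ordered with TensorFlow's other GPU operations.
#include <cuda_runtime.h>

__global__ void scale_kernel(float* data, float alpha, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= alpha;
}

void scale_on_stream(float* d_data, float alpha, int n, cudaStream_t stream) {
  const int block = 256;
  const int grid = (n + block - 1) / block;
  scale_kernel<<<grid, block, 0, stream>>>(d_data, alpha, n);
  // No cudaStreamSynchronize here: downstream TF ops on the same stream will
  // observe the result in order.
}
```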
1 change: 0 additions & 1 deletion third_party/hadoop
Submodule hadoop deleted from a585a7
1 change: 0 additions & 1 deletion third_party/protobuf
Submodule protobuf deleted from 22d0e2
4 changes: 2 additions & 2 deletions tools/dlrm_script/dlrm_raw.cu
@@ -156,7 +156,7 @@ void process_kaggle_dataset(const std::string &input_dir_path, const std::string
if (col.type().id() == cudf::type_id::STRING) {
auto str_col = cudf::strings_column_view(col.view());
int64_t num_strings = str_col.size();
char *char_array = const_cast<char *>(str_col.chars().data<char>());
char *char_array = const_cast<char *>(str_col.chars_begin(cudf::get_default_stream()));
int32_t *offsets = const_cast<int32_t *>(str_col.offsets().data<int32_t>());

build_categorical_index<key_type, value_type><<<grid, block>>>(
@@ -517,7 +517,7 @@ void process_terabyte_dataset(const std::string &input_dir_path, const std::stri
if (col.type().id() == cudf::type_id::STRING) {
auto str_col = cudf::strings_column_view(col.view());
int64_t num_strings = str_col.size();
char *char_array = const_cast<char *>(str_col.chars().data<char>());
char *char_array = const_cast<char *>(str_col.chars_begin(cudf::get_default_stream()));
int32_t *offsets = const_cast<int32_t *>(str_col.offsets().data<int32_t>());

build_categorical_index<key_type, value_type><<<grid, block>>>(
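The two hunks above track a libcudf API change: `strings_column_view::chars()` is no longer available, and the contiguous character buffer is reached through `chars_begin(stream)` instead. Below is a small, hedged sketch of the resulting access pattern; the helper type and function are illustrative only and assume a libcudf version in which `chars_begin()` takes a stream, as used in the diff.

```cpp
// Illustrative only: gathering the device pointers of a cudf strings column
// with the post-chars() API that dlrm_raw.cu now uses.
#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <cstdint>

struct StringColumnSpan {
  const char* chars;       // contiguous UTF-8 bytes of all rows (device memory)
  const int32_t* offsets;  // row i spans [offsets[i], offsets[i + 1])
  int64_t num_rows;
};

StringColumnSpan view_strings(const cudf::column_view& col) {
  cudf::strings_column_view str_col(col);
  return StringColumnSpan{
      str_col.chars_begin(cudf::get_default_stream()),  // replaces chars().data<char>()
      str_col.offsets().data<int32_t>(),
      static_cast<int64_t>(str_col.size())};
}
```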