diff --git a/cmake/external/jemalloc.cmake b/cmake/external/jemalloc.cmake new file mode 100644 index 0000000000000..efce686b20929 --- /dev/null +++ b/cmake/external/jemalloc.cmake @@ -0,0 +1,35 @@ +include(ExternalProject) + +set(JEMALLOC_PROJECT "extern_jemalloc") +set(JEMALLOC_URL + https://github.com/jemalloc/jemalloc/releases/download/5.1.0/jemalloc-5.1.0.tar.bz2 +) +set(JEMALLOC_BUILD ${THIRD_PARTY_PATH}/jemalloc/src/extern_jemalloc) +set(JEMALLOC_SOURCE_DIR "${THIRD_PARTY_PATH}/jemalloc") +set(JEMALLOC_INSTALL ${THIRD_PARTY_PATH}/install/jemalloc) +set(JEMALLOC_INCLUDE_DIR ${JEMALLOC_INSTALL}/include) +set(JEMALLOC_DOWNLOAD_DIR "${JEMALLOC_SOURCE_DIR}/src/${JEMALLOC_PROJECT}") + +set(JEMALLOC_STATIC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) +set(JEMALLOC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) + +ExternalProject_Add( + extern_jemalloc + PREFIX ${JEMALLOC_SOURCE_DIR} + URL ${JEMALLOC_URL} + INSTALL_DIR ${JEMALLOC_INSTALL} + DOWNLOAD_DIR "${JEMALLOC_DOWNLOAD_DIR}" + BUILD_COMMAND $(MAKE) + BUILD_IN_SOURCE 1 + INSTALL_COMMAND $(MAKE) install + CONFIGURE_COMMAND "${JEMALLOC_DOWNLOAD_DIR}/configure" + --prefix=${JEMALLOC_INSTALL} --disable-initial-exec-tls) + +add_library(jemalloc STATIC IMPORTED GLOBAL) +set_property(TARGET jemalloc PROPERTY IMPORTED_LOCATION + ${JEMALLOC_STATIC_LIBRARIES}) + +include_directories(${JEMALLOC_INCLUDE_DIR}) +add_dependencies(jemalloc extern_jemalloc) diff --git a/cmake/external/rocksdb.cmake b/cmake/external/rocksdb.cmake index 41a1916dc3308..0084247461b74 100644 --- a/cmake/external/rocksdb.cmake +++ b/cmake/external/rocksdb.cmake @@ -14,6 +14,13 @@ include(ExternalProject) +# find_package(jemalloc REQUIRED) + +set(JEMALLOC_INCLUDE_DIR ${THIRD_PARTY_PATH}/install/jemalloc/include) +set(JEMALLOC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) +message(STATUS "rocksdb jemalloc:" ${JEMALLOC_LIBRARIES}) + set(ROCKSDB_PREFIX_DIR ${THIRD_PARTY_PATH}/rocksdb) set(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb) set(ROCKSDB_INCLUDE_DIR @@ -22,22 +29,41 @@ set(ROCKSDB_INCLUDE_DIR set(ROCKSDB_LIBRARIES "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a" CACHE FILEPATH "rocksdb library." 
FORCE) -set(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +set(ROCKSDB_COMMON_FLAGS + "-g -pipe -O2 -W -Wall -Wno-unused-parameter -fPIC -fno-builtin-memcmp -fno-omit-frame-pointer" +) +set(ROCKSDB_FLAGS + "-DNDEBUG -DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DOS_LINUX -DROCKSDB_FALLOCATE_PRESENT -DHAVE_SSE42 -DHAVE_PCLMUL -DZLIB -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX -DROCKSDB_BACKTRACE -DROCKSDB_SUPPORT_THREAD_LOCAL -DROCKSDB_USE_RTTI -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_AUXV_GETAUXVAL_PRESENT" +) +set(ROCKSDB_CMAKE_CXX_FLAGS + "${ROCKSDB_COMMON_FLAGS} -DROCKSDB_LIBAIO_PRESENT -msse -msse4.2 -mpclmul ${ROCKSDB_FLAGS} -fPIC -I${JEMALLOC_INCLUDE_DIR}" +) +set(ROCKSDB_CMAKE_C_FLAGS + "${ROCKSDB_COMMON_FLAGS} ${ROCKSDB_FLAGS} -DROCKSDB_LIBAIO_PRESENT -fPIC -I${JEMALLOC_INCLUDE_DIR}" +) include_directories(${ROCKSDB_INCLUDE_DIR}) +set(CMAKE_CXX_LINK_EXECUTABLE + "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz") ExternalProject_Add( extern_rocksdb ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${ROCKSDB_PREFIX_DIR} - GIT_REPOSITORY "https://github.com/facebook/rocksdb" - GIT_TAG v6.10.1 + GIT_REPOSITORY "https://github.com/Thunderbrook/rocksdb" + GIT_TAG 6.19.fb UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DWITH_BZ2=OFF -DWITH_GFLAGS=OFF + -DWITH_TESTS=OFF + -DWITH_JEMALLOC=ON + -DWITH_BENCHMARK_TOOLS=OFF + -DJeMalloc_LIBRARIES=${JEMALLOC_LIBRARIES} + -DJeMalloc_INCLUDE_DIRS=${JEMALLOC_INCLUDE_DIR} -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS=${ROCKSDB_CMAKE_C_FLAGS} + -DCMAKE_CXX_LINK_EXECUTABLE=${CMAKE_CXX_LINK_EXECUTABLE} # BUILD_BYPRODUCTS ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a INSTALL_COMMAND mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/ && cp diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 94fb1b4d838f9..5455ddadfdea4 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -422,6 +422,9 @@ if(WITH_PSCORE) include(external/rocksdb) # download, build, install rocksdb list(APPEND third_party_deps extern_rocksdb) + + include(external/jemalloc) # download, build, install jemalloc + list(APPEND third_party_deps extern_jemalloc) endif() if(WITH_XBYAK) diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 5654669d76fdb..74f946b2253aa 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -148,10 +148,12 @@ class PSClient { return fut; } - virtual ::std::future PullSparsePtr(char **select_values, + virtual ::std::future PullSparsePtr(int shard_id, + char **select_values, size_t table_id, const uint64_t *keys, - size_t num) { + size_t num, + uint16_t pass_id) { VLOG(0) << "Did not implement"; std::promise promise; std::future fut = promise.get_future(); @@ -160,6 +162,15 @@ class PSClient { } virtual std::future PrintTableStat(uint32_t table_id) = 0; + virtual std::future SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold) { + VLOG(0) << "Did not implement"; + std::promise promise; + std::future fut = promise.get_future(); + promise.set_value(-1); + return fut; + } // 确保所有积攒中的请求都发起发送 virtual std::future Flush() = 0; diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc index e8bf426710bc3..5466e9cd95bd0 100644 --- 
a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -260,10 +260,12 @@ ::std::future PsLocalClient::PushDense(const Region* regions, // return done(); //} -::std::future PsLocalClient::PullSparsePtr(char** select_values, +::std::future PsLocalClient::PullSparsePtr(int shard_id, + char** select_values, size_t table_id, const uint64_t* keys, - size_t num) { + size_t num, + uint16_t pass_id) { // FIXME // auto timer = // std::make_shared("pslib_downpour_client_pull_sparse"); @@ -278,6 +280,8 @@ ::std::future PsLocalClient::PullSparsePtr(char** select_values, table_context.pull_context.ptr_values = select_values; table_context.use_ptr = true; table_context.num = num; + table_context.shard_id = shard_id; + table_context.pass_id = pass_id; // table_ptr->PullSparsePtr(select_values, keys, num); table_ptr->Pull(table_context); @@ -285,6 +289,28 @@ ::std::future PsLocalClient::PullSparsePtr(char** select_values, return done(); } +::std::future PsLocalClient::PrintTableStat(uint32_t table_id) { + auto* table_ptr = GetTable(table_id); + std::pair ret = table_ptr->PrintTableStat(); + VLOG(0) << "table id: " << table_id << ", feasign size: " << ret.first + << ", mf size: " << ret.second; + return done(); +} + +::std::future PsLocalClient::SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold) { + auto* table_ptr = GetTable(table_id); + std::pair ret = table_ptr->PrintTableStat(); + VLOG(0) << "table id: " << table_id << ", feasign size: " << ret.first + << ", mf size: " << ret.second; + if (ret.first > threshold) { + VLOG(0) << "run cache table"; + table_ptr->CacheTable(pass_id); + } + return done(); +} + ::std::future PsLocalClient::PushSparseRawGradient( size_t table_id, const uint64_t* keys, diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h index 593805547af84..583ea8052eb01 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -76,18 +76,19 @@ class PsLocalClient : public PSClient { return fut; } - virtual ::std::future PullSparsePtr(char** select_values, + virtual ::std::future PullSparsePtr(int shard_id, + char** select_values, size_t table_id, const uint64_t* keys, - size_t num); + size_t num, + uint16_t pass_id); - virtual ::std::future PrintTableStat(uint32_t table_id) { - std::promise prom; - std::future fut = prom.get_future(); - prom.set_value(0); + virtual ::std::future PrintTableStat(uint32_t table_id); + + virtual ::std::future SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold); - return fut; - } virtual ::std::future PushSparse(size_t table_id, const uint64_t* keys, const float** update_values, diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index b55c77bf52d84..9f6baf3189fb8 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -162,6 +162,15 @@ class ValueAccessor { return 0; } + virtual bool SaveMemCache(float* value, + int param, + double global_cache_threshold, + uint16_t pass_id) { + return true; + } + + virtual void UpdatePassId(float* value, uint16_t pass_id) {} + virtual float GetField(float* value, const std::string& name) { return 0.0; } #define DEFINE_GET_INDEX(class, field) \ virtual int get_##field##_index() override { return class ::field##_index(); } diff --git 
a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 08bef6261cf34..995eafcdd61eb 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -24,6 +24,7 @@ #include "gflags/gflags.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/platform/timer.h" @@ -32,6 +33,7 @@ DECLARE_bool(graph_load_in_parallel); DECLARE_bool(graph_get_neighbor_id); +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace distributed { @@ -311,7 +313,7 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, std::string str; if (_db->get(i, ch, sizeof(int) * 2 + sizeof(uint64_t), str) == 0) { count[i] += (int64_t)str.size(); - for (size_t j = 0; j < (int)str.size(); j += sizeof(uint64_t)) { + for (size_t j = 0; j < str.size(); j += sizeof(uint64_t)) { uint64_t id = *(uint64_t *)(str.c_str() + j); add_comm_edge(idx, v, id); } @@ -381,7 +383,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { score[i] = 0; } } - for (size_t j = 0; j < (int)value.size(); j += sizeof(uint64_t)) { + for (size_t j = 0; j < value.size(); j += sizeof(uint64_t)) { uint64_t v = *((uint64_t *)(value.c_str() + j)); int index = -1; if (id_map.find(v) != id_map.end()) { @@ -472,6 +474,71 @@ void GraphTable::clear_graph(int idx) { edge_shards[idx].push_back(new GraphShard()); } } + +void GraphTable::release_graph() { + // Before releasing graph, prepare for sampling ids and embedding keys. 
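+  // Note (added comment): the type keys (and, in WHOLE_HBM storage mode, the
+  // total keys) must be collected before clear_graph() below frees the edge
+  // and feature shards, otherwise the ids needed later for sampling and
+  // embedding pulls would be lost.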
+ build_graph_type_keys(); + + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::WHOLE_HBM) { + build_graph_total_keys(); + } + // clear graph + clear_graph(); +} + +void GraphTable::release_graph_edge() { + build_graph_total_keys(); + clear_edge_shard(); +} + +void GraphTable::release_graph_node() { + build_graph_type_keys(); + clear_feature_shard(); +} + +void GraphTable::clear_edge_shard() { + VLOG(0) << "begin clear edge shard"; + std::vector> tasks; + for (auto &type_shards : edge_shards) { + for (auto &shard : type_shards) { + tasks.push_back( + load_node_edge_task_pool->enqueue([&shard, this]() -> int { + delete shard; + return 0; + })); + } + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + for (auto &shards : edge_shards) shards.clear(); + edge_shards.clear(); + VLOG(0) << "finish clear edge shard"; +} + +void GraphTable::clear_feature_shard() { + VLOG(0) << "begin clear feature shard"; + std::vector> tasks; + for (auto &type_shards : feature_shards) { + for (auto &shard : type_shards) { + tasks.push_back( + load_node_edge_task_pool->enqueue([&shard, this]() -> int { + delete shard; + return 0; + })); + } + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + for (auto &shards : feature_shards) shards.clear(); + feature_shards.clear(); + VLOG(0) << "finish clear feature shard"; +} + +void GraphTable::clear_graph() { + VLOG(0) << "begin clear_graph"; + clear_edge_shard(); + clear_feature_shard(); + VLOG(0) << "finish clear_graph"; +} + int32_t GraphTable::load_next_partition(int idx) { if (next_partition >= (int)partitions[idx].size()) { VLOG(0) << "partition iteration is done"; @@ -538,7 +605,7 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) { std::vector &v = shards[i]->get_bucket(); for (size_t j = 0; j < v.size(); j++) { std::vector s; - for (size_t k = 0; k < (int)v[j]->get_neighbor_size(); k++) { + for (size_t k = 0; k < v[j]->get_neighbor_size(); k++) { s.push_back(v[j]->get_neighbor_id(k)); } cost += v[j]->get_neighbor_size() * sizeof(uint64_t); @@ -1037,21 +1104,7 @@ Node *GraphShard::find_node(uint64_t id) { return iter == node_location.end() ? 
nullptr : bucket[iter->second]; } -GraphTable::~GraphTable() { - for (int i = 0; i < (int)edge_shards.size(); i++) { - for (auto p : edge_shards[i]) { - delete p; - } - edge_shards[i].clear(); - } - - for (int i = 0; i < (int)feature_shards.size(); i++) { - for (auto p : feature_shards[i]) { - delete p; - } - feature_shards[i].clear(); - } -} +GraphTable::~GraphTable() { clear_graph(); } int32_t GraphTable::Load(const std::string &path, const std::string ¶m) { bool load_edge = (param[0] == 'e'); @@ -1079,14 +1132,133 @@ std::string GraphTable::get_inverse_etype(std::string &etype) { return res; } -int32_t GraphTable::load_node_and_edge_file(std::string etype, - std::string ntype, - std::string epath, - std::string npath, +int32_t GraphTable::parse_type_to_typepath( + std::string &type2files, + std::string graph_data_local_path, + std::vector &res_type, + std::unordered_map &res_type2path) { + auto type2files_split = + paddle::string::split_string(type2files, ","); + if (type2files_split.size() == 0) { + return -1; + } + for (auto one_type2file : type2files_split) { + auto one_type2file_split = + paddle::string::split_string(one_type2file, ":"); + auto type = one_type2file_split[0]; + auto type_dir = one_type2file_split[1]; + res_type.push_back(type); + res_type2path[type] = graph_data_local_path + "/" + type_dir; + } + return 0; +} + +int32_t GraphTable::parse_edge_and_load(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse) { + std::vector etypes; + std::unordered_map edge_to_edgedir; + int res = parse_type_to_typepath( + etype2files, graph_data_local_path, etypes, edge_to_edgedir); + if (res != 0) { + VLOG(0) << "parse edge type and edgedir failed!"; + return -1; + } + VLOG(0) << "etypes size: " << etypes.size(); + VLOG(0) << "whether reverse: " << reverse; + is_load_reverse_edge = reverse; + std::string delim = ";"; + size_t total_len = etypes.size(); + + std::vector> tasks; + for (size_t i = 0; i < total_len; i++) { + tasks.push_back( + _shards_task_pool[i % task_pool_size_]->enqueue([&, i, this]() -> int { + std::string etype_path = edge_to_edgedir[etypes[i]]; + auto etype_path_list = paddle::framework::localfs_list(etype_path); + std::string etype_path_str; + if (part_num > 0 && part_num < (int)etype_path_list.size()) { + std::vector sub_etype_path_list( + etype_path_list.begin(), etype_path_list.begin() + part_num); + etype_path_str = + paddle::string::join_strings(sub_etype_path_list, delim); + } else { + etype_path_str = + paddle::string::join_strings(etype_path_list, delim); + } + this->load_edges(etype_path_str, false, etypes[i]); + if (reverse) { + std::string r_etype = get_inverse_etype(etypes[i]); + this->load_edges(etype_path_str, true, r_etype); + } + return 0; + })); + } + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); + return 0; +} + +int32_t GraphTable::parse_node_and_load(std::string ntype2files, + std::string graph_data_local_path, + int part_num) { + std::vector ntypes; + std::unordered_map node_to_nodedir; + int res = parse_type_to_typepath( + ntype2files, graph_data_local_path, ntypes, node_to_nodedir); + if (res != 0) { + VLOG(0) << "parse node type and nodedir failed!"; + return -1; + } + + std::string delim = ";"; + std::string npath = node_to_nodedir[ntypes[0]]; + auto npath_list = paddle::framework::localfs_list(npath); + std::string npath_str; + if (part_num > 0 && part_num < (int)npath_list.size()) { + std::vector sub_npath_list( + npath_list.begin(), npath_list.begin() + part_num); + npath_str = 
paddle::string::join_strings(sub_npath_list, delim); + } else { + npath_str = paddle::string::join_strings(npath_list, delim); + } + if (ntypes.size() == 0) { + VLOG(0) << "node_type not specified, nothing will be loaded "; + return 0; + } + + if (FLAGS_graph_load_in_parallel) { + this->load_nodes(npath_str, ""); + } else { + for (size_t j = 0; j < ntypes.size(); j++) { + this->load_nodes(npath_str, ntypes[j]); + } + } + return 0; +} + +int32_t GraphTable::load_node_and_edge_file(std::string etype2files, + std::string ntype2files, + std::string graph_data_local_path, int part_num, bool reverse) { - auto etypes = paddle::string::split_string(etype, ","); - auto ntypes = paddle::string::split_string(ntype, ","); + std::vector etypes; + std::unordered_map edge_to_edgedir; + int res = parse_type_to_typepath( + etype2files, graph_data_local_path, etypes, edge_to_edgedir); + if (res != 0) { + VLOG(0) << "parse edge type and edgedir failed!"; + return -1; + } + std::vector ntypes; + std::unordered_map node_to_nodedir; + res = parse_type_to_typepath( + ntype2files, graph_data_local_path, ntypes, node_to_nodedir); + if (res != 0) { + VLOG(0) << "parse node type and nodedir failed!"; + return -1; + } + VLOG(0) << "etypes size: " << etypes.size(); VLOG(0) << "whether reverse: " << reverse; is_load_reverse_edge = reverse; @@ -1098,7 +1270,7 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype, tasks.push_back( _shards_task_pool[i % task_pool_size_]->enqueue([&, i, this]() -> int { if (i < etypes.size()) { - std::string etype_path = epath + "/" + etypes[i]; + std::string etype_path = edge_to_edgedir[etypes[i]]; auto etype_path_list = paddle::framework::localfs_list(etype_path); std::string etype_path_str; if (part_num > 0 && part_num < (int)etype_path_list.size()) { @@ -1116,6 +1288,7 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype, this->load_edges(etype_path_str, true, r_etype); } } else { + std::string npath = node_to_nodedir[ntypes[0]]; auto npath_list = paddle::framework::localfs_list(npath); std::string npath_str; if (part_num > 0 && part_num < (int)npath_list.size()) { @@ -1414,7 +1587,6 @@ int32_t GraphTable::load_edges(const std::string &path, const std::string &edge_type) { #ifdef PADDLE_WITH_HETERPS if (search_level == 2) total_memory_cost = 0; - const uint64_t fixed_load_edges = 1000000; #endif int idx = 0; if (edge_type == "") { @@ -1436,7 +1608,7 @@ int32_t GraphTable::load_edges(const std::string &path, VLOG(0) << "Begin GraphTable::load_edges() edge_type[" << edge_type << "]"; if (FLAGS_graph_load_in_parallel) { std::vector>> tasks; - for (int i = 0; i < paths.size(); i++) { + for (size_t i = 0; i < paths.size(); i++) { tasks.push_back(load_node_edge_task_pool->enqueue( [&, i, idx, this]() -> std::pair { return parse_edge_file(paths[i], idx, reverse_edge); @@ -1895,8 +2067,8 @@ int GraphTable::get_all_id(int type_id, MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? 
edge_shards : feature_shards; std::vector> tasks; - for (int idx = 0; idx < search_shards.size(); idx++) { - for (int j = 0; j < search_shards[idx].size(); j++) { + for (size_t idx = 0; idx < search_shards.size(); idx++) { + for (size_t j = 0; j < search_shards[idx].size(); j++) { tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( [&search_shards, idx, j, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -1919,8 +2091,8 @@ int GraphTable::get_all_neighbor_id( MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? edge_shards : feature_shards; std::vector> tasks; - for (int idx = 0; idx < search_shards.size(); idx++) { - for (int j = 0; j < search_shards[idx].size(); j++) { + for (size_t idx = 0; idx < search_shards.size(); idx++) { + for (size_t j = 0; j < search_shards[idx].size(); j++) { tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( [&search_shards, idx, j, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -1972,7 +2144,7 @@ int GraphTable::get_all_neighbor_id( auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; std::vector> tasks; VLOG(3) << "begin task, task_pool_size_[" << task_pool_size_ << "]"; - for (int i = 0; i < search_shards.size(); i++) { + for (size_t i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [&search_shards, i, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -1998,7 +2170,7 @@ int GraphTable::get_all_feature_ids( MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; std::vector> tasks; - for (int i = 0; i < search_shards.size(); i++) { + for (size_t i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [&search_shards, i, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -2233,5 +2405,43 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { return 0; } +void GraphTable::build_graph_total_keys() { + VLOG(0) << "begin insert edge to graph_total_keys"; + // build node embedding id + std::vector> keys; + this->get_node_embedding_ids(1, &keys); + graph_total_keys_.insert( + graph_total_keys_.end(), keys[0].begin(), keys[0].end()); + + VLOG(0) << "finish insert edge to graph_total_keys"; +} + +void GraphTable::build_graph_type_keys() { + VLOG(0) << "begin build_graph_type_keys"; + graph_type_keys_.clear(); + graph_type_keys_.resize(this->feature_to_id.size()); + + int cnt = 0; + for (auto &it : this->feature_to_id) { + auto node_idx = it.second; + std::vector> keys; + this->get_all_id(1, node_idx, 1, &keys); + type_to_index_[node_idx] = cnt; + graph_type_keys_[cnt++] = std::move(keys[0]); + } + VLOG(0) << "finish build_graph_type_keys"; + + VLOG(0) << "begin insert feature into graph_total_keys"; + // build feature embedding id + for (auto &it : this->feature_to_id) { + auto node_idx = it.second; + std::vector> keys; + this->get_all_feature_ids(1, node_idx, 1, &keys); + graph_total_keys_.insert( + graph_total_keys_.end(), keys[0].begin(), keys[0].end()); + } + VLOG(0) << "finish insert feature into graph_total_keys"; +} + } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 0855babec83c1..9b416b30d9788 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ 
b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -271,7 +271,6 @@ class RandomSampleLRU { remove(node_head); remove_count--; } - // std::cerr<<"after remove_count = "< *node) { @@ -536,20 +535,27 @@ class GraphTable : public Table { const FsClientParameter &fs_config); virtual int32_t Initialize(const GraphParameter &config); int32_t Load(const std::string &path, const std::string ¶m); - - int32_t load_node_and_edge_file(std::string etype, - std::string ntype, - std::string epath, - std::string npath, + int32_t load_node_and_edge_file(std::string etype2files, + std::string ntype2files, + std::string graph_data_local_path, int part_num, bool reverse); - + int32_t parse_edge_and_load(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse); + int32_t parse_node_and_load(std::string ntype2files, + std::string graph_data_local_path, + int part_num); std::string get_inverse_etype(std::string &etype); - + int32_t parse_type_to_typepath( + std::string &type2files, + std::string graph_data_local_path, + std::vector &res_type, + std::unordered_map &res_type2path); int32_t load_edges(const std::string &path, bool reverse, const std::string &edge_type); - int get_all_id(int type, int slice_num, std::vector> *output); @@ -631,7 +637,13 @@ class GraphTable : public Table { const std::vector> &res); size_t get_server_num() { return server_num; } + void clear_graph(); void clear_graph(int idx); + void clear_edge_shard(); + void clear_feature_shard(); + void release_graph(); + void release_graph_edge(); + void release_graph_node(); virtual int32_t make_neighbor_sample_cache(size_t size_limit, size_t ttl) { { std::unique_lock lock(mutex_); @@ -694,6 +706,14 @@ class GraphTable : public Table { virtual int32_t add_comm_edge(int idx, uint64_t src_id, uint64_t dst_id); virtual int32_t build_sampler(int idx, std::string sample_type = "random"); void set_feature_separator(const std::string &ch); + + void build_graph_total_keys(); + void build_graph_type_keys(); + + std::vector graph_total_keys_; + std::vector> graph_type_keys_; + std::unordered_map type_to_index_; + std::vector> edge_shards, feature_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; int task_pool_size_ = 24; diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index 4feee70fed751..3e4f4d68f49ca 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -292,7 +292,8 @@ std::string CtrDymfAccessor::ParseToString(const float* v, int param) { thread_local std::ostringstream os; os.clear(); os.str(""); - os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4]; + os << common_feature_value.UnseenDays(const_cast(v)) << " " << v[1] + << " " << v[2] << " " << v[3] << " " << v[4]; // << v[5] << " " << v[6]; for (int i = common_feature_value.EmbedG2SumIndex(); i < common_feature_value.EmbedxG2SumIndex(); @@ -320,5 +321,18 @@ int CtrDymfAccessor::ParseFromString(const std::string& str, float* value) { return ret; } +bool CtrDymfAccessor::SaveMemCache(float* value, + int param, + double global_cache_threshold, + uint16_t pass_id) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + return common_feature_value.Show(value) > global_cache_threshold || + common_feature_value.PassId(value) >= pass_id; +} + +void CtrDymfAccessor::UpdatePassId(float* value, uint16_t pass_id) { + 
common_feature_value.PassId(value) = pass_id;
+}
+
 }  // namespace distributed
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h
index b820d617d06ae..047bafedd9d7b 100644
--- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h
+++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h
@@ -30,7 +30,7 @@ namespace distributed {
 class CtrDymfAccessor : public ValueAccessor {
  public:
   struct CtrDymfFeatureValue {
-    /*
+    /*v1: old version
     float unseen_days;
     float delta_score;
     float show;
@@ -44,6 +44,20 @@ class CtrDymfAccessor : public ValueAccessor {
     // float embedx_g2sum;
     std::vector<float> embedx_w;
     */
+    /* V2: support pass_id
+    uint16_t pass_id;
+    uint16_t unseen_days;
+    float show;
+    float click;
+    float embed_w;
+    // float embed_g2sum;
+    std::vector<float> embed_g2sum;
+    float slot;
+    float mf_dim;
+    std::vector<float> embedx_g2sum;
+    // float embedx_g2sum;
+    std::vector<float> embedx_w;
+    */
     int Dim() { return 7 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; }
     int DimSize(size_t dim, int embedx_dim) { return sizeof(float); }
@@ -73,7 +87,17 @@ class CtrDymfAccessor : public ValueAccessor {
     // total byte size computed from mf_dim
     int Size(int& mf_dim) { return (Dim(mf_dim)) * sizeof(float); }
-    float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; }
+    uint16_t& PassId(float* val) {
+      uint16_t* int16_val =
+          reinterpret_cast<uint16_t*>(val + UnseenDaysIndex());
+      return int16_val[0];
+    }
+
+    uint16_t& UnseenDays(float* val) {
+      uint16_t* int16_val =
+          reinterpret_cast<uint16_t*>(val + UnseenDaysIndex());
+      return int16_val[1];
+    }
     float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; }
     float& Show(float* val) { return val[ShowIndex()]; }
     float& Click(float* val) { return val[ClickIndex()]; }
@@ -217,6 +241,14 @@ class CtrDymfAccessor : public ValueAccessor {
     return 0.0;
   }
 
+  // decide whether to cache a value to SSD based on pass_id and the show threshold
+  bool SaveMemCache(float* value,
+                    int param,
+                    double global_cache_threshold,
+                    uint16_t pass_id);
+  // update pass_id
+  void UpdatePassId(float* value, uint16_t pass_id);
+
  private:
   // float ShowClickScore(float show, float click);
diff --git a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
index eb3ff2e254f56..5b5a0057f3707 100644
--- a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
+++ b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
@@ -16,8 +16,12 @@
 #include
 #include
 #include
+#include
 #include
+#include
+#include
 #include
+#include
 #include
 #include
@@ -27,6 +31,55 @@
 namespace paddle {
 namespace distributed {
 
+class Uint64Comparator : public rocksdb::Comparator {
+  int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const {
+    uint64_t A = *((uint64_t*)const_cast<char*>(a.data()));
+    uint64_t B = *((uint64_t*)const_cast<char*>(b.data()));
+    if (A < B) {
+      return -1;
+    }
+    if (A > B) {
+      return 1;
+    }
+    return 0;
+  }
+  const char* Name() const { return "Uint64Comparator"; }
+  void FindShortestSeparator(std::string*, const rocksdb::Slice&) const {}
+  void FindShortSuccessor(std::string*) const {}
+};
+
+class RocksDBItem {
+ public:
+  RocksDBItem() {}
+  ~RocksDBItem() {}
+  void reset() {
+    batch_keys.clear();
+    batch_index.clear();
+    batch_values.clear();
+    status.clear();
+  }
+  std::vector<rocksdb::Slice> batch_keys;
+  std::vector<int> batch_index;
+  std::vector<rocksdb::PinnableSlice> batch_values;
+  std::vector<rocksdb::Status> status;
+};
+
+class RocksDBCtx {
+ public:
+  RocksDBCtx() {
+    items[0].reset();
+    items[1].reset();
+    cur_index = 0;
+  }
+  ~RocksDBCtx() {}
+  RocksDBItem* switch_item()
{ + cur_index = (cur_index + 1) % 2; + return &items[cur_index]; + } + RocksDBItem items[2]; + int cur_index; +}; + class RocksDBHandler { public: RocksDBHandler() {} @@ -38,55 +91,69 @@ class RocksDBHandler { } int initialize(const std::string& db_path, const int colnum) { - VLOG(3) << "db path: " << db_path << " colnum: " << colnum; - rocksdb::Options options; - rocksdb::BlockBasedTableOptions bbto; - bbto.block_size = 4 * 1024; - bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024); - bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024); - bbto.cache_index_and_filter_blocks = false; - bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(20, false)); - bbto.whole_key_filtering = true; - options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); - - options.keep_log_file_num = 100; - options.max_log_file_size = 50 * 1024 * 1024; // 50MB - options.create_if_missing = true; - options.use_direct_reads = true; - options.max_background_flushes = 5; - options.max_background_compactions = 5; - options.base_background_compactions = 10; - options.write_buffer_size = 256 * 1024 * 1024; // 256MB - options.max_write_buffer_number = 8; - options.max_bytes_for_level_base = - options.max_write_buffer_number * options.write_buffer_size; - options.min_write_buffer_number_to_merge = 1; - options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB - options.memtable_prefix_bloom_size_ratio = 0.02; - options.num_levels = 4; - options.max_open_files = -1; - - options.compression = rocksdb::kNoCompression; - options.level0_file_num_compaction_trigger = 8; - options.level0_slowdown_writes_trigger = - 1.8 * options.level0_file_num_compaction_trigger; - options.level0_stop_writes_trigger = - 3.6 * options.level0_file_num_compaction_trigger; - - if (!db_path.empty()) { - std::string rm_cmd = "rm -rf " + db_path; - system(rm_cmd.c_str()); - } - - rocksdb::Status s = rocksdb::DB::Open(options, db_path, &_db); - assert(s.ok()); - _handles.resize(colnum); + VLOG(0) << "db path: " << db_path << " colnum: " << colnum; + _dbs.resize(colnum); for (int i = 0; i < colnum; i++) { - s = _db->CreateColumnFamily( - options, "shard_" + std::to_string(i), &_handles[i]); + rocksdb::Options options; + options.comparator = &_comparator; + rocksdb::BlockBasedTableOptions bbto; + // options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(65536)); + // options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(2)); + bbto.format_version = 5; + bbto.use_delta_encoding = false; + bbto.block_size = 4 * 1024; + bbto.block_restart_interval = 6; + bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024); + // bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024); + bbto.cache_index_and_filter_blocks = false; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(15, false)); + bbto.whole_key_filtering = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); + + // options.IncreaseParallelism(); + options.OptimizeLevelStyleCompaction(); + options.keep_log_file_num = 100; + // options.db_log_dir = "./log/rocksdb"; + options.max_log_file_size = 50 * 1024 * 1024; // 50MB + // options.threads = 8; + options.create_if_missing = true; + options.use_direct_reads = true; + options.max_background_flushes = 37; + options.max_background_compactions = 64; + options.base_background_compactions = 10; + options.write_buffer_size = 256 * 1024 * 1024; // 256MB + options.max_write_buffer_number = 8; + 
options.max_bytes_for_level_base = + options.max_write_buffer_number * options.write_buffer_size; + options.min_write_buffer_number_to_merge = 1; + options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB + // options.verify_checksums_in_compaction = false; + // options.disable_auto_compactions = true; + options.memtable_prefix_bloom_size_ratio = 0.02; + options.num_levels = 4; + options.max_open_files = -1; + + options.compression = rocksdb::kNoCompression; + // options.compaction_options_fifo = rocksdb::CompactionOptionsFIFO(); + // options.compaction_style = + // rocksdb::CompactionStyle::kCompactionStyleFIFO; + options.level0_file_num_compaction_trigger = 5; + options.level0_slowdown_writes_trigger = + 1.8 * options.level0_file_num_compaction_trigger; + options.level0_stop_writes_trigger = + 3.6 * options.level0_file_num_compaction_trigger; + + std::string shard_path = db_path + "_" + std::to_string(i); + if (!shard_path.empty()) { + std::string rm_cmd = "rm -rf " + shard_path; + system(rm_cmd.c_str()); + } + + rocksdb::Status s = rocksdb::DB::Open(options, shard_path, &_dbs[i]); assert(s.ok()); } - LOG(INFO) << "DB initialize success, colnum:" << colnum; + VLOG(0) << "DB initialize success, colnum:" << colnum; return 0; } @@ -94,10 +161,9 @@ class RocksDBHandler { int id, const char* key, int key_len, const char* value, int value_len) { rocksdb::WriteOptions options; options.disableWAL = true; - rocksdb::Status s = _db->Put(options, - _handles[id], - rocksdb::Slice(key, key_len), - rocksdb::Slice(value, value_len)); + rocksdb::Status s = _dbs[id]->Put(options, + rocksdb::Slice(key, key_len), + rocksdb::Slice(value, value_len)); assert(s.ok()); return 0; } @@ -110,20 +176,17 @@ class RocksDBHandler { options.disableWAL = true; rocksdb::WriteBatch batch(n * 128); for (int i = 0; i < n; i++) { - batch.Put(_handles[id], - rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second), + batch.Put(rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second), rocksdb::Slice(ssd_values[i].first, ssd_values[i].second)); } - rocksdb::Status s = _db->Write(options, &batch); + rocksdb::Status s = _dbs[id]->Write(options, &batch); assert(s.ok()); return 0; } int get(int id, const char* key, int key_len, std::string& value) { - rocksdb::Status s = _db->Get(rocksdb::ReadOptions(), - _handles[id], - rocksdb::Slice(key, key_len), - &value); + rocksdb::Status s = _dbs[id]->Get( + rocksdb::ReadOptions(), rocksdb::Slice(key, key_len), &value); if (s.IsNotFound()) { return 1; } @@ -131,33 +194,58 @@ class RocksDBHandler { return 0; } + void multi_get(int id, + const size_t num_keys, + const rocksdb::Slice* keys, + rocksdb::PinnableSlice* values, + rocksdb::Status* status, + const bool sorted_input = true) { + rocksdb::ColumnFamilyHandle* handle = _dbs[id]->DefaultColumnFamily(); + auto read_opt = rocksdb::ReadOptions(); + read_opt.fill_cache = false; + _dbs[id]->MultiGet( + read_opt, handle, num_keys, keys, values, status, sorted_input); + } + int del_data(int id, const char* key, int key_len) { rocksdb::WriteOptions options; options.disableWAL = true; - rocksdb::Status s = - _db->Delete(options, _handles[id], rocksdb::Slice(key, key_len)); + rocksdb::Status s = _dbs[id]->Delete(options, rocksdb::Slice(key, key_len)); assert(s.ok()); return 0; } int flush(int id) { - rocksdb::Status s = _db->Flush(rocksdb::FlushOptions(), _handles[id]); + rocksdb::Status s = _dbs[id]->Flush(rocksdb::FlushOptions()); assert(s.ok()); return 0; } rocksdb::Iterator* get_iterator(int id) { - return 
_db->NewIterator(rocksdb::ReadOptions(), _handles[id]); + return _dbs[id]->NewIterator(rocksdb::ReadOptions()); } int get_estimate_key_num(uint64_t& num_keys) { - _db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys); + // _db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys); + return 0; + } + + Uint64Comparator* get_comparator() { return &_comparator; } + + int ingest_externel_file(int id, + const std::vector& sst_filelist) { + rocksdb::IngestExternalFileOptions ifo; + ifo.move_files = true; + rocksdb::Status s = _dbs[id]->IngestExternalFile(sst_filelist, ifo); + assert(s.ok()); return 0; } private: std::vector _handles; - rocksdb::DB* _db; + // rocksdb::DB* _db; + std::vector _dbs; + Uint64Comparator _comparator; }; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index a46244265ef20..9a69433c6104a 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -172,7 +172,6 @@ int32_t MemorySparseTable::Load(const std::string& path, value.resize(feature_value_size); int parse_size = _value_accesor->ParseFromString(++end, value.data()); value.resize(parse_size); - } read_channel->close(); if (err_no == -1) { @@ -725,7 +724,8 @@ int32_t MemorySparseTable::Pull(TableContext& context) { if (context.use_ptr) { char** pull_values = context.pull_context.ptr_values; const uint64_t* keys = context.pull_context.keys; - return PullSparsePtr(pull_values, keys, context.num); + return PullSparsePtr( + context.shard_id, pull_values, keys, context.num, context.pass_id); } else { float* pull_values = context.pull_context.values; const PullSparseValue& pull_value = context.pull_context.pull_value; @@ -822,9 +822,11 @@ int32_t MemorySparseTable::PullSparse(float* pull_values, return 0; } -int32_t MemorySparseTable::PullSparsePtr(char** pull_values, +int32_t MemorySparseTable::PullSparsePtr(int shard_id, // fake num + char** pull_values, const uint64_t* keys, - size_t num) { + size_t num, + uint16_t pass_id) { CostTimer timer("pscore_sparse_select_all"); size_t value_size = _value_accesor->GetAccessorInfo().size / sizeof(float); size_t mf_value_size = diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index 17018d5e5dfc3..658446d770c71 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -90,7 +90,11 @@ class MemorySparseTable : public Table { std::pair PrintTableStat() override; int32_t PullSparse(float* values, const PullSparseValue& pull_value); - int32_t PullSparsePtr(char** pull_values, const uint64_t* keys, size_t num); + int32_t PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* keys, + size_t num, + uint16_t pass_id); int32_t PushSparse(const uint64_t* keys, const float* values, size_t num); diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 3e0f631ed41bc..b2bd0b36795ad 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -24,8 +24,10 @@ DECLARE_bool(pserver_print_missed_key_num_every_push); DECLARE_bool(pserver_create_value_when_push); DECLARE_bool(pserver_enable_create_feasign_randomly); DEFINE_bool(pserver_open_strict_check, false, 
"pserver_open_strict_check"); -DEFINE_string(rocksdb_path, "database", "path of sparse table rocksdb file"); DEFINE_int32(pserver_load_batch_size, 5000, "load batch size for ssd"); +PADDLE_DEFINE_EXPORTED_string(rocksdb_path, + "database", + "path of sparse table rocksdb file"); namespace paddle { namespace distributed { @@ -34,6 +36,9 @@ int32_t SSDSparseTable::Initialize() { MemorySparseTable::Initialize(); _db = paddle::distributed::RocksDBHandler::GetInstance(); _db->initialize(FLAGS_rocksdb_path, _real_local_shard_num); + VLOG(0) << "initalize SSDSparseTable succ"; + VLOG(0) << "SSD FLAGS_pserver_print_missed_key_num_every_push:" + << FLAGS_pserver_print_missed_key_num_every_push; return 0; } @@ -44,7 +49,8 @@ int32_t SSDSparseTable::Pull(TableContext& context) { if (context.use_ptr) { char** pull_values = context.pull_context.ptr_values; const uint64_t* keys = context.pull_context.keys; - return PullSparsePtr(pull_values, keys, context.num); + return PullSparsePtr( + context.shard_id, pull_values, keys, context.num, context.pass_id); } else { float* pull_values = context.pull_context.values; const PullSparseValue& pull_value = context.pull_context.pull_value; @@ -171,90 +177,139 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, return 0; } -int32_t SSDSparseTable::PullSparsePtr(char** pull_values, - const uint64_t* keys, - size_t num) { +int32_t SSDSparseTable::PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* pull_keys, + size_t num, + uint16_t pass_id) { CostTimer timer("pserver_ssd_sparse_select_all"); size_t value_size = _value_accesor->GetAccessorInfo().size / sizeof(float); size_t mf_value_size = _value_accesor->GetAccessorInfo().mf_size / sizeof(float); { // 从table取值 or create - std::vector> tasks(_real_local_shard_num); - std::vector>> task_keys( - _real_local_shard_num); - for (size_t i = 0; i < num; ++i) { - int shard_id = (keys[i] % _sparse_table_shard_num) % _avg_local_shard_num; - task_keys[shard_id].push_back({keys[i], i}); - } + RocksDBCtx context; + std::vector> tasks; + RocksDBItem* cur_ctx = context.switch_item(); + cur_ctx->reset(); + FixedFeatureValue* ret = NULL; + auto& local_shard = _local_shards[shard_id]; + float data_buffer[value_size]; + float* data_buffer_ptr = data_buffer; - std::atomic missed_keys{0}; - for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { - tasks[shard_id] = + for (int i = 0; i < num; ++i) { + uint64_t key = pull_keys[i]; + auto itr = local_shard.find(key); + if (itr == local_shard.end()) { + cur_ctx->batch_index.push_back(i); + cur_ctx->batch_keys.push_back( + rocksdb::Slice((char*)&(pull_keys[i]), sizeof(uint64_t))); + if (cur_ctx->batch_keys.size() == 1024) { + cur_ctx->batch_values.resize(cur_ctx->batch_keys.size()); + cur_ctx->status.resize(cur_ctx->batch_keys.size()); + auto fut = + _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [this, shard_id, cur_ctx]() -> int { + _db->multi_get(shard_id, + cur_ctx->batch_keys.size(), + cur_ctx->batch_keys.data(), + cur_ctx->batch_values.data(), + cur_ctx->status.data()); + return 0; + }); + cur_ctx = context.switch_item(); + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); + for (size_t idx = 0; idx < cur_ctx->status.size(); idx++) { + uint64_t cur_key = *((uint64_t*)const_cast( + cur_ctx->batch_keys[idx].data())); + if (cur_ctx->status[idx].IsNotFound()) { + auto& feature_value = local_shard[cur_key]; + int init_size = value_size - mf_value_size; + feature_value.resize(init_size); + 
_value_accesor->Create(&data_buffer_ptr, 1); + memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + init_size * sizeof(float)); + ret = &feature_value; + } else { + int data_size = + cur_ctx->batch_values[idx].size() / sizeof(float); + // from rocksdb to mem + auto& feature_value = local_shard[cur_key]; + feature_value.resize(data_size); + memcpy(const_cast(feature_value.data()), + paddle::string::str_to_float( + cur_ctx->batch_values[idx].data()), + data_size * sizeof(float)); + _db->del_data(shard_id, (char*)&cur_key, sizeof(uint64_t)); + ret = &feature_value; + } + _value_accesor->UpdatePassId(ret->data(), pass_id); + int pull_data_idx = cur_ctx->batch_index[idx]; + pull_values[pull_data_idx] = (char*)ret; + } + } + cur_ctx->reset(); + tasks.clear(); + tasks.push_back(std::move(fut)); + } + } else { + ret = itr.value_ptr(); + // int pull_data_idx = keys[i].second; + _value_accesor->UpdatePassId(ret->data(), pass_id); + pull_values[i] = (char*)ret; + } + } + if (cur_ctx->batch_keys.size() != 0) { + cur_ctx->batch_values.resize(cur_ctx->batch_keys.size()); + cur_ctx->status.resize(cur_ctx->batch_keys.size()); + auto fut = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( - [this, - shard_id, - &task_keys, - value_size, - mf_value_size, - pull_values, - &missed_keys]() -> int { - auto& keys = task_keys[shard_id]; - auto& local_shard = _local_shards[shard_id]; - float data_buffer[value_size]; // NOLINT - float* data_buffer_ptr = data_buffer; - for (size_t i = 0; i < keys.size(); ++i) { - uint64_t key = keys[i].first; - auto itr = local_shard.find(key); - size_t data_size = value_size - mf_value_size; - FixedFeatureValue* ret = NULL; - if (itr == local_shard.end()) { - // pull rocksdb - std::string tmp_string(""); - if (_db->get(shard_id, - reinterpret_cast(&key), - sizeof(uint64_t), - tmp_string) > 0) { - ++missed_keys; - auto& feature_value = local_shard[key]; - feature_value.resize(data_size); - float* data_ptr = - const_cast(feature_value.data()); - _value_accesor->Create(&data_buffer_ptr, 1); - memcpy( - data_ptr, data_buffer_ptr, data_size * sizeof(float)); - ret = &feature_value; - } else { - data_size = tmp_string.size() / sizeof(float); - memcpy(data_buffer_ptr, - paddle::string::str_to_float(tmp_string), - data_size * sizeof(float)); - // from rocksdb to mem - auto& feature_value = local_shard[key]; - feature_value.resize(data_size); - memcpy(const_cast(feature_value.data()), - data_buffer_ptr, - data_size * sizeof(float)); - _db->del_data(shard_id, - reinterpret_cast(&key), - sizeof(uint64_t)); - ret = &feature_value; - } - } else { - ret = itr.value_ptr(); - } - int pull_data_idx = keys[i].second; - pull_values[pull_data_idx] = reinterpret_cast(ret); - } + [this, shard_id, cur_ctx]() -> int { + _db->multi_get(shard_id, + cur_ctx->batch_keys.size(), + cur_ctx->batch_keys.data(), + cur_ctx->batch_values.data(), + cur_ctx->status.data()); return 0; }); + tasks.push_back(std::move(fut)); } - for (int i = 0; i < _real_local_shard_num; ++i) { - tasks[i].wait(); + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); } - if (FLAGS_pserver_print_missed_key_num_every_push) { - LOG(WARNING) << "total pull keys:" << num - << " missed_keys:" << missed_keys.load(); + for (size_t x = 0; x < 2; x++) { + cur_ctx = context.switch_item(); + for (size_t idx = 0; idx < cur_ctx->status.size(); idx++) { + uint64_t cur_key = + *((uint64_t*)const_cast(cur_ctx->batch_keys[idx].data())); + if (cur_ctx->status[idx].IsNotFound()) { + auto& feature_value = 
local_shard[cur_key]; + int init_size = value_size - mf_value_size; + feature_value.resize(init_size); + _value_accesor->Create(&data_buffer_ptr, 1); + memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + init_size * sizeof(float)); + ret = &feature_value; + } else { + int data_size = cur_ctx->batch_values[idx].size() / sizeof(float); + // from rocksdb to mem + auto& feature_value = local_shard[cur_key]; + feature_value.resize(data_size); + memcpy( + const_cast(feature_value.data()), + paddle::string::str_to_float(cur_ctx->batch_values[idx].data()), + data_size * sizeof(float)); + _db->del_data(shard_id, (char*)&cur_key, sizeof(uint64_t)); + ret = &feature_value; + } + _value_accesor->UpdatePassId(ret->data(), pass_id); + int pull_data_idx = cur_ctx->batch_index[idx]; + pull_values[pull_data_idx] = (char*)ret; + } + cur_ctx->reset(); } } return 0; @@ -536,157 +591,165 @@ int32_t SSDSparseTable::Save(const std::string& path, // } // LOG(INFO) << "table cache rate is: " << _config.sparse_table_cache_rate(); - LOG(INFO) << "table cache rate is: " << _config.sparse_table_cache_rate(); - LOG(INFO) << "enable_sparse_table_cache: " - << _config.enable_sparse_table_cache(); - LOG(INFO) << "LocalSize: " << LocalSize(); + VLOG(0) << "table cache rate is: " << _config.sparse_table_cache_rate(); + VLOG(0) << "enable_sparse_table_cache: " + << _config.enable_sparse_table_cache(); + VLOG(0) << "LocalSize: " << LocalSize(); if (_config.enable_sparse_table_cache()) { - LOG(INFO) << "Enable sparse table cache, top n:" << _cache_tk_size; + VLOG(0) << "Enable sparse table cache, top n:" << _cache_tk_size; } _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); TopkCalculator tk(_real_local_shard_num, _cache_tk_size); + VLOG(0) << "TopkCalculator top n:" << _cache_tk_size; size_t file_start_idx = _avg_local_shard_num * _shard_idx; std::string table_path = TableDir(path); _afs_client.remove(paddle::string::format_string( "%s/part-%03d-*", table_path.c_str(), _shard_idx)); +#ifdef PADDLE_WITH_GPU_GRAPH + int thread_num = _real_local_shard_num; +#else int thread_num = _real_local_shard_num < 20 ? 
_real_local_shard_num : 20; +#endif // std::atomic feasign_size; std::atomic feasign_size_all{0}; // feasign_size = 0; - omp_set_num_threads(thread_num); -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < _real_local_shard_num; ++i) { + std::vector< + paddle::framework::Channel>>> + fs_channel; + for (int i = 0; i < _real_local_shard_num; i++) { + fs_channel.push_back( + paddle::framework::MakeChannel>>( + 10240)); + } + std::vector threads; + threads.resize(_real_local_shard_num); + + auto save_func = [this, + &save_param, + &table_path, + &file_start_idx, + &fs_channel](int file_num) { + int err_no = 0; FsChannelConfig channel_config; if (_config.compress_in_save() && (save_param == 0 || save_param == 3)) { channel_config.path = paddle::string::format_string("%s/part-%03d-%05d.gz", table_path.c_str(), _shard_idx, - file_start_idx + i); + file_start_idx + file_num); } else { - channel_config.path = paddle::string::format_string("%s/part-%03d-%05d", - table_path.c_str(), - _shard_idx, - file_start_idx + i); + channel_config.path = + paddle::string::format_string("%s/part-%03d-%05d", + table_path.c_str(), + _shard_idx, + file_start_idx + file_num); } channel_config.converter = _value_accesor->Converter(save_param).converter; channel_config.deconverter = _value_accesor->Converter(save_param).deconverter; - int err_no = 0; - int retry_num = 0; - bool is_write_failed = false; + auto write_channel = + _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no); + paddle::framework::ChannelReader>> + reader(fs_channel[file_num].get()); + std::pair> out_str; + while (reader >> out_str) { + std::string format_value = _value_accesor->ParseToString( + out_str.second.data(), out_str.second.size()); + if (0 != write_channel->write_line(paddle::string::format_string( + "%lu %s", out_str.first, format_value.c_str()))) { + LOG(FATAL) << "SSDSparseTable save failed, retry it! path:" + << channel_config.path; + } + } + write_channel->close(); + }; + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(save_func, i); + } + + std::vector< + paddle::framework::ChannelWriter>>> + writers(_real_local_shard_num); + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < _real_local_shard_num; ++i) { int feasign_size = 0; auto& shard = _local_shards[i]; - do { - err_no = 0; - feasign_size = 0; - is_write_failed = false; - auto write_channel = - _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no); + auto& writer = writers[i]; + writer.Reset(fs_channel[i].get()); + { for (auto it = shard.begin(); it != shard.end(); ++it) { if (_config.enable_sparse_table_cache() && - (save_param == 1 || save_param == 2) && - _value_accesor->Save(it.value().data(), 4)) { - // tk.push(i, it.value().data()[2]); + (save_param == 1 || save_param == 2)) { + // get_field get right decayed show tk.push(i, _value_accesor->GetField(it.value().data(), "show")); } if (_value_accesor->Save(it.value().data(), save_param)) { - std::string format_value = _value_accesor->ParseToString( - it.value().data(), it.value().size()); - if (0 != write_channel->write_line(paddle::string::format_string( - "%lu %s", it.key(), format_value.c_str()))) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed, retry it! 
path:" - << channel_config.path << ", retry_num=" << retry_num; - break; - } + std::vector feature_value; + feature_value.resize(it.value().size()); + memcpy(const_cast(feature_value.data()), + it.value().data(), + it.value().size() * sizeof(float)); + writer << std::make_pair(it.key(), std::move(feature_value)); ++feasign_size; } } + } - if (err_no == -1 && !is_write_failed) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed after write, retry it! " - << "path:" << channel_config.path - << " , retry_num=" << retry_num; - } - if (is_write_failed) { - _afs_client.remove(channel_config.path); - continue; - } - - // delta and cache and revert is all in mem, base in rocksdb - if (save_param != 1) { - auto* it = _db->get_iterator(i); - for (it->SeekToFirst(); it->Valid(); it->Next()) { - bool need_save = _value_accesor->Save( - paddle::string::str_to_float(it->value().data()), save_param); - _value_accesor->UpdateStatAfterSave( - paddle::string::str_to_float(it->value().data()), save_param); - if (need_save) { - std::string format_value = _value_accesor->ParseToString( - paddle::string::str_to_float(it->value().data()), - it->value().size() / sizeof(float)); - if (0 != write_channel->write_line(paddle::string::format_string( - "%lu %s", - *((uint64_t*)const_cast(it->key().data())), - format_value.c_str()))) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed, retry it! path:" - << channel_config.path << ", retry_num=" << retry_num; - break; - } - if (save_param == 3) { - _db->put(i, - it->key().data(), - it->key().size(), - it->value().data(), - it->value().size()); - } - ++feasign_size; - } + if (save_param != 1) { + auto* it = _db->get_iterator(i); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + bool need_save = _value_accesor->Save( + paddle::string::str_to_float(it->value().data()), save_param); + _value_accesor->UpdateStatAfterSave( + paddle::string::str_to_float(it->value().data()), save_param); + if (need_save) { + std::vector feature_value; + feature_value.resize(it->value().size() / sizeof(float)); + memcpy(const_cast(feature_value.data()), + paddle::string::str_to_float(it->value().data()), + it->value().size()); + writer << std::make_pair( + *((uint64_t*)const_cast(it->key().data())), + std::move(feature_value)); + ++feasign_size; } - delete it; } + delete it; + } - write_channel->close(); - if (err_no == -1) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed after write, retry it! 
" - << "path:" << channel_config.path - << " , retry_num=" << retry_num; - } - if (is_write_failed) { - _afs_client.remove(channel_config.path); - } - } while (is_write_failed); + writer.Flush(); + fs_channel[i]->Close(); feasign_size_all += feasign_size; for (auto it = shard.begin(); it != shard.end(); ++it) { _value_accesor->UpdateStatAfterSave(it.value().data(), save_param); } } + for (int i = 0; i < threads.size(); i++) { + threads[i].join(); + } + for (int i = 0; i < fs_channel.size(); i++) { + fs_channel[i].reset(); + } + fs_channel.clear(); + if (save_param == 3) { - UpdateTable(); + // UpdateTable(); _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); - LOG(INFO) << "SSDSparseTable update success."; - } - LOG(INFO) << "SSDSparseTable save success, path:" - << paddle::string::format_string("%s/%03d/part-%03d-", - path.c_str(), - _config.table_id(), - _shard_idx) - << " from " << file_start_idx << " to " - << file_start_idx + _real_local_shard_num - 1; - // return feasign_size_all; + VLOG(0) << "SSDSparseTable update success."; + } + VLOG(0) << "SSDSparseTable save success, feasign size:" << feasign_size_all + << ", path:" + << paddle::string::format_string("%s/%03d/part-%03d-", + path.c_str(), + _config.table_id(), + _shard_idx) + << " from " << file_start_idx << " to " + << file_start_idx + _real_local_shard_num - 1; _local_show_threshold = tk.top(); - LOG(INFO) << "local cache threshold: " << _local_show_threshold; - // int32 may overflow need to change return value + VLOG(0) << "local cache threshold: " << _local_show_threshold; return 0; } @@ -857,7 +920,167 @@ int32_t SSDSparseTable::SaveCache( int32_t SSDSparseTable::Load(const std::string& path, const std::string& param) { - return MemorySparseTable::Load(path, param); + VLOG(0) << "LOAD FLAGS_rocksdb_path:" << FLAGS_rocksdb_path; + std::string table_path = TableDir(path); + auto file_list = _afs_client.list(table_path); + + // std::sort(file_list.begin(), file_list.end()); + for (auto file : file_list) { + VLOG(1) << "SSDSparseTable::Load() file list: " << file; + } + + int load_param = atoi(param.c_str()); + size_t expect_shard_num = _sparse_table_shard_num; + if (file_list.size() != expect_shard_num) { + LOG(WARNING) << "SSDSparseTable file_size:" << file_list.size() + << " not equal to expect_shard_num:" << expect_shard_num; + return -1; + } + if (file_list.size() == 0) { + LOG(WARNING) << "SSDSparseTable load file is empty, path:" << path; + return -1; + } + + size_t file_start_idx = _shard_idx * _avg_local_shard_num; + + if (file_start_idx >= file_list.size()) { + return 0; + } + + size_t feature_value_size = + _value_accesor->GetAccessorInfo().size / sizeof(float); + size_t mf_value_size = + _value_accesor->GetAccessorInfo().mf_size / sizeof(float); + +#ifdef PADDLE_WITH_HETERPS + int thread_num = _real_local_shard_num; +#else + int thread_num = _real_local_shard_num < 15 ? 
_real_local_shard_num : 15; +#endif + + for (int i = 0; i < _real_local_shard_num; i++) { + _fs_channel.push_back(paddle::framework::MakeChannel(30000)); + } + + std::vector threads; + threads.resize(thread_num); + auto load_func = [this, &file_start_idx, &file_list, &load_param]( + int file_num) { + int err_no = 0; + FsChannelConfig channel_config; + channel_config.path = file_list[file_num + file_start_idx]; + VLOG(1) << "SSDSparseTable::load begin load " << channel_config.path + << " into local shard " << file_num; + channel_config.converter = _value_accesor->Converter(load_param).converter; + channel_config.deconverter = + _value_accesor->Converter(load_param).deconverter; + + std::string line_data; + auto read_channel = _afs_client.open_r(channel_config, 0, &err_no); + paddle::framework::ChannelWriter writer( + _fs_channel[file_num].get()); + while (read_channel->read_line(line_data) == 0 && line_data.size() > 1) { + writer << line_data; + } + writer.Flush(); + read_channel->close(); + _fs_channel[file_num]->Close(); + }; + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(load_func, i); + } + + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < _real_local_shard_num; ++i) { + std::vector> ssd_keys; + std::vector> ssd_values; + std::vector tmp_key; + ssd_keys.reserve(FLAGS_pserver_load_batch_size); + ssd_values.reserve(FLAGS_pserver_load_batch_size); + tmp_key.reserve(FLAGS_pserver_load_batch_size); + ssd_keys.clear(); + ssd_values.clear(); + tmp_key.clear(); + std::string line_data; + char* end = NULL; + int local_shard_id = i % _avg_local_shard_num; + auto& shard = _local_shards[local_shard_id]; + float data_buffer[FLAGS_pserver_load_batch_size * feature_value_size]; + float* data_buffer_ptr = data_buffer; + uint64_t mem_count = 0; + uint64_t ssd_count = 0; + uint64_t mem_mf_count = 0; + uint64_t ssd_mf_count = 0; + + paddle::framework::ChannelReader reader(_fs_channel[i].get()); + + while (reader >> line_data) { + uint64_t key = std::strtoul(line_data.data(), &end, 10); + if (FLAGS_pserver_open_strict_check) { + if (key % _sparse_table_shard_num != (i + file_start_idx)) { + LOG(WARNING) << "SSDSparseTable key:" << key << " not match shard," + << " file_idx:" << i + << " shard num:" << _sparse_table_shard_num; + continue; + } + } + size_t value_size = + _value_accesor->ParseFromString(++end, data_buffer_ptr); + // ssd or mem + if (_value_accesor->SaveSSD(data_buffer_ptr)) { + tmp_key.emplace_back(key); + ssd_keys.emplace_back( + std::make_pair((char*)&tmp_key.back(), sizeof(uint64_t))); + ssd_values.emplace_back( + std::make_pair((char*)data_buffer_ptr, value_size * sizeof(float))); + data_buffer_ptr += feature_value_size; + if (static_cast(ssd_keys.size()) == + FLAGS_pserver_load_batch_size) { + _db->put_batch(local_shard_id, ssd_keys, ssd_values, ssd_keys.size()); + ssd_keys.clear(); + ssd_values.clear(); + tmp_key.clear(); + data_buffer_ptr = data_buffer; + } + ssd_count++; + if (value_size > feature_value_size - mf_value_size) { + ssd_mf_count++; + } + } else { + auto& value = shard[key]; + value.resize(value_size); + _value_accesor->ParseFromString(end, value.data()); + mem_count++; + if (value_size > feature_value_size - mf_value_size) { + mem_mf_count++; + } + } + } + // last batch + if (ssd_keys.size() > 0) { + _db->put_batch(local_shard_id, ssd_keys, ssd_values, ssd_keys.size()); + } + + _db->flush(local_shard_id); + VLOG(0) << "Table>> load done. 
ALL[" << mem_count + ssd_count << "] MEM[" + << mem_count << "] MEM_MF[" << mem_mf_count << "] SSD[" << ssd_count + << "] SSD_MF[" << ssd_mf_count << "]."; + } + for (int i = 0; i < threads.size(); i++) { + threads[i].join(); + } + for (int i = 0; i < _fs_channel.size(); i++) { + _fs_channel[i].reset(); + } + _fs_channel.clear(); + LOG(INFO) << "load num:" << LocalSize(); + LOG(INFO) << "SSDSparseTable load success, path from " + << file_list[file_start_idx] << " to " + << file_list[file_start_idx + _real_local_shard_num - 1]; + + _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); + return 0; } //加载path目录下数据[start_idx, end_idx) @@ -877,7 +1100,11 @@ int32_t SSDSparseTable::Load(size_t start_idx, end_idx = static_cast(end_idx) < _sparse_table_shard_num ? end_idx : _sparse_table_shard_num; +#ifdef PADDLE_WITH_HETERPS + int thread_num = end_idx - start_idx; +#else int thread_num = (end_idx - start_idx) < 20 ? (end_idx - start_idx) : 20; +#endif omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) for (size_t i = start_idx; i < end_idx; ++i) { @@ -994,5 +1221,164 @@ int32_t SSDSparseTable::Load(size_t start_idx, return 0; } +std::pair SSDSparseTable::PrintTableStat() { + int64_t feasign_size = LocalSize(); + return {feasign_size, -1}; +} + +int32_t SSDSparseTable::CacheTable(uint16_t pass_id) { + // acquire_table_mutex(); + VLOG(0) << "cache_table"; + std::atomic count{0}; + auto thread_num = _real_local_shard_num; + std::vector> tasks; + + double show_threshold = 10000000; + + //保证cache数据不被淘汰掉 + if (_config.enable_sparse_table_cache()) { + if (_local_show_threshold < show_threshold) { + show_threshold = _local_show_threshold; + } + } + + if (show_threshold < 500) { + show_threshold = 500; + } + VLOG(0) << " show_threshold:" << show_threshold + << " ; local_show_threshold:" << _local_show_threshold; + VLOG(0) << "Table>> origin mem feasign size:" << LocalSize(); + static int cache_table_count = 0; + ++cache_table_count; + for (size_t shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + // from mem to ssd + auto fut = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [shard_id, this, &count, show_threshold, pass_id]() -> int { + rocksdb::Options options; + options.comparator = _db->get_comparator(); + rocksdb::BlockBasedTableOptions bbto; + bbto.format_version = 5; + bbto.use_delta_encoding = false; + bbto.block_size = 4 * 1024; + bbto.block_restart_interval = 6; + bbto.cache_index_and_filter_blocks = false; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(15, false)); + bbto.whole_key_filtering = true; + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); + options.OptimizeLevelStyleCompaction(); + options.keep_log_file_num = 100; + options.max_log_file_size = 50 * 1024 * 1024; // 50MB + options.create_if_missing = true; + options.use_direct_reads = true; + options.write_buffer_size = 64 * 1024 * 1024; // 256MB + options.max_write_buffer_number = 4; + options.max_bytes_for_level_base = + options.max_write_buffer_number * options.write_buffer_size; + options.min_write_buffer_number_to_merge = 1; + options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB + options.memtable_prefix_bloom_size_ratio = 0.02; + options.num_levels = 4; + options.max_open_files = -1; + + options.compression = rocksdb::kNoCompression; + + auto& shard = _local_shards[shard_id]; + if (1) { + using DataType = shard_type::map_type::iterator; + std::vector datas; + datas.reserve(shard.size() * 0.8); + size_t idx = 0; + for (auto 
it = shard.begin(); it != shard.end(); ++it) { + if (!_value_accesor->SaveMemCache( + it.value().data(), 0, show_threshold, pass_id)) { + datas.emplace_back(it.it); + } + } + count.fetch_add(datas.size(), std::memory_order_relaxed); + VLOG(0) << "datas size: " << datas.size(); + { + // sst文件写入必须有序 + uint64_t show_begin = butil::gettimeofday_ms(); + std::sort(datas.begin(), + datas.end(), + [](const DataType& a, const DataType& b) { + return a->first < b->first; + }); + VLOG(0) << "sort shard " << shard_id << ": " + << butil::gettimeofday_ms() - show_begin + << " ms, num: " << datas.size(); + } + + //必须做空判断,否则sst_writer.Finish会core掉 + if (datas.size() != 0) { + rocksdb::SstFileWriter sst_writer(rocksdb::EnvOptions(), options); + std::string filename = + paddle::string::format_string("%s_%d/cache-%05d.sst", + FLAGS_rocksdb_path.c_str(), + shard_id, + cache_table_count); + rocksdb::Status status = sst_writer.Open(filename); + if (!status.ok()) { + VLOG(0) << "sst writer open " << filename << "failed" + << ", " << status.getState(); + abort(); + } + VLOG(0) << "sst writer open " << filename; + + uint64_t show_begin = butil::gettimeofday_ms(); + for (auto& data : datas) { + uint64_t tmp_key = data->first; + FixedFeatureValue& tmp_value = + *((FixedFeatureValue*)(void*)(data->second)); + status = sst_writer.Put( + rocksdb::Slice((char*)(&(tmp_key)), sizeof(uint64_t)), + rocksdb::Slice((char*)(tmp_value.data()), + tmp_value.size() * sizeof(float))); + if (!status.ok()) { + VLOG(0) << "fatal in Put file: " << filename << ", " + << status.getState(); + abort(); + } + } + status = sst_writer.Finish(); + if (!status.ok()) { + VLOG(0) << "fatal in finish file: " << filename << ", " + << status.getState(); + abort(); + } + VLOG(0) << "write sst_file shard " << shard_id << ": " + << butil::gettimeofday_ms() - show_begin << " ms"; + int ret = _db->ingest_externel_file(shard_id, {filename}); + if (ret) { + VLOG(0) << "ingest file failed" + << ", " << status.getState(); + abort(); + } + } + + for (auto it = shard.begin(); it != shard.end();) { + if (!_value_accesor->SaveMemCache( + it.value().data(), 0, show_threshold, pass_id)) { + it = shard.erase(it); + } else { + ++it; + } + } + } + return 0; + }); + tasks.push_back(std::move(fut)); + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + tasks.clear(); + + VLOG(0) << "Table>> cache ssd count: " << count.load(); + VLOG(0) << "Table>> after update, mem feasign size:" << LocalSize(); + // release_table_mutex(); + return 0; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index 55a05bbab5ec2..6fe13082240d7 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -38,7 +38,11 @@ class SSDSparseTable : public MemorySparseTable { int32_t Push(TableContext& context) override; int32_t PullSparse(float* pull_values, const uint64_t* keys, size_t num); - int32_t PullSparsePtr(char** pull_values, const uint64_t* keys, size_t num); + int32_t PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* keys, + size_t num, + uint16_t pass_id); int32_t PushSparse(const uint64_t* keys, const float* values, size_t num); int32_t PushSparse(const uint64_t* keys, const float** values, size_t num); @@ -77,10 +81,15 @@ class SSDSparseTable : public MemorySparseTable { const std::string& param); int64_t LocalSize(); + std::pair PrintTableStat() override; + + 
int32_t CacheTable(uint16_t pass_id) override; + private: RocksDBHandler* _db; int64_t _cache_tk_size; double _local_show_threshold{0.0}; + std::vector> _fs_channel; }; } // namespace distributed diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index aee707712f662..f07a3f2132217 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ b/paddle/fluid/distributed/ps/table/table.h @@ -62,6 +62,8 @@ struct TableContext { size_t num; bool use_ptr = false; uint32_t trainer_id; // for GEO and global step + int shard_id; // for gpups + uint16_t pass_id; // for gpups ssd }; class Table { @@ -147,6 +149,7 @@ class Table { virtual void *GetShard(size_t shard_idx) = 0; virtual std::pair PrintTableStat() { return {0, 0}; } + virtual int32_t CacheTable(uint16_t pass_id) { return 0; } // for patch model virtual void Revert() {} diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index 5df74883f9247..db0dcf0605dc7 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -748,6 +748,17 @@ void FleetWrapper::PrintTableStat(const uint64_t table_id) { } } +void FleetWrapper::SaveCacheTable(const uint64_t table_id, + uint16_t pass_id, + size_t threshold) { + auto ret = worker_ptr_->SaveCacheTable(table_id, pass_id, threshold); + ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "save cache table stat failed"; + } +} + void FleetWrapper::ShrinkSparseTable(int table_id, int threshold) { auto ret = worker_ptr_->Shrink(table_id, std::to_string(threshold)); ret.wait(); diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h old mode 100755 new mode 100644 index 28347b3502707..5065fb380a346 --- a/paddle/fluid/distributed/ps/wrapper/fleet.h +++ b/paddle/fluid/distributed/ps/wrapper/fleet.h @@ -242,6 +242,9 @@ class FleetWrapper { void BarrierWithTable(uint32_t barrier_type); void PrintTableStat(const uint64_t table_id); + void SaveCacheTable(const uint64_t table_id, + uint16_t pass_id, + size_t threshold); // mode = 0, load all feature // mode = 1, load delta feature, which means load diff void LoadModel(const std::string& path, const int mode); diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 47dc8abf402b9..fe6c51b87228a 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -353,7 +353,7 @@ class ChannelReader { } if (cursor_ >= buffer_.size()) { cursor_ = 0; - if (channel_->read(buffer_) == 0) { + if (channel_->Read(buffer_) == 0) { failed_ = true; return *this; } diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index c88c91f166112..0bad79fdcb3d2 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -2118,6 +2118,10 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { } } +void SlotRecordInMemoryDataFeed::InitGraphResource() { + gpu_graph_data_generator_.AllocResource(thread_id_, feed_vec_); +} + void SlotRecordInMemoryDataFeed::LoadIntoMemory() { VLOG(3) << "SlotRecord LoadIntoMemory() begin, thread_id=" << thread_id_; if (!so_parser_name_.empty()) { @@ -2654,7 +2658,7 @@ bool SlotRecordInMemoryDataFeed::Start() { pack_ = BatchGpuPackMgr().get(this->GetPlace(), used_slots_info_); #endif #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) - 
gpu_graph_data_generator_.AllocResource(this->place_, feed_vec_); + gpu_graph_data_generator_.SetFeedVec(feed_vec_); #endif return true; } @@ -2696,6 +2700,12 @@ int SlotRecordInMemoryDataFeed::Next() { #endif } +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) +void SlotRecordInMemoryDataFeed::DoWalk() { + gpu_graph_data_generator_.DoWalk(); +} +#endif + #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) { int offset_cols_size = (ins_num + 1); diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index 3c4f2c5bbc74d..074fa407c5b51 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -24,9 +24,14 @@ limitations under the License. */ #include #include "cub/cub.cuh" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" +#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" DECLARE_bool(enable_opt_get_features); +DECLARE_int32(gpugraph_storage_mode); +DECLARE_double(gpugraph_hbm_table_load_factor); namespace paddle { namespace framework { @@ -35,6 +40,11 @@ namespace framework { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) +#define DEBUG_STATE(state) \ + VLOG(2) << "left: " << state->left << " right: " << state->right \ + << " central_word: " << state->central_word \ + << " step: " << state->step << " cursor: " << state->cursor \ + << " len: " << state->len << " row_num: " << state->row_num; \ // CUDA: use 512 threads per block const int CUDA_NUM_THREADS = 512; // CUDA: number of blocks for threads. 
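For context on the data_feed.cu hunks that follow: CUDA_NUM_THREADS fixes the block size at 512 and CUDA_KERNEL_LOOP (visible in the context above) is a grid-stride loop, so a kernel stays correct for any element count regardless of how many blocks are launched. Below is a minimal, self-contained sketch of that launch pattern, assuming nothing beyond the CUDA runtime; GetBlocks, FillOnesKernel and FillOnes are illustrative names, not symbols from this patch, and the constant fill merely mirrors the spirit of kernels such as GraphFillCVMKernel.

#include <cstdint>
#include <cuda_runtime.h>

constexpr int kCudaNumThreads = 512;   // mirrors CUDA_NUM_THREADS above

// Illustrative helper: the usual ceiling division to pick a grid size.
inline int GetBlocks(int n) {
  return (n + kCudaNumThreads - 1) / kCudaNumThreads;
}

// Grid-stride loop: every thread strides over the range, so the kernel
// covers all n elements even when n exceeds gridDim.x * blockDim.x.
__global__ void FillOnesKernel(int64_t* out, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[i] = 1;
  }
}

void FillOnes(int64_t* d_out, int n, cudaStream_t stream) {
  FillOnesKernel<<<GetBlocks(n), kCudaNumThreads, 0, stream>>>(d_out, n);
}

Taking the stream as an explicit launch argument is the same design choice that lets this patch keep batch assembly on train_stream_ and graph sampling on sample_stream_ without implicit synchronization between the two.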
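Stepping back to the SSDSparseTable Save/Load rework earlier in this patch: the per-shard write_channel retry loops are replaced with a pipelined design in which threads stream serialized records through paddle::framework::Channel objects (ChannelWriter's operator<< on the producer side, ChannelReader's operator>> on the consumer side, with Flush/Close to finish), while OMP workers parse each line and route it either into the in-memory shard or into RocksDB via put_batch. The following is a hedged, framework-free sketch of that producer/consumer shape using a plain bounded queue rather than the Paddle Channel API; BoundedQueue, Put, Get, Close and the driver in main are all illustrative, not patch symbols.

#include <condition_variable>
#include <cstddef>
#include <mutex>
#include <queue>
#include <string>
#include <thread>

// Minimal bounded queue standing in for the channel: a producer thread
// pushes serialized lines, a consumer thread drains and parses them.
class BoundedQueue {
 public:
  explicit BoundedQueue(size_t cap) : cap_(cap) {}

  void Put(std::string line) {
    std::unique_lock<std::mutex> lk(mu_);
    not_full_.wait(lk, [this] { return q_.size() < cap_; });
    q_.push(std::move(line));
    not_empty_.notify_one();
  }

  // Returns false once the queue is closed and fully drained.
  bool Get(std::string* out) {
    std::unique_lock<std::mutex> lk(mu_);
    not_empty_.wait(lk, [this] { return !q_.empty() || closed_; });
    if (q_.empty()) return false;
    *out = std::move(q_.front());
    q_.pop();
    not_full_.notify_one();
    return true;
  }

  void Close() {
    std::lock_guard<std::mutex> lk(mu_);
    closed_ = true;
    not_empty_.notify_all();
  }

 private:
  std::mutex mu_;
  std::condition_variable not_full_, not_empty_;
  std::queue<std::string> q_;
  const size_t cap_;
  bool closed_ = false;
};

int main() {
  BoundedQueue ch(30000);  // capacity mirrors the MakeChannel(30000) calls

  // Producer: in the patch, the per-shard load thread reading lines from AFS.
  std::thread producer([&ch] {
    for (int i = 0; i < 100000; ++i) ch.Put(std::to_string(i) + "\t1 2 3");
    ch.Close();
  });

  // Consumer: in the patch, the OMP worker that parses each line and routes
  // it to the in-memory shard or to RocksDB with put_batch.
  size_t parsed = 0;
  std::string line;
  while (ch.Get(&line)) ++parsed;

  producer.join();
  return parsed == 100000 ? 0 : 1;
}

The point of the bounded capacity is back-pressure: the file readers cannot run arbitrarily far ahead of the parsers, so memory stays bounded while I/O and parsing overlap.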
@@ -207,13 +217,13 @@ __global__ void CopyDuplicateKeys(int64_t *dist_tensor, int GraphDataGenerator::AcquireInstance(BufState *state) { // if (state->GetNextStep()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } else if (state->GetNextCentrolWord()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } else if (state->GetNextBatch()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } return 0; @@ -338,64 +348,152 @@ __global__ void GraphFillSlotLodKernel(int64_t *id_tensor, int len) { CUDA_KERNEL_LOOP(idx, len) { id_tensor[idx] = idx; } } -int GraphDataGenerator::FillInsBuf() { - if (ins_buf_pair_len_ >= batch_size_) { - return batch_size_; +int GraphDataGenerator::FillIdShowClkTensor(int total_instance, + bool gpu_graph_training, + size_t cursor) { + id_tensor_ptr_ = + feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); + show_tensor_ptr_ = + feed_vec_[1]->mutable_data({total_instance}, this->place_); + clk_tensor_ptr_ = + feed_vec_[2]->mutable_data({total_instance}, this->place_); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + if (gpu_graph_training) { + uint64_t *ins_cursor, *ins_buf; + ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; + cudaMemcpyAsync(id_tensor_ptr_, + ins_cursor, + sizeof(uint64_t) * total_instance, + cudaMemcpyDeviceToDevice, + train_stream_); + } else { + uint64_t *d_type_keys = + reinterpret_cast(d_device_keys_[cursor]->ptr()); + d_type_keys += infer_node_start_; + infer_node_start_ += total_instance / 2; + CopyDuplicateKeys<<>>( + id_tensor_ptr_, d_type_keys, total_instance / 2); } - int total_instance = AcquireInstance(&buf_state_); - VLOG(2) << "total_ins: " << total_instance; - buf_state_.Debug(); + GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); + GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); + return 0; +} - if (total_instance == 0) { - int res = FillWalkBuf(d_walk_); - if (!res) { - // graph iterate complete - return -1; - } else { - total_instance = buf_state_.len; - VLOG(2) << "total_ins: " << total_instance; - buf_state_.Debug(); - // if (total_instance == 0) { - // return -1; - //} +int GraphDataGenerator::FillGraphSlotFeature(int total_instance, + bool gpu_graph_training) { + int64_t *slot_tensor_ptr_[slot_num_]; + int64_t *slot_lod_tensor_ptr_[slot_num_]; + for (int i = 0; i < slot_num_; ++i) { + slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data( + {total_instance, 1}, this->place_); + slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data( + {total_instance + 1}, this->place_); + } + uint64_t *ins_cursor, *ins_buf; + if (gpu_graph_training) { + ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; + } else { + id_tensor_ptr_ = + feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); + ins_cursor = (uint64_t *)id_tensor_ptr_; + } + + cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(), + slot_tensor_ptr_, + sizeof(uint64_t *) * slot_num_, + cudaMemcpyHostToDevice, + train_stream_); + cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(), + slot_lod_tensor_ptr_, + sizeof(uint64_t *) * slot_num_, + cudaMemcpyHostToDevice, + train_stream_); + uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); + FillFeatureBuf(ins_cursor, feature_buf, total_instance); + GraphFillSlotKernel<<>>((uint64_t *)d_slot_tensor_ptr_->ptr(), + feature_buf, + total_instance * slot_num_, + total_instance, + slot_num_); + GraphFillSlotLodKernelOpt<<>>( + (uint64_t 
*)d_slot_lod_tensor_ptr_->ptr(), + (total_instance + 1) * slot_num_, + total_instance + 1); + if (debug_mode_) { + uint64_t h_walk[total_instance]; + cudaMemcpy(h_walk, + ins_cursor, + total_instance * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + uint64_t h_feature[total_instance * slot_num_]; + cudaMemcpy(h_feature, + feature_buf, + total_instance * slot_num_ * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int i = 0; i < total_instance; ++i) { + std::stringstream ss; + for (int j = 0; j < slot_num_; ++j) { + ss << h_feature[i * slot_num_ + j] << " "; + } + VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i + << "] = " << (uint64_t)h_walk[i] << " feature[" << i * slot_num_ + << ".." << (i + 1) * slot_num_ << "] = " << ss.str(); } - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - FillFeatureBuf(d_walk_, d_feature_); - if (debug_mode_) { - int len = buf_size_ > 5000 ? 5000 : buf_size_; - uint64_t h_walk[len]; - cudaMemcpy(h_walk, - d_walk_->ptr(), - len * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - uint64_t h_feature[len * slot_num_]; - cudaMemcpy(h_feature, - d_feature_->ptr(), - len * slot_num_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int i = 0; i < len; ++i) { - std::stringstream ss; - for (int j = 0; j < slot_num_; ++j) { - ss << h_feature[i * slot_num_ + j] << " "; - } - VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i - << "] = " << (uint64_t)h_walk[i] << " feature[" - << i * slot_num_ << ".." << (i + 1) * slot_num_ - << "] = " << ss.str(); - } + uint64_t h_slot_tensor[slot_num_][total_instance]; + uint64_t h_slot_lod_tensor[slot_num_][total_instance + 1]; + for (int i = 0; i < slot_num_; ++i) { + cudaMemcpy(h_slot_tensor[i], + slot_tensor_ptr_[i], + total_instance * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + int len = total_instance > 5000 ? 5000 : total_instance; + for (int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i << "][" << j + << "] = " << h_slot_tensor[i][j]; + } + + cudaMemcpy(h_slot_lod_tensor[i], + slot_lod_tensor_ptr_[i], + (total_instance + 1) * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + len = total_instance + 1 > 5000 ? 
5000 : total_instance + 1; + for (int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i << "][" << j + << "] = " << h_slot_lod_tensor[i][j]; } } } + return 0; +} +int GraphDataGenerator::MakeInsPair() { uint64_t *walk = reinterpret_cast(d_walk_->ptr()); uint64_t *ins_buf = reinterpret_cast(d_ins_buf_->ptr()); int *random_row = reinterpret_cast(d_random_row_->ptr()); int *d_pair_num = reinterpret_cast(d_pair_num_->ptr()); - cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); + cudaMemsetAsync(d_pair_num, 0, sizeof(int), train_stream_); int len = buf_state_.len; - GraphFillIdKernel<<>>( + // make pair + GraphFillIdKernel<<>>( ins_buf + ins_buf_pair_len_ * 2, d_pair_num, walk, @@ -405,29 +503,12 @@ int GraphDataGenerator::FillInsBuf() { len, walk_len_); int h_pair_num; - cudaMemcpyAsync( - &h_pair_num, d_pair_num, sizeof(int), cudaMemcpyDeviceToHost, stream_); - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); - uint64_t *feature = reinterpret_cast(d_feature_->ptr()); - cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); - int len = buf_state_.len; - VLOG(2) << "feature_buf start[" << ins_buf_pair_len_ * 2 * slot_num_ - << "] len[" << len << "]"; - GraphFillFeatureKernel<<>>( - feature_buf + ins_buf_pair_len_ * 2 * slot_num_, - d_pair_num, - walk, - feature, - random_row + buf_state_.cursor, - buf_state_.central_word, - window_step_[buf_state_.step], - len, - walk_len_, - slot_num_); - } - - cudaStreamSynchronize(stream_); + cudaMemcpyAsync(&h_pair_num, + d_pair_num, + sizeof(int), + cudaMemcpyDeviceToHost, + train_stream_); + cudaStreamSynchronize(train_stream_); ins_buf_pair_len_ += h_pair_num; if (debug_mode_) { @@ -441,70 +522,41 @@ int GraphDataGenerator::FillInsBuf() { for (int xx = 0; xx < 2 * ins_buf_pair_len_; xx++) { VLOG(2) << "h_ins_buf[" << xx << "]: " << h_ins_buf[xx]; } - delete[] h_ins_buf; - - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - uint64_t *feature_buf = - reinterpret_cast(d_feature_buf_->ptr()); - uint64_t h_feature_buf[(batch_size_ * 2 * 2) * slot_num_]; - cudaMemcpy(h_feature_buf, - feature_buf, - (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int xx = 0; xx < (batch_size_ * 2 * 2) * slot_num_; xx++) { - VLOG(2) << "h_feature_buf[" << xx << "]: " << h_feature_buf[xx]; - } - } } return ins_buf_pair_len_; } +int GraphDataGenerator::FillInsBuf() { + if (ins_buf_pair_len_ >= batch_size_) { + return batch_size_; + } + int total_instance = AcquireInstance(&buf_state_); + + VLOG(2) << "total_ins: " << total_instance; + buf_state_.Debug(); + + if (total_instance == 0) { + return -1; + } + return MakeInsPair(); +} + int GraphDataGenerator::GenerateBatch() { int total_instance = 0; platform::CUDADeviceGuard guard(gpuid_); int res = 0; + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); if (!gpu_graph_training_) { - while (cursor_ < h_device_keys_.size()) { - size_t device_key_size = h_device_keys_[cursor_]->size(); - if (infer_node_type_start_[cursor_] >= device_key_size) { - cursor_++; - continue; - } - total_instance = - (infer_node_type_start_[cursor_] + batch_size_ <= device_key_size) - ? 
batch_size_ - : device_key_size - infer_node_type_start_[cursor_]; - uint64_t *d_type_keys = - reinterpret_cast(d_device_keys_[cursor_]->ptr()); - d_type_keys += infer_node_type_start_[cursor_]; - infer_node_type_start_[cursor_] += total_instance; - VLOG(1) << "in graph_data generator:batch_size = " << batch_size_ - << " instance = " << total_instance; - total_instance *= 2; - id_tensor_ptr_ = feed_vec_[0]->mutable_data({total_instance, 1}, - this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({total_instance}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({total_instance}, this->place_); - CopyDuplicateKeys<<>>( - id_tensor_ptr_, d_type_keys, total_instance / 2); - GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); - GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); - break; - } + total_instance = (infer_node_start_ + batch_size_ <= infer_node_end_) + ? batch_size_ + : infer_node_end_ - infer_node_start_; + VLOG(1) << "in graph_data generator:batch_size = " << batch_size_ + << " instance = " << total_instance; + total_instance *= 2; if (total_instance == 0) { return 0; } + FillIdShowClkTensor(total_instance, gpu_graph_training_, cursor_); } else { while (ins_buf_pair_len_ < batch_size_) { res = FillInsBuf(); @@ -518,132 +570,15 @@ int GraphDataGenerator::GenerateBatch() { } total_instance = ins_buf_pair_len_ < batch_size_ ? ins_buf_pair_len_ : batch_size_; - total_instance *= 2; - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({total_instance}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({total_instance}, this->place_); - } - - int64_t *slot_tensor_ptr_[slot_num_]; - int64_t *slot_lod_tensor_ptr_[slot_num_]; - if (slot_num_ > 0) { - for (int i = 0; i < slot_num_; ++i) { - slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data( - {total_instance, 1}, this->place_); - slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data( - {total_instance + 1}, this->place_); - } - if (FLAGS_enable_opt_get_features || !gpu_graph_training_) { - cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(), - slot_tensor_ptr_, - sizeof(uint64_t *) * slot_num_, - cudaMemcpyHostToDevice, - stream_); - cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(), - slot_lod_tensor_ptr_, - sizeof(uint64_t *) * slot_num_, - cudaMemcpyHostToDevice, - stream_); - } - } - - uint64_t *ins_cursor, *ins_buf; - if (gpu_graph_training_) { VLOG(2) << "total_instance: " << total_instance << ", ins_buf_pair_len = " << ins_buf_pair_len_; - // uint64_t *ins_buf = reinterpret_cast(d_ins_buf_->ptr()); - // uint64_t *ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; - ins_buf = reinterpret_cast(d_ins_buf_->ptr()); - ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; - cudaMemcpyAsync(id_tensor_ptr_, - ins_cursor, - sizeof(uint64_t) * total_instance, - cudaMemcpyDeviceToDevice, - stream_); - - GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); - GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); - } else { - ins_cursor = (uint64_t *)id_tensor_ptr_; + FillIdShowClkTensor(total_instance, gpu_graph_training_); } if (slot_num_ > 0) { - uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); - if (FLAGS_enable_opt_get_features || !gpu_graph_training_) { - FillFeatureBuf(ins_cursor, feature_buf, total_instance); - // FillFeatureBuf(id_tensor_ptr_, feature_buf, total_instance); - if (debug_mode_) { - uint64_t h_walk[total_instance]; - cudaMemcpy(h_walk, - 
ins_cursor, - total_instance * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - uint64_t h_feature[total_instance * slot_num_]; - cudaMemcpy(h_feature, - feature_buf, - total_instance * slot_num_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int i = 0; i < total_instance; ++i) { - std::stringstream ss; - for (int j = 0; j < slot_num_; ++j) { - ss << h_feature[i * slot_num_ + j] << " "; - } - VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i - << "] = " << (uint64_t)h_walk[i] << " feature[" - << i * slot_num_ << ".." << (i + 1) * slot_num_ - << "] = " << ss.str(); - } - } - - GraphFillSlotKernel<<>>((uint64_t *)d_slot_tensor_ptr_->ptr(), - feature_buf, - total_instance * slot_num_, - total_instance, - slot_num_); - GraphFillSlotLodKernelOpt<<>>( - (uint64_t *)d_slot_lod_tensor_ptr_->ptr(), - (total_instance + 1) * slot_num_, - total_instance + 1); - } else { - for (int i = 0; i < slot_num_; ++i) { - int feature_buf_offset = - (ins_buf_pair_len_ * 2 - total_instance) * slot_num_ + i * 2; - for (int j = 0; j < total_instance; j += 2) { - VLOG(2) << "slot_tensor[" << i << "][" << j << "] <- feature_buf[" - << feature_buf_offset + j * slot_num_ << "]"; - VLOG(2) << "slot_tensor[" << i << "][" << j + 1 << "] <- feature_buf[" - << feature_buf_offset + j * slot_num_ + 1 << "]"; - cudaMemcpyAsync(slot_tensor_ptr_[i] + j, - &feature_buf[feature_buf_offset + j * slot_num_], - sizeof(uint64_t) * 2, - cudaMemcpyDeviceToDevice, - stream_); - } - GraphFillSlotLodKernel<<>>(slot_lod_tensor_ptr_[i], - total_instance + 1); - } - } + FillGraphSlotFeature(total_instance, gpu_graph_training_); } - offset_.clear(); offset_.push_back(0); offset_.push_back(total_instance); @@ -655,34 +590,9 @@ int GraphDataGenerator::GenerateBatch() { } } - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(train_stream_); if (!gpu_graph_training_) return 1; ins_buf_pair_len_ -= total_instance / 2; - if (debug_mode_) { - uint64_t h_slot_tensor[slot_num_][total_instance]; - uint64_t h_slot_lod_tensor[slot_num_][total_instance + 1]; - for (int i = 0; i < slot_num_; ++i) { - cudaMemcpy(h_slot_tensor[i], - slot_tensor_ptr_[i], - total_instance * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - int len = total_instance > 5000 ? 5000 : total_instance; - for (int j = 0; j < len; ++j) { - VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i << "][" << j - << "] = " << h_slot_tensor[i][j]; - } - - cudaMemcpy(h_slot_lod_tensor[i], - slot_lod_tensor_ptr_[i], - (total_instance + 1) * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - len = total_instance + 1 > 5000 ? 
5000 : total_instance + 1; - for (int j = 0; j < len; ++j) { - VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i << "][" << j - << "] = " << h_slot_lod_tensor[i][j]; - } - } - } return 1; } @@ -750,6 +660,66 @@ __global__ void GraphFillFirstStepKernel(int *prefix_sum, } } +__global__ void GetUniqueFeaNum(uint64_t *d_in, + uint64_t *unique_num, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ uint64_t local_num; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len - 1) { + if (d_in[i] != d_in[i + 1]) { + atomicAdd(&local_num, 1); + } + } + if (i == len - 1) { + atomicAdd(&local_num, 1); + } + + __syncthreads(); + if (threadIdx.x == 0) { + atomicAdd(unique_num, local_num); + } +} + +__global__ void UniqueFeature(uint64_t *d_in, + uint64_t *d_out, + uint64_t *unique_num, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ uint64_t local_key[CUDA_NUM_THREADS]; + __shared__ uint64_t local_num; + __shared__ uint64_t global_num; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len - 1) { + if (d_in[i] != d_in[i + 1]) { + size_t dst = atomicAdd(&local_num, 1); + local_key[dst] = d_in[i]; + } + } + if (i == len - 1) { + size_t dst = atomicAdd(&local_num, 1); + local_key[dst] = d_in[i]; + } + + __syncthreads(); + + if (threadIdx.x == 0) { + global_num = atomicAdd(unique_num, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + d_out[global_num + threadIdx.x] = local_key[threadIdx.x]; + } +} // Fill sample_res to the stepth column of walk void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, uint64_t *walk, @@ -773,45 +743,50 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, d_actual_sample_size, d_prefix_sum + 1, len, - stream_)); - auto d_temp_storage = memory::Alloc(place_, temp_storage_bytes); + sample_stream_)); + auto d_temp_storage = memory::Alloc( + place_, + temp_storage_bytes, + phi::Stream(reinterpret_cast(sample_stream_))); CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), temp_storage_bytes, d_actual_sample_size, d_prefix_sum + 1, len, - stream_)); + sample_stream_)); - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); if (step == 1) { - GraphFillFirstStepKernel<<>>( - d_prefix_sum, - d_tmp_sampleidx2row, - walk, - d_start_ids, - len, - walk_degree_, - walk_len_, - d_actual_sample_size, - d_neighbors, - d_sample_keys); + GraphFillFirstStepKernel<<>>(d_prefix_sum, + d_tmp_sampleidx2row, + walk, + d_start_ids, + len, + walk_degree_, + walk_len_, + d_actual_sample_size, + d_neighbors, + d_sample_keys); } else { GraphFillSampleKeysKernel<<>>(d_neighbors, - d_sample_keys, - d_prefix_sum, - d_sampleidx2row, - d_tmp_sampleidx2row, - d_actual_sample_size, - cur_degree, - len); - - GraphDoWalkKernel<<>>( + sample_stream_>>>(d_neighbors, + d_sample_keys, + d_prefix_sum, + d_sampleidx2row, + d_tmp_sampleidx2row, + d_actual_sample_size, + cur_degree, + len); + + GraphDoWalkKernel<<>>( d_neighbors, walk, d_prefix_sum, @@ -828,7 +803,6 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, int *h_prefix_sum = new int[len + 1]; int *h_actual_size = new int[len]; int *h_offset2idx = new int[once_max_sample_keynum]; - uint64_t h_sample_keys[once_max_sample_keynum]; cudaMemcpy(h_offset2idx, d_tmp_sampleidx2row, once_max_sample_keynum * sizeof(int), @@ -847,9 +821,8 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, delete[] h_prefix_sum; delete[] h_actual_size; delete[] 
h_offset2idx; - delete[] h_sample_keys; } - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); cur_sampleidx2row_ = 1 - cur_sampleidx2row_; } @@ -878,7 +851,155 @@ int GraphDataGenerator::FillFeatureBuf( return ret; } -int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { +// 尝试插入table, 0表示插入成功 +int GraphDataGenerator::InsertTable( + const unsigned long *d_keys, + unsigned long len, + std::shared_ptr d_uniq_node_num) { + uint64_t h_uniq_node_num = 0; + uint64_t *d_uniq_node_num_ptr = + reinterpret_cast(d_uniq_node_num->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num_ptr, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + // 产生了足够多的node,采样结束 + VLOG(2) << "table capcity: " << train_table_cap_ << ", " << h_uniq_node_num + << " used"; + if (h_uniq_node_num + len >= train_table_cap_) { + return 1; + } + table_->insert(d_keys, len, d_uniq_node_num_ptr, sample_stream_); + CUDA_CHECK(cudaStreamSynchronize(sample_stream_)); + return 0; +} + +std::shared_ptr GraphDataGenerator::GetTableKeys( + std::shared_ptr d_uniq_node_num, + uint64_t &h_uniq_node_num) { + uint64_t *d_uniq_node_num_ptr = + reinterpret_cast(d_uniq_node_num->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num_ptr, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + + auto d_uniq_node = memory::AllocShared( + place_, + h_uniq_node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + uint64_t *d_uniq_node_ptr = reinterpret_cast(d_uniq_node->ptr()); + + auto d_node_cursor = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + + uint64_t *d_node_cursor_ptr = + reinterpret_cast(d_node_cursor->ptr()); + cudaMemsetAsync(d_node_cursor_ptr, 0, sizeof(uint64_t), sample_stream_); + table_->get_keys(d_uniq_node_ptr, d_node_cursor_ptr, sample_stream_); + + cudaStreamSynchronize(sample_stream_); + return d_uniq_node; +} + +void GraphDataGenerator::CopyFeaFromTable( + std::shared_ptr d_uniq_fea_num) { + uint64_t h_uniq_fea_num = 0; + auto d_uniq_fea = GetTableKeys(d_uniq_fea_num, h_uniq_fea_num); + uint64_t *d_uniq_fea_ptr = reinterpret_cast(d_uniq_fea->ptr()); + size_t host_vec_size = host_vec_.size(); + host_vec_.resize(host_vec_size + h_uniq_fea_num); + + VLOG(0) << "uniq feature num: " << h_uniq_fea_num; + + cudaMemcpyAsync(host_vec_.data() + host_vec_size, + d_uniq_fea_ptr, + sizeof(uint64_t) * h_uniq_fea_num, + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); +} + +void GraphDataGenerator::DoWalk() { + int device_id = place_.GetDeviceId(); + debug_gpu_memory_info(device_id, "DoWalk start"); + if (gpu_graph_training_) { + FillWalkBuf(); + } else { + FillInferBuf(); + } + debug_gpu_memory_info(device_id, "DoWalk end"); +} + +void GraphDataGenerator::clear_gpu_mem() { + d_len_per_row_.reset(); + d_sample_keys_.reset(); + d_prefix_sum_.reset(); + for (size_t i = 0; i < d_sampleidx2rows_.size(); i++) { + d_sampleidx2rows_[i].reset(); + } + delete table_; +} + +int GraphDataGenerator::FillInferBuf() { + platform::CUDADeviceGuard guard(gpuid_); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto &global_infer_node_type_start = + gpu_graph_ptr->global_infer_node_type_start_[gpuid_]; + auto &infer_cursor = gpu_graph_ptr->infer_cursor_[thread_id_]; + total_row_ = 0; + if (infer_cursor < h_device_keys_len_.size()) { + if (global_infer_node_type_start[infer_cursor] >= + 
h_device_keys_len_[infer_cursor]) { + infer_cursor++; + if (infer_cursor >= h_device_keys_len_.size()) { + return 0; + } + } + size_t device_key_size = h_device_keys_len_[infer_cursor]; + total_row_ = + (global_infer_node_type_start[infer_cursor] + infer_table_cap_ <= + device_key_size) + ? infer_table_cap_ + : device_key_size - global_infer_node_type_start[infer_cursor]; + + host_vec_.resize(total_row_); + uint64_t *d_type_keys = + reinterpret_cast(d_device_keys_[infer_cursor]->ptr()); + cudaMemcpyAsync(host_vec_.data(), + d_type_keys + global_infer_node_type_start[infer_cursor], + sizeof(uint64_t) * total_row_, + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + VLOG(1) << "cursor: " << infer_cursor + << " start: " << global_infer_node_type_start[infer_cursor] + << " num: " << total_row_; + infer_node_start_ = global_infer_node_type_start[infer_cursor]; + global_infer_node_type_start[infer_cursor] += total_row_; + infer_node_end_ = global_infer_node_type_start[infer_cursor]; + cursor_ = infer_cursor; + } + return 0; +} + +void GraphDataGenerator::ClearSampleState() { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto &finish_node_type = gpu_graph_ptr->finish_node_type_[gpuid_]; + auto &node_type_start = gpu_graph_ptr->node_type_start_[gpuid_]; + finish_node_type.clear(); + for (auto iter = node_type_start.begin(); iter != node_type_start.end(); iter++) { + iter->second = 0; + } +} + +int GraphDataGenerator::FillWalkBuf() { platform::CUDADeviceGuard guard(gpuid_); size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; //////// @@ -896,30 +1017,42 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { } /////// auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - uint64_t *walk = reinterpret_cast(d_walk->ptr()); + uint64_t *walk = reinterpret_cast(d_walk_->ptr()); int *len_per_row = reinterpret_cast(d_len_per_row_->ptr()); uint64_t *d_sample_keys = reinterpret_cast(d_sample_keys_->ptr()); - cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), stream_); - cudaMemsetAsync( - len_per_row, 0, once_max_sample_keynum * sizeof(int), stream_); + cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), sample_stream_); + // cudaMemsetAsync( + // len_per_row, 0, once_max_sample_keynum * sizeof(int), sample_stream_); + int sample_times = 0; int i = 0; - int total_row = 0; - size_t node_type_len = first_node_type_.size(); + total_row_ = 0; + + // 获取全局采样状态 + auto &first_node_type = gpu_graph_ptr->first_node_type_; + auto &meta_path = gpu_graph_ptr->meta_path_; + auto &node_type_start = gpu_graph_ptr->node_type_start_[gpuid_]; + auto &finish_node_type = gpu_graph_ptr->finish_node_type_[gpuid_]; + auto &type_to_index = gpu_graph_ptr->get_graph_type_to_index(); + auto &cursor = gpu_graph_ptr->cursor_[thread_id_]; + size_t node_type_len = first_node_type.size(); int remain_size = buf_size_ - walk_degree_ * once_sample_startid_len_ * walk_len_; + int total_samples = 0; while (i <= remain_size) { - int cur_node_idx = cursor_ % node_type_len; - int node_type = first_node_type_[cur_node_idx]; - auto &path = meta_path_[cur_node_idx]; - size_t start = node_type_start_[node_type]; + int cur_node_idx = cursor % node_type_len; + int node_type = first_node_type[cur_node_idx]; + auto &path = meta_path[cur_node_idx]; + size_t start = node_type_start[node_type]; + VLOG(2) << "cur_node_idx = " << cur_node_idx + << " meta_path.size = " << meta_path.size(); // auto node_query_result = gpu_graph_ptr->query_node_list( - // gpuid_, node_type, start, 
once_sample_startid_len_); + // gpuid_, node_type, start, once_sample_startid_len_); // int tmp_len = node_query_result.actual_sample_size; VLOG(2) << "choose start type: " << node_type; - int type_index = type_to_index_[node_type]; - size_t device_key_size = h_device_keys_[type_index]->size(); + int type_index = type_to_index[node_type]; + size_t device_key_size = h_device_keys_len_[type_index]; VLOG(2) << "type: " << node_type << " size: " << device_key_size << " start: " << start; uint64_t *d_type_keys = @@ -927,21 +1060,19 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { int tmp_len = start + once_sample_startid_len_ > device_key_size ? device_key_size - start : once_sample_startid_len_; - node_type_start_[node_type] = tmp_len + start; + bool update = true; if (tmp_len == 0) { - finish_node_type_.insert(node_type); - if (finish_node_type_.size() == node_type_start_.size()) { + finish_node_type.insert(node_type); + if (finish_node_type.size() == node_type_start.size()) { + cursor = 0; + epoch_finish_ = true; break; } - cursor_ += 1; + cursor += 1; continue; } - // if (tmp_len == 0) { - // break; - //} - VLOG(2) << "i = " << i << " buf_size_ = " << buf_size_ - << " tmp_len = " << tmp_len << " cursor = " << cursor_ - << " once_max_sample_keynum = " << once_max_sample_keynum; + + VLOG(2) << "gpuid = " << gpuid_ << " path[0] = " << path[0]; uint64_t *cur_walk = walk + i; NeighborSampleQuery q; @@ -955,6 +1086,30 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { int step = 1; VLOG(2) << "sample edge type: " << path[0] << " step: " << 1; jump_rows_ = sample_res.total_sample_size; + total_samples += sample_res.total_sample_size; + VLOG(2) << "i = " << i << " start = " << start << " tmp_len = " << tmp_len + << " cursor = " << node_type << " cur_node_idx = " << cur_node_idx + << " jump row: " << jump_rows_; + VLOG(2) << "jump_row: " << jump_rows_; + if (jump_rows_ == 0) { + node_type_start[node_type] = tmp_len + start; + cursor += 1; + continue; + } + + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + if (InsertTable(d_type_keys + start, tmp_len, d_uniq_node_num_) != 0) { + VLOG(2) << "in step 0, insert key stage, table is full"; + update = false; + break; + } + if (InsertTable(sample_res.actual_val, sample_res.total_sample_size, d_uniq_node_num_) != + 0) { + VLOG(2) << "in step 0, insert sample res stage, table is full"; + update = false; + break; + } + } FillOneStep(d_type_keys + start, cur_walk, tmp_len, @@ -962,7 +1117,6 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { walk_degree_, step, len_per_row); - VLOG(2) << "jump_row: " << jump_rows_; ///////// if (debug_mode_) { cudaMemcpy( @@ -971,11 +1125,16 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } + + VLOG(2) << "sample, step=" << step << " sample_keys=" << tmp_len + << " sample_res_len=" << sample_res.total_sample_size; + ///////// step++; size_t path_len = path.size(); for (; step < walk_len_; step++) { if (sample_res.total_sample_size == 0) { + VLOG(2) << "sample finish, step=" << step; break; } auto sample_key_mem = sample_res.actual_val_mem; @@ -988,11 +1147,20 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { (uint64_t)sample_keys_ptr, 1, sample_res.total_sample_size); + int sample_key_len = sample_res.total_sample_size; sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false); - + total_samples += sample_res.total_sample_size; + if (FLAGS_gpugraph_storage_mode != 
GpuGraphStorageMode::WHOLE_HBM) { + if (InsertTable(sample_res.actual_val, sample_res.total_sample_size, d_uniq_node_num_) != + 0) { + VLOG(2) << "in step: " << step << ", table is full"; + update = false; + break; + } + } FillOneStep(d_type_keys + start, cur_walk, - sample_res.total_sample_size, + sample_key_len, sample_res, 1, step, @@ -1004,34 +1172,44 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } + + VLOG(2) << "sample, step=" << step << " sample_keys=" << sample_key_len + << " sample_res_len=" << sample_res.total_sample_size; + } + // 此时更新全局采样状态 + if (update == true) { + node_type_start[node_type] = tmp_len + start; + i += jump_rows_ * walk_len_; + total_row_ += jump_rows_; + cursor += 1; + sample_times++; + } else { + VLOG(2) << "table is full, not update stat!"; + break; } - // cursor_ += tmp_len; - i += jump_rows_ * walk_len_; - total_row += jump_rows_; - cursor_ += 1; } - buf_state_.Reset(total_row); + buf_state_.Reset(total_row_); int *d_random_row = reinterpret_cast(d_random_row_->ptr()); thrust::random::default_random_engine engine(shuffle_seed_); - const auto &exec_policy = thrust::cuda::par.on(stream_); + const auto &exec_policy = thrust::cuda::par.on(sample_stream_); thrust::counting_iterator cnt_iter(0); thrust::shuffle_copy(exec_policy, cnt_iter, - cnt_iter + total_row, + cnt_iter + total_row_, thrust::device_pointer_cast(d_random_row), engine); - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); shuffle_seed_ = engine(); if (debug_mode_) { - int *h_random_row = new int[total_row + 10]; + int *h_random_row = new int[total_row_ + 10]; cudaMemcpy(h_random_row, d_random_row, - total_row * sizeof(int), + total_row_ * sizeof(int), cudaMemcpyDeviceToHost); - for (int xx = 0; xx < total_row; xx++) { + for (int xx = 0; xx < total_row_; xx++) { VLOG(2) << "h_random_row[" << xx << "]: " << h_random_row[xx]; } delete[] h_random_row; @@ -1041,71 +1219,198 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { delete[] h_len_per_row; delete[] h_prefix_sum; } - return total_row != 0; + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + // table_->prefetch(cudaCpuDeviceId, sample_stream_); + // thrust::pair *kv = table_->data(); + // size_t size = table_->size(); + // uint64_t unused_key = std::numeric_limits::max(); + // for (size_t i = 0; i < size; i++) { + // if (kv[i].first == unused_key) { + // continue; + // } + // host_vec_.push_back(kv[i].first); + // } + + uint64_t h_uniq_node_num = 0; + uint64_t *d_uniq_node_num = + reinterpret_cast(d_uniq_node_num_->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + VLOG(2) << "h_uniq_node_num: " << h_uniq_node_num; + // 临时显存, 存储去重后的nodeid + auto d_uniq_node = memory::AllocShared( + place_, + h_uniq_node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + uint64_t *d_uniq_node_ptr = + reinterpret_cast(d_uniq_node->ptr()); + + auto d_node_cursor = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + + uint64_t *d_node_cursor_ptr = + reinterpret_cast(d_node_cursor->ptr()); + cudaMemsetAsync(d_node_cursor_ptr, 0, sizeof(uint64_t), sample_stream_); + // uint64_t unused_key = std::numeric_limits::max(); + table_->get_keys(d_uniq_node_ptr, d_node_cursor_ptr, sample_stream_); + + cudaStreamSynchronize(sample_stream_); + + 
host_vec_.resize(h_uniq_node_num); + cudaMemcpyAsync(host_vec_.data(), + d_uniq_node_ptr, + sizeof(uint64_t) * h_uniq_node_num, + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + VLOG(2) << "slot_num: " << slot_num_; + if (slot_num_ > 0) { + VLOG(2) << "uniq feature"; + auto d_uniq_fea_num = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync( + d_uniq_fea_num->ptr(), 0, sizeof(uint64_t), sample_stream_); + table_->clear(sample_stream_); + size_t cursor = 0; + size_t batch = 0; + d_feature_list_ = memory::AllocShared( + place_, + once_sample_startid_len_ * slot_num_ * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + uint64_t *d_feature_list_ptr = + reinterpret_cast(d_feature_list_->ptr()); + while (cursor < h_uniq_node_num) { + batch = (cursor + once_sample_startid_len_ <= h_uniq_node_num) + ? once_sample_startid_len_ + : h_uniq_node_num - cursor; + + int ret = gpu_graph_ptr->get_feature_of_nodes(gpuid_, + d_uniq_node_ptr + cursor, + d_feature_list_ptr, + batch, + slot_num_); + if (InsertTable( + d_feature_list_ptr, slot_num_ * batch, d_uniq_fea_num)) { + CopyFeaFromTable(d_uniq_fea_num); + table_->clear(sample_stream_); + cudaMemsetAsync( + d_uniq_fea_num->ptr(), 0, sizeof(uint64_t), sample_stream_); + } + cursor += batch; + } + CopyFeaFromTable(d_uniq_fea_num); + } + + VLOG(0) << "sample_times:" << sample_times + << ", d_walk_size:" << buf_size_ + << ", d_walk_offset:" << i + << ", total_rows:" << total_row_ + << ", total_samples:" << total_samples + << ", h_uniq_node_num:" << h_uniq_node_num; + } + return total_row_ != 0; } -void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, - std::vector feed_vec) { - place_ = place; - gpuid_ = place_.GetDeviceId(); - VLOG(3) << "gpuid " << gpuid_; - stream_ = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); +void GraphDataGenerator::SetFeedVec(std::vector feed_vec) { feed_vec_ = feed_vec; - slot_num_ = (feed_vec_.size() - 3) / 2; - - // d_device_keys_.resize(h_device_keys_.size()); - VLOG(2) << "h_device_keys size: " << h_device_keys_.size(); - infer_node_type_start_ = std::vector(h_device_keys_.size(), 0); - for (size_t i = 0; i < h_device_keys_.size(); i++) { - for (size_t j = 0; j < h_device_keys_[i]->size(); j++) { - VLOG(3) << "h_device_keys_[" << i << "][" << j - << "] = " << (*(h_device_keys_[i]))[j]; - } - auto buf = memory::AllocShared( - place_, h_device_keys_[i]->size() * sizeof(uint64_t)); - d_device_keys_.push_back(buf); - CUDA_CHECK(cudaMemcpyAsync(buf->ptr(), - h_device_keys_[i]->data(), - h_device_keys_[i]->size() * sizeof(uint64_t), - cudaMemcpyHostToDevice, - stream_)); +} +void GraphDataGenerator::AllocResource(int thread_id, + std::vector feed_vec) { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + gpuid_ = gpu_graph_ptr->device_id_mapping[thread_id]; + thread_id_ = thread_id; + place_ = platform::CUDAPlace(gpuid_); + debug_gpu_memory_info(gpuid_, "AllocResource start"); + + platform::CUDADeviceGuard guard(gpuid_); + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + table_ = new HashTable( + train_table_cap_ / FLAGS_gpugraph_hbm_table_load_factor); + } + VLOG(1) << "AllocResource gpuid " << gpuid_ + << " feed_vec.size: " << feed_vec.size() + << " table cap: " << train_table_cap_; + sample_stream_ = gpu_graph_ptr->get_local_stream(gpuid_); + train_stream_ = dynamic_cast( + 
platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + // feed_vec_ = feed_vec; + slot_num_ = (feed_vec.size() - 3) / 2; + + // infer_node_type_start_ = std::vector(h_device_keys_.size(), 0); + // for (size_t i = 0; i < h_device_keys_.size(); i++) { + // for (size_t j = 0; j < h_device_keys_[i]->size(); j++) { + // VLOG(3) << "h_device_keys_[" << i << "][" << j + // << "] = " << (*(h_device_keys_[i]))[j]; + // } + // auto buf = memory::AllocShared( + // place_, h_device_keys_[i]->size() * sizeof(uint64_t)); + // d_device_keys_.push_back(buf); + // CUDA_CHECK(cudaMemcpyAsync(buf->ptr(), + // h_device_keys_[i]->data(), + // h_device_keys_[i]->size() * sizeof(uint64_t), + // cudaMemcpyHostToDevice, + // stream_)); + // } + auto &d_graph_all_type_keys = gpu_graph_ptr->d_graph_all_type_total_keys_; + auto &h_graph_all_type_keys_len = gpu_graph_ptr->h_graph_all_type_keys_len_; + + for (size_t i = 0; i < d_graph_all_type_keys.size(); i++) { + d_device_keys_.push_back(d_graph_all_type_keys[i][thread_id]); + h_device_keys_len_.push_back(h_graph_all_type_keys_len[i][thread_id]); } - // h_device_keys_ = h_device_keys; - // device_key_size_ = h_device_keys_->size(); - // d_device_keys_ = - // memory::AllocShared(place_, device_key_size_ * sizeof(int64_t)); - // CUDA_CHECK(cudaMemcpyAsync(d_device_keys_->ptr(), h_device_keys_->data(), - // device_key_size_ * sizeof(int64_t), - // cudaMemcpyHostToDevice, stream_)); + VLOG(2) << "h_device_keys size: " << h_device_keys_len_.size(); size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; - d_prefix_sum_ = - memory::AllocShared(place_, (once_max_sample_keynum + 1) * sizeof(int)); + d_prefix_sum_ = memory::AllocShared( + place_, + (once_max_sample_keynum + 1) * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); int *d_prefix_sum_ptr = reinterpret_cast(d_prefix_sum_->ptr()); - cudaMemsetAsync( - d_prefix_sum_ptr, 0, (once_max_sample_keynum + 1) * sizeof(int), stream_); + cudaMemsetAsync(d_prefix_sum_ptr, + 0, + (once_max_sample_keynum + 1) * sizeof(int), + sample_stream_); cursor_ = 0; jump_rows_ = 0; - d_walk_ = memory::AllocShared(place_, buf_size_ * sizeof(uint64_t)); - cudaMemsetAsync(d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - d_feature_ = - memory::AllocShared(place_, buf_size_ * slot_num_ * sizeof(uint64_t)); - cudaMemsetAsync( - d_feature_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); - } - d_sample_keys_ = - memory::AllocShared(place_, once_max_sample_keynum * sizeof(uint64_t)); + d_uniq_node_num_ = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync(d_uniq_node_num_->ptr(), 0, sizeof(uint64_t), sample_stream_); - d_sampleidx2rows_.push_back( - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); - d_sampleidx2rows_.push_back( - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); + d_walk_ = memory::AllocShared( + place_, + buf_size_ * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync( + d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), sample_stream_); + d_sample_keys_ = memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + + d_sampleidx2rows_.push_back(memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_)))); + d_sampleidx2rows_.push_back(memory::AllocShared( + 
place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_)))); cur_sampleidx2row_ = 0; - d_len_per_row_ = - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int)); + d_len_per_row_ = memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); for (int i = -window_; i < 0; i++) { window_step_.push_back(i); } @@ -1115,7 +1420,8 @@ void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, buf_state_.Init(batch_size_, walk_len_, &window_step_); d_random_row_ = memory::AllocShared( place_, - (once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int)); + (once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); shuffle_seed_ = 0; ins_buf_pair_len_ = 0; @@ -1126,14 +1432,15 @@ void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, place_, (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t)); } d_pair_num_ = memory::AllocShared(place_, sizeof(int)); - if (FLAGS_enable_opt_get_features && slot_num_ > 0) { - d_slot_tensor_ptr_ = - memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); - d_slot_lod_tensor_ptr_ = - memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); - } - cudaStreamSynchronize(stream_); + d_slot_tensor_ptr_ = + memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); + d_slot_lod_tensor_ptr_ = + memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); + + cudaStreamSynchronize(sample_stream_); + + debug_gpu_memory_info(gpuid_, "AllocResource end"); } void GraphDataGenerator::SetConfig( @@ -1153,48 +1460,22 @@ void GraphDataGenerator::SetConfig( repeat_time_ = graph_config.sample_times_one_chunk(); buf_size_ = once_sample_startid_len_ * walk_len_ * walk_degree_ * repeat_time_; - VLOG(2) << "Confirm GraphConfig, walk_degree : " << walk_degree_ + train_table_cap_ = graph_config.train_table_cap(); + infer_table_cap_ = graph_config.infer_table_cap(); + epoch_finish_ = false; + VLOG(0) << "Confirm GraphConfig, walk_degree : " << walk_degree_ << ", walk_len : " << walk_len_ << ", window : " << window_ << ", once_sample_startid_len : " << once_sample_startid_len_ << ", sample_times_one_chunk : " << repeat_time_ - << ", batch_size: " << batch_size_; + << ", batch_size: " << batch_size_ + << ", train_table_cap: " << train_table_cap_ + << ", infer_table_cap: " << infer_table_cap_; std::string first_node_type = graph_config.first_node_type(); std::string meta_path = graph_config.meta_path(); auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto edge_to_id = gpu_graph_ptr->edge_to_id; - auto node_to_id = gpu_graph_ptr->feature_to_id; - // parse first_node_type - auto node_types = - paddle::string::split_string(first_node_type, ";"); - VLOG(2) << "node_types: " << first_node_type; - finish_node_type_.clear(); - node_type_start_.clear(); - for (auto &type : node_types) { - auto iter = node_to_id.find(type); - PADDLE_ENFORCE_NE( - iter, - node_to_id.end(), - platform::errors::NotFound("(%s) is not found in node_to_id.", type)); - VLOG(2) << "node_to_id[" << type << "] = " << iter->second; - first_node_type_.push_back(iter->second); - node_type_start_[iter->second] = 0; - } - meta_path_.resize(first_node_type_.size()); - auto meta_paths = paddle::string::split_string(meta_path, ";"); - - for (size_t i = 0; i < meta_paths.size(); i++) { - auto path = meta_paths[i]; - auto nodes = paddle::string::split_string(path, "-"); - for (auto &node : nodes) { - 
auto iter = edge_to_id.find(node); - PADDLE_ENFORCE_NE( - iter, - edge_to_id.end(), - platform::errors::NotFound("(%s) is not found in edge_to_id.", node)); - VLOG(2) << "edge_to_id[" << node << "] = " << iter->second; - meta_path_[i].push_back(iter->second); - } - } + debug_gpu_memory_info("init_conf start"); + gpu_graph_ptr->init_conf(first_node_type, meta_path); + debug_gpu_memory_info("init_conf end"); }; } // namespace framework diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index ba9d5d0546791..7ae03968598aa 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -60,6 +60,8 @@ class Scope; class Variable; class NeighborSampleResult; class NodeQueryResult; +template +class HashTable; } // namespace framework } // namespace paddle @@ -878,6 +880,9 @@ struct BufState { int GetNextBatch() { cursor += len; + if (row_num - cursor < 0) { + return 0; + } int tmp_len = cursor + batch_size > row_num ? row_num - cursor : batch_size; if (tmp_len == 0) { return 0; @@ -895,11 +900,13 @@ class GraphDataGenerator { GraphDataGenerator(){}; virtual ~GraphDataGenerator(){}; void SetConfig(const paddle::framework::DataFeedDesc& data_feed_desc); - void AllocResource(const paddle::platform::Place& place, - std::vector feed_vec); + void AllocResource(int thread_id, std::vector feed_vec); + void SetFeedVec(std::vector feed_vec); int AcquireInstance(BufState* state); int GenerateBatch(); - int FillWalkBuf(std::shared_ptr d_walk); + int FillWalkBuf(); + int FillInferBuf(); + void DoWalk(); int FillFeatureBuf(uint64_t* d_walk, uint64_t* d_feature, size_t key_num); int FillFeatureBuf(std::shared_ptr d_walk, std::shared_ptr d_feature); @@ -911,12 +918,32 @@ class GraphDataGenerator { int step, int* len_per_row); int FillInsBuf(); + int FillIdShowClkTensor(int total_instance, + bool gpu_graph_training, + size_t cursor = 0); + int FillGraphSlotFeature(int total_instance, bool gpu_graph_training); + int MakeInsPair(); + int GetPathNum() { return total_row_; } + void ResetPathNum() {total_row_ = 0; } + void ResetEpochFinish() {epoch_finish_ = false; } + void ClearSampleState(); void SetDeviceKeys(std::vector* device_keys, int type) { - type_to_index_[type] = h_device_keys_.size(); - h_device_keys_.push_back(device_keys); - } + // type_to_index_[type] = h_device_keys_.size(); + // h_device_keys_.push_back(device_keys); + } + int InsertTable(const unsigned long* d_keys, + unsigned long len, + std::shared_ptr d_uniq_node_num); + std::shared_ptr GetTableKeys( + std::shared_ptr d_uniq_node_num, + uint64_t& h_uniq_node_num); + void CopyFeaFromTable(std::shared_ptr d_uniq_fea_num); + std::vector& GetHostVec() { return host_vec_; } + bool get_epoch_finish() {return epoch_finish_; } + void clear_gpu_mem(); protected: + HashTable* table_; int walk_degree_; int walk_len_; int window_; @@ -924,16 +951,14 @@ class GraphDataGenerator { int gpuid_; // start ids // int64_t* device_keys_; - // size_t device_key_size_; - std::vector*> h_device_keys_; - std::unordered_map type_to_index_; - // point to device_keys_ size_t cursor_; + int thread_id_; size_t jump_rows_; int64_t* id_tensor_ptr_; int64_t* show_tensor_ptr_; int64_t* clk_tensor_ptr_; - cudaStream_t stream_; + cudaStream_t train_stream_; + cudaStream_t sample_stream_; paddle::platform::Place place_; std::vector feed_vec_; std::vector offset_; @@ -941,9 +966,11 @@ class GraphDataGenerator { std::vector> d_device_keys_; std::shared_ptr d_walk_; + std::shared_ptr d_feature_list_; std::shared_ptr 
d_feature_; std::shared_ptr d_len_per_row_; std::shared_ptr d_random_row_; + std::shared_ptr d_uniq_node_num_; // std::vector> d_sampleidx2rows_; int cur_sampleidx2row_; @@ -951,10 +978,6 @@ class GraphDataGenerator { std::shared_ptr d_sample_keys_; int sample_keys_len_; - std::set finish_node_type_; - std::unordered_map node_type_start_; - std::vector infer_node_type_start_; - std::shared_ptr d_ins_buf_; std::shared_ptr d_feature_buf_; std::shared_ptr d_pair_num_; @@ -970,9 +993,15 @@ class GraphDataGenerator { int slot_num_; int shuffle_seed_; int debug_mode_; - std::vector first_node_type_; - std::vector> meta_path_; bool gpu_graph_training_; + bool epoch_finish_; + std::vector host_vec_; + std::vector h_device_keys_len_; + uint64_t train_table_cap_; + uint64_t infer_table_cap_; + int total_row_; + size_t infer_node_start_; + size_t infer_node_end_; }; class DataFeed { @@ -1037,11 +1066,29 @@ class DataFeed { virtual void SetParseLogKey(bool parse_logkey) {} virtual void SetEnablePvMerge(bool enable_pv_merge) {} virtual void SetCurrentPhase(int current_phase) {} + virtual void InitGraphResource() {} virtual void SetDeviceKeys(std::vector* device_keys, int type) { #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) gpu_graph_data_generator_.SetDeviceKeys(device_keys, type); #endif } +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + virtual const std::vector& GetHostVec() { + return gpu_graph_data_generator_.GetHostVec(); + } +#endif + + virtual void clear_gpu_mem() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.clear_gpu_mem(); +#endif + } + virtual bool get_epoch_finish() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + return gpu_graph_data_generator_.get_epoch_finish(); +#endif + } + virtual void SetGpuGraphMode(int gpu_graph_mode) { gpu_graph_mode_ = gpu_graph_mode; } @@ -1058,11 +1105,40 @@ class DataFeed { return ins_content_vec_; } virtual int GetCurBatchSize() { return batch_size_; } + virtual int GetGraphPathNum() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + return gpu_graph_data_generator_.GetPathNum(); +#else + return 0; +#endif + } + virtual void ResetPathNum() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ResetPathNum(); +#endif + } + + virtual void ClearSampleState() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ClearSampleState(); +#endif + } + + virtual void ResetEpochFinish() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ResetEpochFinish(); +#endif +} + virtual bool IsTrainMode() { return train_mode_; } virtual void LoadIntoMemory() { PADDLE_THROW(platform::errors::Unimplemented( "This function(LoadIntoMemory) is not implemented.")); } + virtual void DoWalk() { + PADDLE_THROW(platform::errors::Unimplemented( + "This function(DoWalk) is not implemented.")); + } virtual void SetPlace(const paddle::platform::Place& place) { place_ = place; } @@ -1637,6 +1713,7 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { // CustomParser* parser) {} virtual void PutToFeedVec(const std::vector& ins_vec) {} + virtual void InitGraphResource(void); virtual void LoadIntoMemoryByCommand(void); virtual void LoadIntoMemoryByLib(void); virtual void LoadIntoMemoryByLine(void); @@ -1671,6 +1748,8 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { const int float_slot_size, const 
UsedSlotGpuType* used_slots); #endif + virtual void DoWalk(); + float sample_rate_ = 1.0f; int use_slot_size_ = 0; int float_use_slot_size_ = 0; diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto index a7ab70948795f..38791a124c56e 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -38,6 +38,8 @@ message GraphConfig { optional string first_node_type = 8; optional string meta_path = 9; optional bool gpu_graph_training = 10 [ default = true ]; + optional int64 train_table_cap = 11 [ default = 80000 ]; + optional int64 infer_table_cap = 12 [ default = 80000 ]; } message DataFeedDesc { diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 1d70ef6a1c78b..e22136fdaf707 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -36,7 +36,9 @@ #endif USE_INT_STAT(STAT_total_feasign_num_in_mem); +USE_INT_STAT(STAT_epoch_finish); DECLARE_bool(graph_get_neighbor_id); +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -446,18 +448,6 @@ void MultiSlotDataset::PrepareTrain() { return; } -template -void DatasetImpl::SetGraphDeviceKeys( - const std::vector& h_device_keys) { - // for (size_t i = 0; i < gpu_graph_device_keys_.size(); i++) { - // gpu_graph_device_keys_[i].clear(); - // } - // size_t device_num = gpu_graph_device_keys_.size(); - // for (size_t i = 0; i < h_device_keys.size(); i++) { - // int shard = h_device_keys[i] % device_num; - // gpu_graph_device_keys_[shard].push_back(h_device_keys[i]); - // } -} // load data into memory, Dataset hold this memory, // which will later be fed into readers' channel template @@ -469,63 +459,54 @@ void DatasetImpl::LoadIntoMemory() { if (gpu_graph_mode_) { VLOG(0) << "in gpu_graph_mode"; #ifdef PADDLE_WITH_HETERPS - graph_all_type_total_keys_.clear(); - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto node_to_id = gpu_graph_ptr->feature_to_id; - auto edge_to_id = gpu_graph_ptr->edge_to_id; - graph_all_type_total_keys_.resize(node_to_id.size()); - int cnt = 0; - // set sample start node - for (auto& iter : node_to_id) { - int node_idx = iter.second; - std::vector> gpu_graph_device_keys; - gpu_graph_ptr->get_all_id( - 1, node_idx, thread_num_, &gpu_graph_device_keys); - auto& type_total_key = graph_all_type_total_keys_[cnt]; - type_total_key.resize(thread_num_); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - VLOG(2) << "node type: " << node_idx << ", gpu_graph_device_keys[" << i - << "] = " << gpu_graph_device_keys[i].size(); - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - type_total_key[i].push_back(gpu_graph_device_keys[i][j]); - } - } - + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->SetGpuGraphMode(gpu_graph_mode_); + } + + if (STAT_GET(STAT_epoch_finish) == 1) { + VLOG(0) << "get epoch finish true"; + STAT_RESET(STAT_epoch_finish, 0); for (size_t i = 0; i < readers_.size(); i++) { - readers_[i]->SetDeviceKeys(&type_total_key[i], node_idx); - readers_[i]->SetGpuGraphMode(gpu_graph_mode_); + readers_[i]->ResetPathNum(); + readers_[i]->ResetEpochFinish(); } - cnt++; + return; } - // add node embedding id - std::vector> gpu_graph_device_keys; - gpu_graph_ptr->get_node_embedding_ids(thread_num_, &gpu_graph_device_keys); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); + 
for (int64_t i = 0; i < thread_num_; ++i) { + load_threads.push_back( + std::thread(&paddle::framework::DataFeed::DoWalk, readers_[i].get())); + } + for (std::thread& t : load_threads) { + t.join(); + } + uint64_t node_num = 0; + for (int i = 0; i < thread_num_; i++) { + auto& host_vec = readers_[i]->GetHostVec(); + node_num += host_vec.size(); + } + gpu_graph_total_keys_.reserve(node_num); + for (int i = 0; i < thread_num_; i++) { + auto& host_vec = readers_[i]->GetHostVec(); + for (size_t j = 0; j < host_vec.size(); j++) { + gpu_graph_total_keys_.push_back(host_vec[j]); } } - // add feature embedding id - VLOG(2) << "begin add feature_id into gpu_graph_total_keys_ size[" - << gpu_graph_total_keys_.size() << "]"; - for (auto& iter : node_to_id) { - std::vector> gpu_graph_device_keys; - int node_idx = iter.second; - gpu_graph_ptr->get_all_feature_ids( - 1, node_idx, thread_num_, &gpu_graph_device_keys); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - VLOG(2) << "begin node type: " << node_idx << ", gpu_graph_device_keys[" - << i << "] = " << gpu_graph_device_keys[i].size(); - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); - } - VLOG(2) << "end node type: " << node_idx << ", gpu_graph_device_keys[" - << i << "] = " << gpu_graph_device_keys[i].size(); + if (GetEpochFinish() == true) { + VLOG(0) << "epoch finish, set stat and clear sample stat!"; + STAT_RESET(STAT_epoch_finish, 1); + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->ClearSampleState(); } } - VLOG(2) << "end add feature_id into gpu_graph_total_keys_ size[" + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->clear_gpu_mem(); + } + } + + VLOG(2) << "end add edge into gpu_graph_total_keys_ size[" << gpu_graph_total_keys_.size() << "]"; #endif } else { @@ -1123,7 +1104,26 @@ void DatasetImpl::DestroyPreLoadReaders() { template int64_t DatasetImpl::GetMemoryDataSize() { - return input_channel_->Size(); + if (gpu_graph_mode_) { + int64_t total_path_num = 0; + for (int i = 0; i < thread_num_; i++) { + total_path_num += readers_[i]->GetGraphPathNum(); + } + return total_path_num; + } else { + return input_channel_->Size(); + } +} + +template +bool DatasetImpl::GetEpochFinish() { + bool is_epoch_finish = true; + if (gpu_graph_mode_) { + for (int i = 0; i < thread_num_; i++) { + is_epoch_finish = is_epoch_finish && readers_[i]->get_epoch_finish(); + } + } + return is_epoch_finish; } template @@ -1780,6 +1780,7 @@ void SlotRecordDataset::CreateReaders() { readers_[i]->SetParseLogKey(parse_logkey_); readers_[i]->SetEnablePvMerge(enable_pv_merge_); readers_[i]->SetCurrentPhase(current_phase_); + readers_[i]->InitGraphResource(); if (input_channel_ != nullptr) { readers_[i]->SetInputChannel(input_channel_.get()); } diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 0489c2ece64e8..9e1998a35fd64 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -169,6 +169,10 @@ class Dataset { virtual void SetGpuGraphMode(int is_graph_mode) = 0; virtual int GetGpuGraphMode() = 0; + virtual bool GetEpochFinish() = 0; + + virtual void SetPassId(uint32_t pass_id) = 0; + virtual uint32_t GetPassID() = 0; protected: virtual int ReceiveFromClient(int msg_type, @@ -253,7 +257,7 @@ class DatasetImpl : public Dataset { int read_thread_num, int consume_thread_num, int shard_num) {} - virtual void 
SetGraphDeviceKeys(const std::vector& h_device_keys); + virtual void SetGraphDeviceKeys(const std::vector& h_device_keys) {} virtual void ClearLocalTables() {} virtual void CreatePreLoadReaders(); virtual void DestroyPreLoadReaders(); @@ -263,11 +267,7 @@ class DatasetImpl : public Dataset { virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); virtual std::vector GetSlots(); - /* for enable_heterps_ - virtual void EnableHeterps(bool enable_heterps) { - enable_heterps_ = enable_heterps; - } - */ + virtual bool GetEpochFinish(); std::vector>& GetMultiOutputChannel() { return multi_output_channel_; @@ -280,10 +280,13 @@ class DatasetImpl : public Dataset { return multi_consume_channel_; } } + Channel& GetInputChannelRef() { return input_channel_; } std::vector& GetGpuGraphTotalKeys() { return gpu_graph_total_keys_; } - Channel& GetInputChannelRef() { return input_channel_; } + + virtual void SetPassId(uint32_t pass_id) { pass_id_ = pass_id; } + virtual uint32_t GetPassID() { return pass_id_; } protected: virtual int ReceiveFromClient(int msg_type, @@ -344,9 +347,9 @@ class DatasetImpl : public Dataset { std::vector use_slots_; bool enable_heterps_ = false; int gpu_graph_mode_ = 0; - // std::vector> gpu_graph_device_keys_; - std::vector>> graph_all_type_total_keys_; + std::vector>> gpu_graph_type_keys_; std::vector gpu_graph_total_keys_; + uint32_t pass_id_ = 0; }; // use std::vector or Record as data type diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 4cf3ab8dc1a67..bacb096f751b1 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -29,7 +29,7 @@ if(WITH_HETERPS) nv_library( ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ps_framework_proto ${BRPC_DEPS}) + DEPS heter_ps gloo_wrapper ps_framework_proto graph_gpu_wrapper ${BRPC_DEPS}) else() nv_library( ps_gpu_wrapper diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index ef2e73d6dd5b5..cab31ade6c374 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -86,6 +86,7 @@ class HeterContext { int multi_mf_dim_ = 0; uint32_t shard_num_ = 37; + uint16_t pass_id_ = 0; uint64_t size() { uint64_t total_size = 0; for (auto& keys : feature_keys_) { diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h index 85bf6bb553b22..160de2646d7d9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h @@ -524,6 +524,7 @@ class concurrent_unordered_map : public managed { __forceinline__ __device__ iterator insert(const value_type& x, aggregation_type op, + uint64_t* local_count = NULL, comparison_type keys_equal = key_equal(), bool precomputed_hash = false, hash_value_type precomputed_hash_value = 0) { @@ -580,6 +581,10 @@ class concurrent_unordered_map : public managed { if (m_enable_collision_stat) { atomicAdd(&m_insert_times, 1); } + + if (local_count != NULL && keys_equal(unused_key, old_key)) { + atomicAdd(local_count, 1); + } break; } diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index 08a87b6a84688..c6b3bde4a2bbd 100644 --- 
a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -170,6 +170,7 @@ struct NeighborSampleResult { uint64_t *actual_val; int *actual_sample_size, sample_size, key_size; int total_sample_size; + cudaStream_t stream=0; std::shared_ptr val_mem, actual_sample_size_mem; std::shared_ptr actual_val_mem; uint64_t *get_val() { return val; } @@ -179,18 +180,30 @@ struct NeighborSampleResult { int get_key_size() { return key_size; } void set_total_sample_size(int s) { total_sample_size = s; } int get_len() { return total_sample_size; } + void set_stream(cudaStream_t stream_t) { + stream = stream_t; + } void initialize(int _sample_size, int _key_size, int dev_id) { sample_size = _sample_size; key_size = _key_size; platform::CUDADeviceGuard guard(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); - val_mem = - memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t)); + if (stream != 0) { + val_mem = + memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t), phi::Stream(reinterpret_cast(stream))); + actual_sample_size_mem = + memory::AllocShared(place, _key_size * sizeof(int), phi::Stream(reinterpret_cast(stream))); + } + else { + val_mem = + memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t)); + actual_sample_size_mem = + memory::AllocShared(place, _key_size * sizeof(int)); + } val = (uint64_t *)val_mem->ptr(); - actual_sample_size_mem = - memory::AllocShared(place, _key_size * sizeof(int)); actual_sample_size = (int *)actual_sample_size_mem->ptr(); } + void display() { VLOG(0) << "in node sample result display ------------------"; int64_t *res = new int64_t[sample_size * key_size]; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index aa202fe020fe9..d890b74ff9b2e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -112,8 +112,17 @@ class GpuPsGraphTable } } } + device_mutex_.resize(gpu_num); + for (int i = 0; i < gpu_num; i++) { + device_mutex_[i] = new std::mutex(); + } + } + ~GpuPsGraphTable() { + for (size_t i = 0; i < device_mutex_.size(); ++i) { + delete device_mutex_[i]; + } + device_mutex_.clear(); } - ~GpuPsGraphTable() {} void build_graph_on_single_gpu(const GpuPsCommGraph &g, int gpu_id, int idx); void build_graph_fea_on_single_gpu(const GpuPsCommGraphFea &g, int gpu_id); void clear_graph_info(int gpu_id, int index); @@ -154,7 +163,9 @@ class GpuPsGraphTable uint64_t *src_sample_res, int *actual_sample_size); int init_cpu_table(const paddle::distributed::GraphParameter &graph); - + gpuStream_t get_local_stream(int gpu_id) { + return resource_->local_stream(gpu_id, 0); + } int gpu_num; int graph_table_num_, feature_table_num_; std::vector gpu_graph_list_; @@ -165,6 +176,7 @@ class GpuPsGraphTable std::shared_ptr cpu_graph_table_; std::shared_ptr rw_lock; mutable std::mutex mutex_; + std::vector device_mutex_; std::condition_variable cv_; int cpu_table_status; }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu index 3693277a75d39..eac888383a3de 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu @@ -15,8 +15,8 @@ #include #include #include - #include +#include "cub/cub.cuh" #pragma once #ifdef 
PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" @@ -678,7 +678,10 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( int sample_size, int len, bool cpu_query_switch) { + device_mutex_[gpu_id]->lock(); NeighborSampleResult result; + auto stream = resource_->local_stream(gpu_id, 0); + result.set_stream(stream); result.initialize(sample_size, len, resource_->dev_id(gpu_id)); if (len == 0) { @@ -691,15 +694,20 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( int* actual_sample_size = result.actual_sample_size; uint64_t* val = result.val; int total_gpu = resource_->total_device(); - auto stream = resource_->local_stream(gpu_id, 0); int grid_size = (len - 1) / block_size_ + 1; int h_left[total_gpu]; // NOLINT int h_right[total_gpu]; // NOLINT - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_left = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + auto d_right = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); int default_value = 0; @@ -710,15 +718,26 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream)); // - auto d_idx = memory::Alloc(place, len * sizeof(int)); + auto d_idx = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_shard_keys = memory::Alloc(place, len * sizeof(uint64_t)); + auto d_shard_keys = + memory::Alloc(place, + len * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_vals = - memory::Alloc(place, sample_size * len * sizeof(uint64_t)); + memory::Alloc(place, + sample_size * len * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); + auto d_shard_actual_sample_size = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_actual_sample_size_ptr = reinterpret_cast(d_shard_actual_sample_size->ptr()); @@ -730,10 +749,17 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaMemcpy( - h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy( - h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(h_left, + d_left_ptr, + total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaMemcpyAsync(h_right, + d_right_ptr, + total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { @@ -911,29 +937,50 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( CUDA_CHECK(cudaStreamSynchronize(stream)); platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - - thrust::device_vector t_actual_sample_size(len); - thrust::copy(actual_sample_size, - actual_sample_size + len, - t_actual_sample_size.begin()); - int total_sample_size = thrust::reduce(t_actual_sample_size.begin(), - t_actual_sample_size.end()); - - result.actual_val_mem = - memory::AllocShared(place, total_sample_size * sizeof(uint64_t)); + size_t temp_storage_bytes = 0; + int total_sample_size = 0; + auto cumsum_actual_sample_size = + memory::Alloc(place, + (len + 1) * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + int* cumsum_actual_sample_size_p = + reinterpret_cast(cumsum_actual_sample_size->ptr()); + CUDA_CHECK( + cudaMemsetAsync(cumsum_actual_sample_size_p, 0, sizeof(int), stream)); + CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, + temp_storage_bytes, + actual_sample_size, + cumsum_actual_sample_size_p + 1, + len, + stream)); + auto d_temp_storage = + memory::Alloc(place, + temp_storage_bytes, + phi::Stream(reinterpret_cast(stream))); + CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + actual_sample_size, + cumsum_actual_sample_size_p + 1, + len, + stream)); + CUDA_CHECK(cudaMemcpyAsync(&total_sample_size, + cumsum_actual_sample_size_p + len, + sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + result.actual_val_mem = memory::AllocShared( + place, + total_sample_size * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); result.actual_val = (uint64_t*)(result.actual_val_mem)->ptr(); result.set_total_sample_size(total_sample_size); - thrust::device_vector cumsum_actual_sample_size(len); - thrust::exclusive_scan(t_actual_sample_size.begin(), - t_actual_sample_size.end(), - cumsum_actual_sample_size.begin(), - 0); fill_actual_vals<<>>( val, result.actual_val, actual_sample_size, - thrust::raw_pointer_cast(cumsum_actual_sample_size.data()), + cumsum_actual_sample_size_p, sample_size, len); } @@ -945,6 +992,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( destroy_storage(gpu_id, i); } cudaStreamSynchronize(stream); + device_mutex_[gpu_id]->unlock(); return result; } @@ -1002,6 +1050,7 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, uint64_t* d_feature, int node_num, int slot_num) { + device_mutex_[gpu_id]->lock(); if (node_num == 0) { return -1; } @@ -1011,23 +1060,40 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, int total_gpu = resource_->total_device(); auto stream = resource_->local_stream(gpu_id, 0); - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_left = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + auto d_right = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream)); // - auto d_idx = memory::Alloc(place, node_num * sizeof(int)); + auto d_idx = + memory::Alloc(place, + node_num * sizeof(int), + 
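The hunk above replaces the thrust reduce + exclusive_scan pair with a single cub::DeviceScan::InclusiveSum into a (len + 1)-element buffer whose first slot is zeroed, so the last element doubles as the total sample count. A condensed sketch of that idiom, assuming d_counts (len ints) and d_cumsum (len + 1 ints) already live on the device and stream is a valid CUDA stream:

size_t temp_bytes = 0;  // first call only sizes the temporary storage
cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_counts, d_cumsum + 1, len, stream);
auto d_temp = paddle::memory::Alloc(
    place, temp_bytes, phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
cudaMemsetAsync(d_cumsum, 0, sizeof(int), stream);  // d_cumsum[0] = 0
cub::DeviceScan::InclusiveSum(d_temp->ptr(), temp_bytes,
                              d_counts, d_cumsum + 1, len, stream);
int total = 0;  // d_cumsum[i + 1] = d_counts[0] + ... + d_counts[i]
cudaMemcpyAsync(&total, d_cumsum + len, sizeof(int), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);  // total now holds the grand sum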
phi::Stream(reinterpret_cast(stream))); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_shard_keys = memory::Alloc(place, node_num * sizeof(uint64_t)); + auto d_shard_keys = + memory::Alloc(place, + node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_vals = - memory::Alloc(place, slot_num * node_num * sizeof(uint64_t)); + memory::Alloc(place, + slot_num * node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_size = memory::Alloc(place, node_num * sizeof(int)); + auto d_shard_actual_size = + memory::Alloc(place, + node_num * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_actual_size_ptr = reinterpret_cast(d_shard_actual_size->ptr()); @@ -1131,6 +1197,7 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, } CUDA_CHECK(cudaStreamSynchronize(stream)); + device_mutex_[gpu_id]->unlock(); return 0; } diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index 4cc1b746a558f..6e0ea83d92991 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -28,6 +28,121 @@ void GraphGpuWrapper::set_device(std::vector ids) { } } +void GraphGpuWrapper::init_conf(const std::string &first_node_type, + const std::string &meta_path) { + static std::mutex mutex; + { + std::lock_guard lock(mutex); + if (conf_initialized_) { + return; + } + VLOG(2) << "init path config"; + conf_initialized_ = true; + auto node_types = + paddle::string::split_string(first_node_type, ";"); + VLOG(2) << "node_types: " << first_node_type; + for (auto &type : node_types) { + auto iter = feature_to_id.find(type); + PADDLE_ENFORCE_NE(iter, + feature_to_id.end(), + platform::errors::NotFound( + "(%s) is not found in feature_to_id.", type)); + VLOG(2) << "feature_to_id[" << type << "] = " << iter->second; + first_node_type_.push_back(iter->second); + } + meta_path_.resize(first_node_type_.size()); + auto meta_paths = paddle::string::split_string(meta_path, ";"); + + for (size_t i = 0; i < meta_paths.size(); i++) { + auto path = meta_paths[i]; + auto nodes = paddle::string::split_string(path, "-"); + for (auto &node : nodes) { + auto iter = edge_to_id.find(node); + PADDLE_ENFORCE_NE(iter, + edge_to_id.end(), + platform::errors::NotFound( + "(%s) is not found in edge_to_id.", node)); + VLOG(2) << "edge_to_id[" << node << "] = " << iter->second; + meta_path_[i].push_back(iter->second); + } + } + int max_dev_id = 0; + for (size_t i = 0; i < device_id_mapping.size(); i++) { + if (device_id_mapping[i] > max_dev_id) { + max_dev_id = device_id_mapping[i]; + } + } + finish_node_type_.resize(max_dev_id + 1); + node_type_start_.resize(max_dev_id + 1); + global_infer_node_type_start_.resize(max_dev_id + 1); + for (size_t i = 0; i < device_id_mapping.size(); i++) { + int dev_id = device_id_mapping[i]; + auto &node_type_start = node_type_start_[i]; + auto &infer_node_type_start = global_infer_node_type_start_[i]; + auto &finish_node_type = finish_node_type_[i]; + finish_node_type.clear(); + + for (size_t idx = 0; idx < feature_to_id.size(); idx++) { + infer_node_type_start[idx] = 0; + } + for (auto &type : node_types) { + auto iter = feature_to_id.find(type); + node_type_start[iter->second] = 0; + infer_node_type_start[iter->second] = 0; + } + 
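graph_neighbor_sample_v2 and get_feature_of_nodes are now serialized per device through device_mutex_[gpu_id], taken with explicit lock()/unlock() calls at entry and exit; note that the pre-existing early return on node_num == 0 in get_feature_of_nodes now returns with the mutex still held. A scope-based guard expresses the same serialization without that hazard; a minimal, self-contained sketch (names are illustrative, std::lock_guard is standard C++):

#include <mutex>
#include <vector>

std::vector<std::mutex> device_mutex(8);  // one mutex per GPU; size is illustrative

int query_on_device(int gpu_id, int node_num) {
  std::lock_guard<std::mutex> guard(device_mutex[gpu_id]);  // released on every return path
  if (node_num == 0) {
    return -1;  // early return no longer leaves the device locked
  }
  // ... shard keys, launch kernels, copy results back ...
  return 0;
}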
infer_cursor_.push_back(0); + cursor_.push_back(0); + } + init_type_keys(); + } +} + +void GraphGpuWrapper::init_type_keys() { + size_t thread_num = device_id_mapping.size(); + int cnt = 0; + + auto &graph_all_type_total_keys = get_graph_type_keys(); + auto &type_to_index = get_graph_type_to_index(); + std::vector> tmp_keys; + tmp_keys.resize(thread_num); + d_graph_all_type_total_keys_.resize(graph_all_type_total_keys.size()); + h_graph_all_type_keys_len_.resize(graph_all_type_total_keys.size()); + for (size_t f_idx = 0; f_idx < graph_all_type_total_keys.size(); f_idx++) { + for (size_t j = 0; j < tmp_keys.size(); j++) { + tmp_keys[j].clear(); + } + d_graph_all_type_total_keys_[f_idx].resize(thread_num); + auto &type_total_key = graph_all_type_total_keys[f_idx]; + for (size_t j = 0; j < type_total_key.size(); j++) { + uint64_t shard = type_total_key[j] % thread_num; + tmp_keys[shard].push_back(type_total_key[j]); + } + for (size_t j = 0; j < thread_num; j++) { + h_graph_all_type_keys_len_[f_idx].push_back(tmp_keys[j].size()); + VLOG(1) << "node type: " << type_to_index[f_idx] + << ", gpu_graph_device_keys[" << j + << "] = " << tmp_keys[j].size(); + } + for (size_t j = 0; j < thread_num; j++) { + auto stream = get_local_stream(j); + int gpuid = device_id_mapping[j]; + auto place = platform::CUDAPlace(gpuid); + platform::CUDADeviceGuard guard(gpuid); + d_graph_all_type_total_keys_[f_idx][j] = + memory::AllocShared(place, tmp_keys[j].size() * sizeof(uint64_t)); + cudaMemcpyAsync(d_graph_all_type_total_keys_[f_idx][j]->ptr(), + tmp_keys[j].data(), + sizeof(uint64_t) * tmp_keys[j].size(), + cudaMemcpyHostToDevice, + stream); + } + } + for (int i = 0; i < thread_num; i++) { + auto stream = get_local_stream(i); + cudaStreamSynchronize(stream); + } +} + int GraphGpuWrapper::get_all_id(int type, int slice_num, std::vector> *output) { @@ -144,6 +259,15 @@ void GraphGpuWrapper::load_edge_file(std::string name, } } +void GraphGpuWrapper::load_edge_file(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->parse_edge_and_load( + etype2files, graph_data_local_path, part_num, reverse); +} + void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) { // 'n' means load nodes and 'node_type' follows @@ -155,15 +279,22 @@ void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) { } } -void GraphGpuWrapper::load_node_and_edge(std::string etype, - std::string ntype, - std::string epath, - std::string npath, +void GraphGpuWrapper::load_node_file(std::string ntype2files, + std::string graph_data_local_path, + int part_num) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->parse_node_and_load( + ntype2files, graph_data_local_path, part_num); +} + +void GraphGpuWrapper::load_node_and_edge(std::string etype2files, + std::string ntype2files, + std::string graph_data_local_path, int part_num, bool reverse) { ((GpuPsGraphTable *)graph_table) ->cpu_graph_table_->load_node_and_edge_file( - etype, ntype, epath, npath, part_num, reverse); + etype2files, ntype2files, graph_data_local_path, part_num, reverse); } void GraphGpuWrapper::add_table_feat_conf(std::string table_name, @@ -196,6 +327,10 @@ void GraphGpuWrapper::add_table_feat_conf(std::string table_name, } void GraphGpuWrapper::init_search_level(int level) { search_level = level; } +gpuStream_t GraphGpuWrapper::get_local_stream(int gpuid) { + return ((GpuPsGraphTable *)graph_table)->get_local_stream(gpuid); +} + void 
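init_type_keys above distributes each node type's key list across devices by key % thread_num, uploads every shard on that device's local stream, and synchronizes only once all copies are in flight. A reduced sketch of the shard-and-upload step under those assumptions (shard_and_upload and its parameters are illustrative; the returned allocations must be kept alive, as d_graph_all_type_total_keys_ does in the patch):

std::vector<std::shared_ptr<phi::Allocation>> shard_and_upload(
    const std::vector<uint64_t>& keys,
    const std::vector<int>& device_ids,
    const std::vector<cudaStream_t>& streams) {
  size_t n = device_ids.size();
  std::vector<std::vector<uint64_t>> shards(n);
  for (uint64_t key : keys) shards[key % n].push_back(key);  // deterministic placement
  std::vector<std::shared_ptr<phi::Allocation>> buffers(n);
  for (size_t dev = 0; dev < n; ++dev) {
    if (shards[dev].empty()) continue;
    paddle::platform::CUDADeviceGuard guard(device_ids[dev]);
    buffers[dev] = paddle::memory::AllocShared(
        paddle::platform::CUDAPlace(device_ids[dev]),
        shards[dev].size() * sizeof(uint64_t));
    cudaMemcpyAsync(buffers[dev]->ptr(), shards[dev].data(),
                    shards[dev].size() * sizeof(uint64_t),
                    cudaMemcpyHostToDevice, streams[dev]);
  }
  for (size_t dev = 0; dev < n; ++dev) {
    cudaStreamSynchronize(streams[dev]);  // wait once, after every copy is queued
  }
  return buffers;  // device-resident key shards, one per GPU
}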
GraphGpuWrapper::init_service() { table_proto.set_task_pool_size(24); table_proto.set_shard_num(1000); @@ -357,9 +492,6 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( res.push_back(cpu_key[i * sample_size + j]); } } - /* for(int i = 0;i < res.size();i ++) { */ - /* VLOG(0) << i << " " << res[i]; */ - /* } */ delete[] actual_sample_size; cudaFree(cuda_key); return res; @@ -386,6 +518,31 @@ void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) { return ((GpuPsGraphTable *)graph_table) ->cpu_graph_table_->export_partition_files(idx, file_path); } + +void GraphGpuWrapper::release_graph() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph(); +} + +void GraphGpuWrapper::release_graph_edge() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph_edge(); +} + +void GraphGpuWrapper::release_graph_node() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph_node(); +} + +std::vector &GraphGpuWrapper::get_graph_total_keys() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->graph_total_keys_; +} + +std::vector> &GraphGpuWrapper::get_graph_type_keys() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->graph_type_keys_; +} + +std::unordered_map &GraphGpuWrapper::get_graph_type_to_index() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->type_to_index_; +} + #endif } // namespace framework }; // namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index b41303b85e0df..8f7c29f487f4e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -22,6 +22,14 @@ namespace paddle { namespace framework { #ifdef PADDLE_WITH_HETERPS + +enum GpuGraphStorageMode { + WHOLE_HBM = 1, + MEM_EMB_AND_GPU_GRAPH, + MEM_EMB_FEATURE_AND_GPU_GRAPH, + SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH +}; + class GraphGpuWrapper { public: static std::shared_ptr GetInstance() { @@ -31,6 +39,8 @@ class GraphGpuWrapper { return s_instance_; } static std::shared_ptr s_instance_; + void init_conf(const std::string& first_node_type, + const std::string& meta_path); void initialize(); void finalize(); void set_device(std::vector ids); @@ -47,11 +57,19 @@ class GraphGpuWrapper { std::string feat_dtype, int feat_shape); void load_edge_file(std::string name, std::string filepath, bool reverse); + void load_edge_file(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse); + void load_node_file(std::string name, std::string filepath); - void load_node_and_edge(std::string etype, - std::string ntype, - std::string epath, - std::string npath, + void load_node_file(std::string ntype2files, + std::string graph_data_local_path, + int part_num); + + void load_node_and_edge(std::string etype2files, + std::string ntype2files, + std::string graph_data_local_path, int part_num, bool reverse); int32_t load_next_partition(int idx); @@ -93,6 +111,7 @@ class GraphGpuWrapper { uint64_t* device_keys, int walk_degree, int len); + gpuStream_t get_local_stream(int gpuid); std::vector graph_neighbor_sample(int gpu_id, int idx, std::vector& key, @@ -104,6 +123,14 @@ class GraphGpuWrapper { uint32_t size, int slot_num); + void release_graph(); + void release_graph_edge(); + void release_graph_node(); + void init_type_keys(); + std::vector& get_graph_total_keys(); + std::vector>& get_graph_type_keys(); + std::unordered_map& 
get_graph_type_to_index(); + std::unordered_map edge_to_id, feature_to_id; std::vector id_to_feature, id_to_edge; std::vector> table_feat_mapping; @@ -117,6 +144,19 @@ class GraphGpuWrapper { int upload_num = 8; std::shared_ptr<::ThreadPool> upload_task_pool; std::string feature_separator_ = std::string(" "); + // + bool conf_initialized_ = false; + std::vector first_node_type_; + std::vector> meta_path_; + + std::vector> finish_node_type_; + std::vector> node_type_start_; + std::vector> global_infer_node_type_start_; + std::vector infer_cursor_; + std::vector cursor_; + std::vector>> + d_graph_all_type_total_keys_; + std::vector> h_graph_all_type_keys_len_; }; #endif } // namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 18fb2eca5b752..05c254b2739f2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -124,6 +124,12 @@ class HashTable { size_t len, StreamType stream); + template + void insert(const KeyType* d_keys, + size_t len, + uint64_t* global_num, + StreamType stream); + template void insert(const KeyType* d_keys, size_t len, @@ -153,6 +159,9 @@ class HashTable { template void dump_to_cpu(int devid, StreamType stream); + template + void get_keys(KeyType* d_out, uint64_t* global_cursor, StreamType stream); + #if defined(PADDLE_WITH_CUDA) template @@ -185,7 +194,7 @@ class HashTable { #endif int size() { return container_->size(); } - + thrust::pair* data() { return container_->data(); } void set_feature_value_size(size_t pull_feature_value_size, size_t push_grad_value_size) { pull_feature_value_size_ = pull_feature_value_size; @@ -194,6 +203,12 @@ class HashTable { << " push value size: " << push_grad_value_size_; } + int prefetch(const int dev_id, cudaStream_t stream = 0) { + return container_->prefetch(dev_id, stream); + } + + void clear(cudaStream_t stream = 0) { container_->clear_async(stream); } + void show_collision(int id) { return container_->print_collision(id); } std::unique_ptr rwlock_{nullptr}; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 1fda5a586a2e8..33b50f789a49c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -31,6 +31,35 @@ struct ReplaceOp { } }; +template +__global__ void insert_kernel(Table* table, + const typename Table::key_type* const keys, + size_t len, + uint64_t* global_num) { + ReplaceOp op; + thrust::pair kv; + + __shared__ uint64_t local_num; + + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len) { + kv.first = keys[i]; + kv.second = 1; // fake value + auto it = table->insert(kv, op, &local_num); + assert(it != table->end() && "error: insert fails: table is full"); + } + __syncthreads(); + + if (threadIdx.x == 0) { + atomicAdd(global_num, local_num); + } +} + template __global__ void insert_kernel(Table* table, const typename Table::key_type* const keys, @@ -38,7 +67,6 @@ __global__ void insert_kernel(Table* table, size_t len) { ReplaceOp op; thrust::pair kv; - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { kv.first = keys[i]; @@ -139,6 +167,41 @@ __global__ void dy_mf_update_kernel(Table* table, } } +template +__global__ void get_keys_kernel(Table* table, + typename Table::key_type* d_out, + uint64_t* 
global_cursor, + uint64_t unused_key) { + extern __shared__ typename Table::key_type local_key[]; + __shared__ uint64_t local_num; + __shared__ uint64_t global_num; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + uint64_t len = table->size(); + if (idx < len) { + typename Table::value_type val = *(table->data() + idx); + if (val.first != unused_key) { + uint64_t dst = atomicAdd(&local_num, 1); + local_key[dst] = val.first; + } + } + + __syncthreads(); + + if (threadIdx.x == 0) { + global_num = atomicAdd(global_cursor, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + d_out[global_num + threadIdx.x] = local_key[threadIdx.x]; + } +} + template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); @@ -211,6 +274,20 @@ void HashTable::get(const KeyType* d_keys, container_, d_keys, d_vals, len, pull_feature_value_size_, fv_accessor); } +template +template +void HashTable::insert(const KeyType* d_keys, + size_t len, + uint64_t* global_num, + StreamType stream) { + if (len == 0) { + return; + } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + insert_kernel<<>>( + container_, d_keys, len, global_num); +} + template template void HashTable::insert(const KeyType* d_keys, @@ -225,6 +302,20 @@ void HashTable::insert(const KeyType* d_keys, container_, d_keys, d_vals, len); } +template +template +void HashTable::get_keys(KeyType* d_out, + uint64_t* global_cursor, + StreamType stream) { + size_t len = container_->size(); + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + KeyType unuse_key = std::numeric_limits::max(); + size_t shared_mem_size = sizeof(KeyType) * BLOCK_SIZE_; + get_keys_kernel<<>>( + container_, d_out, global_cursor, unuse_key); +} + + template template void HashTable::insert(const KeyType* d_keys, @@ -436,6 +527,17 @@ template void HashTable::insert( size_t len, cudaStream_t stream); +template void HashTable::get_keys( + unsigned long* d_out, + unsigned long* global_cursor, + cudaStream_t stream); + +template void HashTable::insert( + const unsigned long* d_keys, + unsigned long len, + uint64_t* global_num, + cudaStream_t stream); + template void HashTable::insert( const unsigned long* d_keys, const unsigned long* d_vals, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 82532836b8e22..cf6c4eaf8b99a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -1127,13 +1127,22 @@ void HeterComm::split_input_to_shard( AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); - auto d_idx_tmp = memory::Alloc(place, len * sizeof(int)); + auto d_idx_tmp = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_idx_tmp_ptr = reinterpret_cast(d_idx_tmp->ptr()); - auto d_shard_index = memory::Alloc(place, len * sizeof(int)); + auto d_shard_index = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_index_ptr = reinterpret_cast(d_shard_index->ptr()); - auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); + auto d_shard_index_tmp = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream); @@ -1153,7 +1162,10 @@ void 
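The new insert/get_keys kernels above share one reduction trick: every thread first accumulates into a __shared__ per-block counter, and a single thread per block folds that counter into the global total, cutting global atomics from one per key to one per block. A standalone sketch of the pattern (the predicate stands in for "key was newly inserted"):

__global__ void count_matching(const unsigned long long* keys, size_t len,
                               unsigned long long* global_num) {
  __shared__ unsigned long long local_num;
  if (threadIdx.x == 0) local_num = 0;
  __syncthreads();
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len && keys[i] != 0ULL) {    // illustrative predicate
    atomicAdd(&local_num, 1ULL);       // cheap shared-memory atomic
  }
  __syncthreads();
  if (threadIdx.x == 0) {
    atomicAdd(global_num, local_num);  // one global atomic per block
  }
}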
HeterComm::split_input_to_shard( num_bits, stream); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + auto d_temp_storage = + memory::Alloc(place, + temp_storage_bytes, + phi::Stream(reinterpret_cast(stream))); heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h index 4696a7cc91b5a..22d6199584f58 100644 --- a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -96,6 +96,53 @@ class HBMMemoryPool : public managed { size_t block_size_; }; +class HBMMemoryPoolFix : public managed { + public: + HBMMemoryPoolFix() { + capacity_ = 0; + size_ = 0 ; + block_size_ = 0; + max_byte_capacity_ = 0; + } + + ~HBMMemoryPoolFix() { + VLOG(3) << "delete hbm memory pool"; + cudaFree(mem_); + } + + size_t block_size() { return block_size_; } + + void clear(void) { cudaMemset(mem_, 0, block_size_ * capacity_); } + + void reset(size_t capacity, size_t block_size) { + if (max_byte_capacity_ < capacity * block_size) { + if (mem_ != NULL) { + cudaFree(mem_); + } + max_byte_capacity_ = (block_size * capacity / 8 + 1) * 8; + CUDA_CHECK(cudaMalloc(&mem_, max_byte_capacity_)); + } + size_ = capacity; + block_size_ = block_size; + capacity_ = max_byte_capacity_ / block_size; + } + + char* mem() { return mem_; } + + size_t capacity() { return capacity_; } + size_t size() { return size_; } + __forceinline__ __device__ void* mem_address(const uint32_t& idx) { + return (void*)&mem_[(idx)*block_size_]; + } + + private: + char* mem_ = NULL; + size_t capacity_; + size_t size_; + size_t block_size_; + size_t max_byte_capacity_; +}; + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 91eb11d9dbdbc..a61860c7f3608 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -34,12 +34,14 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/platform/timer.h" #if defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #endif DECLARE_int32(gpugraph_dedup_pull_push_mode); +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -111,6 +113,102 @@ void PSGPUWrapper::InitAfsApi(const std::string& fs_name, use_afs_api_ = 1; } #endif + +void PSGPUWrapper::add_key_to_local(const std::vector& vec_data) { + size_t total_len = vec_data.size(); + size_t len_per_thread = total_len / thread_keys_thread_num_; + size_t begin = 0; + std::vector threads; + + int remain = total_len % thread_keys_thread_num_; + auto gen_graph_data_func = [this](const std::vector& total_data, + int begin_index, + int end_index, + int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; + iter++) { + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); + } + }; + auto gen_graph_dynamic_mf_func = [this]( + const std::vector& total_data, + int begin_index, + int end_index, + int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; + iter++) { + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + // TODO: feasign <-> slot <-> multi_dim + this->thread_dim_keys_[i][shard_id][0].insert(cur_key); + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + if (!multi_mf_dim_) { + threads.push_back( + std::thread(gen_graph_data_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); + } else { + threads.push_back( + std::thread(gen_graph_dynamic_mf_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); + } + begin += len_per_thread + (i < remain ? 
1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } +} + +void PSGPUWrapper::add_key_to_gputask(std::shared_ptr gpu_task) { + std::vector threads; + platform::Timer timeline; + timeline.Start(); + // merge thread_keys to shard_keys + auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys( + shard_num, dim_id, thread_dim_keys_[i][shard_num][dim_id]); + thread_dim_keys_[i][shard_num][dim_id].clear(); + } + }; + for (int i = 0; i < thread_keys_shard_num_; ++i) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); + } + } + for (auto& t : threads) { + t.join(); + } + timeline.Pause(); + + VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() + << " seconds."; + timeline.Start(); + gpu_task->UniqueKeys(); + timeline.Pause(); + VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + if (i == 0 && j == multi_mf_dim_ - 1) { + gpu_task->feature_dim_keys_[i][j].push_back(0); + } + gpu_task->value_dim_ptr_[i][j].resize( + gpu_task->feature_dim_keys_[i][j].size()); + } + } +} + void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin"; platform::Timer timeline; @@ -237,118 +335,24 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(0) << "PreBuild in GpuGraph mode"; SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_); const std::vector& vec_data = dataset->GetGpuGraphTotalKeys(); - - total_len = vec_data.size(); - len_per_thread = total_len / thread_keys_thread_num_; - VLOG(0) << "GpuGraphTotalKeys: " << total_len; - remain = total_len % thread_keys_thread_num_; - auto gen_graph_data_func = [this](const std::vector& total_data, - int begin_index, - int end_index, - int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; - iter++) { - uint64_t cur_key = *iter; - int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(cur_key); - } - }; - auto gen_graph_dynamic_mf_func = - [this](const std::vector& total_data, - int begin_index, - int end_index, - int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; - iter++) { - uint64_t cur_key = *iter; - int shard_id = cur_key % thread_keys_shard_num_; - // TODO: feasign <-> slot <-> multi_dim - this->thread_dim_keys_[i][shard_id][0].insert(cur_key); - } - }; - for (int i = 0; i < thread_keys_thread_num_; i++) { - if (!multi_mf_dim_) { - VLOG(1) << "psgpu graph wrapper genfunc"; - threads.push_back( - std::thread(gen_graph_data_func, - std::ref(vec_data), - begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); - } else { - VLOG(1) << "psgpu graph wrapper genfunc with dynamic mf"; - threads.push_back( - std::thread(gen_graph_dynamic_mf_func, - std::ref(vec_data), - begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); - } - begin += len_per_thread + (i < remain ? 
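add_key_to_local above hands each worker a contiguous range: len_per_thread keys for everyone plus one extra key for the first total_len % thread_num workers, so the ranges cover every key exactly once. For example, 10 keys over 4 threads gives [0,3), [3,6), [6,8), [8,10). A tiny self-contained sketch of the range computation (split_ranges is illustrative):

#include <cstddef>
#include <utility>
#include <vector>

// Returns [begin, end) ranges that partition `total` items over `workers` threads.
std::vector<std::pair<size_t, size_t>> split_ranges(size_t total, size_t workers) {
  size_t per = total / workers, remain = total % workers, begin = 0;
  std::vector<std::pair<size_t, size_t>> ranges;
  for (size_t i = 0; i < workers; ++i) {
    size_t len = per + (i < remain ? 1 : 0);
    ranges.emplace_back(begin, begin + len);
    begin += len;
  }
  return ranges;
}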
1 : 0); - } - for (std::thread& t : threads) { - t.join(); - } + VLOG(0) << "GpuGraphTotalKeys: " << vec_data.size(); + timeline.Start(); + add_key_to_local(vec_data); + timeline.Pause(); + VLOG(0) << "add_key_to_local cost " << timeline.ElapsedSec() << " seconds."; } - timeline.Start(); - - threads.clear(); - // merge thread_keys to shard_keys - auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { - for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys( - shard_num, dim_id, thread_dim_keys_[i][shard_num][dim_id]); - thread_dim_keys_[i][shard_num][dim_id].clear(); - } - }; - for (int i = 0; i < thread_keys_shard_num_; ++i) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); - } - } - for (auto& t : threads) { - t.join(); - } - timeline.Pause(); - - VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() - << " seconds."; - timeline.Start(); - gpu_task->UniqueKeys(); - timeline.Pause(); - - VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - if (i == 0 && j == multi_mf_dim_ - 1) { - gpu_task->feature_dim_keys_[i][j].push_back(0); - } - VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] - << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); - gpu_task->value_dim_ptr_[i][j].resize( - gpu_task->feature_dim_keys_[i][j].size()); - } - } + add_key_to_gputask(gpu_task); } void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { platform::Timer timeline; - std::vector> task_futures; - int device_num = heter_devices_.size(); - auto& local_keys = gpu_task->feature_keys_; - auto& local_ptr = gpu_task->value_ptr_; auto& local_dim_keys = gpu_task->feature_dim_keys_; auto& local_dim_ptr = gpu_task->value_dim_ptr_; - auto& device_keys = gpu_task->device_keys_; - auto& device_vals = gpu_task->device_values_; auto& device_dim_keys = gpu_task->device_dim_keys_; auto& device_dim_ptr = gpu_task->device_dim_ptr_; - auto& device_dim_mutex = gpu_task->dim_mutex_; for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { device_dim_keys[dev].resize(multi_mf_dim_); @@ -380,7 +384,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { timeline.Start(); auto ptl_dynamic_mf_func = - [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr](int i, int j) { + [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr, &gpu_task](int i, + int j) { size_t key_size = local_dim_keys[i][j].size(); int32_t status = -1; int32_t cnt = 0; @@ -421,10 +426,12 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #ifdef PADDLE_WITH_PSCORE while (true) { auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( + i, reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, local_dim_keys[i][j].data(), - key_size); + key_size, + gpu_task->pass_id_); bool flag = true; tt.wait(); @@ -461,7 +468,10 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } }; + // fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + + std::vector> task_futures; for (int i = 0; i < thread_keys_shard_num_; i++) { for (int j = 0; j < multi_mf_dim_; j++) { task_futures.emplace_back( @@ -471,6 +481,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { for (auto& f : task_futures) { f.wait(); } + // fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); task_futures.clear(); 
timeline.Pause(); VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() @@ -483,13 +494,29 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } gloo_wrapper->Barrier(); } +} - timeline.Start(); - std::vector>> pass_values; +void PSGPUWrapper::divide_to_device(std::shared_ptr gpu_task) { + platform::Timer timeline; + int device_num = heter_devices_.size(); + std::vector threads; + std::vector> task_futures; + auto& local_dim_keys = gpu_task->feature_dim_keys_; + auto& local_dim_ptr = gpu_task->value_dim_ptr_; - bool record_status = false; - auto& device_task_keys = gpu_task->device_task_keys_; - auto& device_task_ptrs = gpu_task->device_task_ptr_; + auto& device_dim_keys = gpu_task->device_dim_keys_; + auto& device_dim_ptr = gpu_task->device_dim_ptr_; + auto& device_dim_mutex = gpu_task->dim_mutex_; + // auto& device_mutex = gpu_task->mutex_; + + if (multi_mf_dim_) { + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); + } + } + + timeline.Start(); auto build_pull_dynamic_mf_func = [this, device_num, &local_dim_keys, @@ -526,6 +553,43 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { device_dim_mutex[dev][j]->unlock(); } }; + + if (multi_mf_dim_) { + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i * multi_mf_dim_ + j] = + std::thread(build_pull_dynamic_mf_func, i, j); + } + } + for (std::thread& t : threads) { + t.join(); + } + } + timeline.Pause(); + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + << " seconds."; +} + +void PSGPUWrapper::PrepareGPUTask(std::shared_ptr gpu_task) { + platform::Timer timeline; + int device_num = heter_devices_.size(); + std::vector threads; + std::vector> task_futures; + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; + + auto& device_keys = gpu_task->device_keys_; + auto& device_vals = gpu_task->device_values_; + // auto& device_mutex = gpu_task->mutex_; + + timeline.Start(); + std::vector>> pass_values; + + bool record_status = false; + auto& device_task_keys = gpu_task->device_task_keys_; + auto& device_task_ptrs = gpu_task->device_task_ptr_; + auto build_func = [device_num, record_status, &pass_values, @@ -653,18 +717,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #endif VLOG(3) << "GpuPs build hbmps done"; }; - - if (multi_mf_dim_) { - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i * multi_mf_dim_ + j] = - std::thread(build_pull_dynamic_mf_func, i, j); - } - } - for (std::thread& t : threads) { - t.join(); - } - } else { + if (!multi_mf_dim_) { for (int i = 0; i < thread_keys_shard_num_; i++) { for (int j = 0; j < device_num; j++) { task_futures.emplace_back( @@ -683,8 +736,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { int device_num = heter_devices_.size(); - platform::Timer timeline; - timeline.Start(); + platform::Timer stagetime; + stagetime.Start(); std::vector feature_keys_count(device_num); size_t size_max = 0; @@ -696,7 +749,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { << " dim index: " << j << " contains feasign nums: " << gpu_task->device_dim_ptr_[i][j].size(); } - VLOG(1) << i << " card with dynamic mf contains feasign nums total: " + VLOG(0) << i << " card 
with dynamic mf contains feasign nums total: " << feature_keys_count[i]; size_max = std::max(size_max, feature_keys_count[i]); } @@ -719,87 +772,28 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { HeterPs_->set_sparse_sgd(optimizer_config_); HeterPs_->set_embedx_sgd(optimizer_config_); #endif + stagetime.Pause(); + VLOG(0) << "card: " + << " BuildGPUTask create HeterPs_ costs: " << stagetime.ElapsedSec() + << " s."; + stagetime.Start(); - auto build_dymf_mem_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, - int j) { - this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim - << " feature_value_size:" - << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - size_t feature_value_size = - accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto build_dynamic_mf_func = [this, &gpu_task, &accessor_wrapper_ptr]( + int i, int j, size_t start, size_t end) { + // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - this->mem_pools_[i * this->multi_mf_dim_ + j] = - new MemoryPool(len, feature_value_size); - }; - auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, - int j) { - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - size_t len = device_dim_keys.size(); int mf_dim = this->index_dim_vec_[j]; size_t feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + size_t real_len = end - start; + std::shared_ptr build_values(new char[feature_value_size * real_len], + [](char* p) { delete[] p; }); + char* test_build_values = build_values.get(); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); - auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - - this->HeterPs_->build_ps(i, - device_dim_keys.data(), - cur_pool->mem(), - len, - feature_value_size, - 500000, - 2); - if (device_dim_keys.size() > 0) { - VLOG(3) << "show table: " << i - << " table kv size: " << device_dim_keys.size() - << "dim: " << mf_dim << " len: " << len; - HeterPs_->show_one_table(i); - } - delete mem_pool; - }; - int thread_num = 16; - auto build_dynamic_mf_func = [this, - &gpu_task, - thread_num, - &accessor_wrapper_ptr](int i, int j, int z) { - // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim; - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - // this->mem_pools_[i * this->multi_mf_dim_ + j] = - // new MemoryPool(len, feature_value_size); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - - // ============ add for multi-thread ================ - size_t len_per_thread = len / thread_num; - size_t remain = len % thread_num; - size_t left = 0, right = 0; - - size_t real_len = len_per_thread; - if ((size_t)z < remain) real_len++; - - if ((size_t)z < remain) { - left = z * (len_per_thread + 1); - right = left + real_len; - } else { - left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread; - 
right = left + real_len; - } - // ============ add for multi-thread ================ - - for (size_t k = left; k < right; k++) { + for (size_t k = start; k < end; k++) { #ifdef PADDLE_WITH_PSLIB - float* val = (float*)(mem_pool->mem_address(k)); + float* val = + (float*)(test_build_values + (k - start) * feature_value_size); float* ptr_val = device_dim_ptrs[k]->data(); size_t dim = device_dim_ptrs[k]->size(); val->delta_score = @@ -833,54 +827,124 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { } #endif #ifdef PADDLE_WITH_PSCORE - void* val = mem_pool->mem_address(k); + void* val = + (float*)(test_build_values + (k - start) * feature_value_size); accessor_wrapper_ptr->BuildFill( val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim); #endif } + task_info task; + task.build_values = build_values; + task.offset = start; + task.device_id = i; + task.multi_mf_dim = j; + task.start = 0; + task.end = int(real_len); + cpu_reday_channels_[i]->Put(task); }; - threads.resize(device_num * multi_mf_dim_); - for (int i = 0; i < device_num; i++) { + auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + std::vector threads(multi_mf_dim_); for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_dymf_mem_pool, i, j); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + size_t len = device_dim_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + this->hbm_pools_[i * this->multi_mf_dim_ + j]->reset(len, + feature_value_size); + + auto build_ps_thread = + [this, &gpu_task]( + int i, int j, size_t len, size_t feature_value_size) { + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + this->HeterPs_->build_ps( + i, + device_dim_keys.data(), + this->hbm_pools_[i * this->multi_mf_dim_ + j]->mem(), + len, + feature_value_size, + 500000, + 2); + if (device_dim_keys.size() > 0) { + VLOG(3) << "show table: " << i + << " table kv size: " << device_dim_keys.size() + << "dim: " << this->index_dim_vec_[j] << " len: " << len; + HeterPs_->show_one_table(i); + } + }; + threads[j] = std::thread(build_ps_thread, i, j, len, feature_value_size); } - } - for (std::thread& t : threads) { - t.join(); - } - threads.clear(); + struct task_info task; + while (cpu_reday_channels_[i]->Get(task)) { + auto hbm = this->hbm_pools_[task.device_id * this->multi_mf_dim_ + + task.multi_mf_dim] + ->mem(); + int mf_dim = this->index_dim_vec_[task.multi_mf_dim]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + auto hbm_start = hbm + task.offset * feature_value_size; + CUDA_CHECK( + cudaMemcpy(hbm_start, + task.build_values.get() + task.start * feature_value_size, + (task.end - task.start) * feature_value_size, + cudaMemcpyHostToDevice)); + } + platform::Timer stagetime; + stagetime.Start(); + for (std::thread& t : threads) { + t.join(); + } + stagetime.Pause(); + VLOG(0) << "card: " << i + << " BuildGPUTask build_ps async costs: " << stagetime.ElapsedSec() + << " s."; + }; + + std::vector> cpu_task_futures; + std::vector> gpu_task_futures; - // multi-thread process - threads.resize(device_num * multi_mf_dim_ * thread_num); + int once_gpu_copy = 64 * 1024; + threads.resize(device_num * multi_mf_dim_); for (int i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Open(); + gpu_task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(build_dymf_hbm_pool, i)); for (int j = 0; j < 
multi_mf_dim_; j++) { - for (int k = 0; k < thread_num; k++) { - threads[(i + j * device_num) * thread_num + k] = - std::thread(build_dynamic_mf_func, i, j, k); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + size_t len = device_dim_keys.size(); + size_t start = 0; + size_t end = 0; + while (end < len) { + start = end; + end = end + once_gpu_copy < len ? (end + once_gpu_copy) : len; + cpu_task_futures.emplace_back(cpu_work_pool_[i]->enqueue( + build_dynamic_mf_func, i, j, start, end)); } } } - for (std::thread& t : threads) { - t.join(); + stagetime.Start(); + for (auto& f : cpu_task_futures) { + f.wait(); } - threads.clear(); - threads.resize(device_num * multi_mf_dim_); + cpu_task_futures.clear(); + stagetime.Pause(); + VLOG(0) << " BuildGPUTask build_dynamic_mf_func " + << " cost " << stagetime.ElapsedSec() << " s."; for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_dymf_hbm_pool, i, j); - } + cpu_reday_channels_[i]->Close(); } - for (std::thread& t : threads) { - t.join(); + stagetime.Start(); + for (auto& f : gpu_task_futures) { + f.wait(); } - threads.clear(); - - timeline.Pause(); - VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() - << " s."; + gpu_task_futures.clear(); + stagetime.Pause(); + VLOG(0) << " BuildGPUTask build_dymf_hbm_pool " + << " cost " << stagetime.ElapsedSec() << " s."; } void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { @@ -890,17 +954,25 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { dataset_->LoadIntoMemory(); timer.Pause(); VLOG(0) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s"; - + gpu_graph_mode_ = dataset_->GetGpuGraphMode(); + if (dataset_->GetMemoryDataSize() == 0) { + VLOG(0) << "GetMemoryDataSize == 0"; + return; + } // local shuffle if (is_shuffle) { dataset_->LocalShuffle(); } - InitSlotInfo(); - gpu_graph_mode_ = dataset_->GetGpuGraphMode(); - std::shared_ptr gpu_task = gpu_task_pool_.Get(); - gpu_task->Reset(); - data_ready_channel_->Put(gpu_task); + InitSlotInfo(); + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); + gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); + data_ready_channel_->Put(gpu_task); + } else if (hbm_sparse_table_initialized_ == false) { + SparseTableToHbm(); + } VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; } @@ -909,6 +981,7 @@ void PSGPUWrapper::start_build_thread() { running_ = true; VLOG(3) << "start build CPU ps thread."; pre_build_threads_ = std::thread([this] { pre_build_thread(); }); + buildpull_threads_ = std::thread([this] { build_pull_thread(); }); } void PSGPUWrapper::pre_build_thread() { @@ -931,6 +1004,27 @@ void PSGPUWrapper::pre_build_thread() { VLOG(3) << "build cpu thread end"; } +void PSGPUWrapper::build_pull_thread() { + while (running_) { + std::shared_ptr gpu_task = nullptr; + if (!buildcpu_ready_channel_->Get(gpu_task)) { + continue; + } + VLOG(3) << "thread build pull start."; + platform::Timer timer; + timer.Start(); + // build cpu ps data process + BuildPull(gpu_task); + if (multi_mf_dim_) { + divide_to_device(gpu_task); + } + timer.Pause(); + VLOG(1) << "thread BuildPull end, cost time: " << timer.ElapsedSec() << "s"; + buildpull_ready_channel_->Put(gpu_task); + } + VLOG(3) << "build pull thread end"; +} + void PSGPUWrapper::build_task() { // build_task: build_pull + build_gputask std::shared_ptr gpu_task = nullptr; @@ -939,24 +1033,29 @@ void 
PSGPUWrapper::build_task() { return; } // ins and pre_build end - if (!buildcpu_ready_channel_->Get(gpu_task)) { + if (!buildpull_ready_channel_->Get(gpu_task)) { return; } - VLOG(0) << "BuildPull start."; + VLOG(0) << "PrepareGPUTask start."; platform::Timer timer; timer.Start(); - BuildPull(gpu_task); + if (!multi_mf_dim_) { + PrepareGPUTask(gpu_task); + } BuildGPUTask(gpu_task); timer.Pause(); - VLOG(0) << "BuildPull + BuildGPUTask end, cost time: " << timer.ElapsedSec() - << "s"; + VLOG(0) << "PrepareGPUTask + BuildGPUTask end, cost time: " + << timer.ElapsedSec() << "s"; current_task_ = gpu_task; } void PSGPUWrapper::BeginPass() { platform::Timer timer; + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + return; + } timer.Start(); if (current_task_) { PADDLE_THROW( @@ -982,12 +1081,59 @@ void PSGPUWrapper::BeginPass() { } void PSGPUWrapper::EndPass() { + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + return; + } + platform::Timer stagetime; + stagetime.Start(); + HbmToSparseTable(); + stagetime.Pause(); + VLOG(0) << "EndPass HbmToSparseTable cost time: " << stagetime.ElapsedSec() + << "s"; + + gpu_task_pool_.Push(current_task_); + current_task_ = nullptr; + gpu_free_channel_->Put(current_task_); + // fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); +} + +void PSGPUWrapper::SparseTableToHbm() { + std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); + size_t device_num = heter_devices_.size(); + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); + gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto node_to_id = gpu_graph_ptr->feature_to_id; + auto edge_to_id = gpu_graph_ptr->edge_to_id; + std::vector vec_data = gpu_graph_ptr->get_graph_total_keys(); + + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); + } + } + + add_key_to_local(vec_data); + add_key_to_gputask(gpu_task); + BuildPull(gpu_task); + if (!multi_mf_dim_) { + PrepareGPUTask(gpu_task); + } else { + divide_to_device(gpu_task); + } + BuildGPUTask(gpu_task); + current_task_ = gpu_task; + hbm_sparse_table_initialized_ = true; +} + +void PSGPUWrapper::HbmToSparseTable() { if (!current_task_) { PADDLE_THROW( platform::errors::Fatal("[EndPass] current task has been ended.")); } - platform::Timer timer; - timer.Start(); size_t keysize_max = 0; // in case of feasign_num = 0, skip dump_to_cpu @@ -997,86 +1143,125 @@ void PSGPUWrapper::EndPass() { std::max(keysize_max, current_task_->device_dim_keys_[i][j].size()); } } - int thread_num = 8; auto accessor_wrapper_ptr = GlobalAccessorFactory::GetInstance().GetAccessorWrapper(); - auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr]( - int i, int j, int z) { + // auto fleet_ptr = FleetWrapper::GetInstance(); + // fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); + + int once_cpu_num = 16 * 1024; + int once_gpu_copy = 8 * once_cpu_num; + + auto dump_pool_to_cpu_func = [this, &accessor_wrapper_ptr, once_cpu_num]( + int i, int j, size_t start, size_t end) { PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - auto& device_keys = this->current_task_->device_dim_keys_[i][j]; - size_t len = 
device_keys.size(); - // ====== multi-thread process feasign================ - int len_per_thread = len / thread_num; - int remain = len % thread_num; - int left = -1, right = -1; - int real_len = len_per_thread; - if (z < remain) real_len++; - if (z < remain) { - left = z * (len_per_thread + 1); - right = left + real_len; - } else { - left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread; - right = left + real_len; - } + size_t real_len = end - start; // ============ multi-thread process feasign============ int mf_dim = this->index_dim_vec_[j]; size_t feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim - << " key_len :" << len - << " feature_value_size:" << feature_value_size; - char* test_build_values = (char*)malloc(feature_value_size * real_len); - uint64_t offset = left * feature_value_size; + + std::shared_ptr build_values(new char[feature_value_size * real_len], + [](char* p) { delete[] p; }); + uint64_t offset = start * feature_value_size; + char* test_build_values = build_values.get(); + cudaMemcpy(test_build_values, hbm_pool->mem() + offset, feature_value_size * real_len, cudaMemcpyDeviceToHost); - CHECK(len == hbm_pool->capacity()); - uint64_t unuse_key = std::numeric_limits::max(); - for (int i = left; i < right; ++i) { - if (device_keys[i] == unuse_key) { - continue; - } - size_t local_offset = (i - left) * feature_value_size; - float* gpu_val = (float*)(test_build_values + local_offset); + for (size_t k = 0; k * once_cpu_num < real_len; k++) { + struct task_info task; + task.build_values = build_values; + task.offset = start; + task.device_id = i; + task.multi_mf_dim = j; + task.start = k * once_cpu_num; + task.end = (k + 1) * once_cpu_num < real_len ? 
((k + 1) * once_cpu_num) + : (real_len); + cpu_reday_channels_[i]->Put(task); + } + }; + auto cpu_func = [this, &accessor_wrapper_ptr](int j) { + struct task_info task; + while (cpu_reday_channels_[j]->Get(task)) { + auto& device_keys = + this->current_task_ + ->device_dim_keys_[task.device_id][task.multi_mf_dim]; + char* test_build_values = task.build_values.get(); + int mf_dim = this->index_dim_vec_[task.multi_mf_dim]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + uint64_t unuse_key = std::numeric_limits::max(); + for (int i = task.start; i < task.end; ++i) { + if (device_keys[i + task.offset] == unuse_key) { + continue; + } + size_t local_offset = i * feature_value_size; + float* gpu_val = (float*)(test_build_values + local_offset); #ifdef PADDLE_WITH_PSLIB - // TODO: PSLIB DumpFill + // TODO: PSLIB DumpFill #endif #ifdef PADDLE_WITH_PSCORE - accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); + accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); #endif + } } - free(test_build_values); }; + platform::Timer timer; + timer.Start(); + std::vector> cpu_task_futures; + std::vector> gpu_task_futures; + size_t thread_num = 16; + size_t device_num = heter_devices_.size(); if (multi_mf_dim_) { VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_ * thread_num); for (size_t i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Open(); for (int j = 0; j < multi_mf_dim_; j++) { - for (int k = 0; k < thread_num; k++) { - threads[(i + j * device_num) * thread_num + k] = - std::thread(dump_pool_to_cpu_func, i, j, k); + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; + size_t len = device_keys.size(); + size_t start = 0; + size_t end = 0; + while (end < len) { + start = end; + end = end + once_gpu_copy < len ? (end + once_gpu_copy) : len; + gpu_task_futures.emplace_back(hbm_thread_pool_[i]->enqueue( + dump_pool_to_cpu_func, i, j, start, end)); } } + for (size_t j = 0; j < thread_num; j++) { + cpu_task_futures.emplace_back(cpu_work_pool_[i]->enqueue(cpu_func, i)); + } } - for (std::thread& t : threads) { - t.join(); - } } + for (auto& f : gpu_task_futures) { + f.wait(); + } + timer.Pause(); + VLOG(0) << " EndPass dump_pool_to_cpu_func " + << " cost " << timer.ElapsedSec() << " s."; + for (size_t i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Close(); + } + gpu_task_futures.clear(); + timer.Start(); + for (auto& f : cpu_task_futures) { + f.wait(); + } + cpu_task_futures.clear(); + timer.Pause(); + VLOG(0) << " EndPass cpu_func " + << " cost " << timer.ElapsedSec() << " s."; if (keysize_max != 0) { HeterPs_->end_pass(); } +} - for (size_t i = 0; i < hbm_pools_.size(); i++) { - delete hbm_pools_[i]; +void PSGPUWrapper::DumpToMem() { + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + this->HbmToSparseTable(); } - gpu_task_pool_.Push(current_task_); - current_task_ = nullptr; - gpu_free_channel_->Put(current_task_); - timer.Pause(); - VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index c48cf3347573a..eff966e1f1616 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -36,6 +36,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/fleet/heter_context.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/framework/heter_util.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h" @@ -63,6 +64,7 @@ limitations under the License. */ #include "downpour_accessor.h" // NOLINT #endif #include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -96,6 +98,15 @@ class AfsWrapper { }; #endif +struct task_info { + std::shared_ptr build_values; + size_t offset; + int device_id; + int multi_mf_dim; + int start; + int end; +}; + class PSGPUWrapper { class DCacheBuffer { public: @@ -188,27 +199,45 @@ class PSGPUWrapper { int total_len, int* key2slot); + + void divide_to_device(std::shared_ptr gpu_task); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); void BuildPull(std::shared_ptr gpu_task); + void PrepareGPUTask(std::shared_ptr gpu_task); void LoadIntoMemory(bool is_shuffle); void BeginPass(); void EndPass(); + void add_key_to_local(const std::vector & keys); + void add_key_to_gputask(std::shared_ptr gpu_task); + void SparseTableToHbm(); + void HbmToSparseTable(); void start_build_thread(); void pre_build_thread(); + void build_pull_thread(); void build_task(); + void DumpToMem(); void Finalize() { VLOG(3) << "PSGPUWrapper Begin Finalize."; if (s_instance_ == nullptr) { return; } + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + this->EndPass(); + } + for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; + } data_ready_channel_->Close(); buildcpu_ready_channel_->Close(); + buildpull_ready_channel_->Close(); gpu_free_channel_->Close(); running_ = false; VLOG(3) << "begin stop pre_build_threads_"; pre_build_threads_.join(); + VLOG(3) << "begin stop buildpull_threads_"; + buildpull_threads_.join(); s_instance_ = nullptr; VLOG(3) << "PSGPUWrapper Finalize Finished."; HeterPs_->show_table_collisions(); @@ -278,9 +307,17 @@ class PSGPUWrapper { data_ready_channel_->SetCapacity(3); buildcpu_ready_channel_->Open(); buildcpu_ready_channel_->SetCapacity(3); + buildpull_ready_channel_->Open(); + buildpull_ready_channel_->SetCapacity(1); gpu_free_channel_->Open(); gpu_free_channel_->SetCapacity(1); + cpu_reday_channels_.resize(dev_ids.size()); + for (size_t i = 0; i < dev_ids.size(); i++) { + cpu_reday_channels_[i] = paddle::framework::MakeChannel(); + cpu_reday_channels_[i]->SetCapacity(16); + } + current_task_ = nullptr; gpu_free_channel_->Put(current_task_); @@ -378,6 +415,11 @@ class PSGPUWrapper { hbm_thread_pool_[i].reset(new ::ThreadPool(1)); } + cpu_work_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { + cpu_work_pool_[i].reset(new ::ThreadPool(16)); + } + auto sparse_table_accessor = sparse_table.accessor(); auto sparse_table_accessor_parameter = sparse_table_accessor.ctr_accessor_param(); @@ -589,6 +631,10 @@ class PSGPUWrapper { dim_index_map[index_dim_vec_[i]] = i; } hbm_pools_.resize(resource_->total_device() * num_of_dim); + for (size_t i = 0; i < hbm_pools_.size(); i++) { + hbm_pools_[i] = new HBMMemoryPoolFix(); + } + mem_pools_.resize(resource_->total_device() * num_of_dim); max_mf_dim_ = index_dim_vec_.back(); multi_mf_dim_ = (dim_index_map.size() >= 1) ? 
dim_index_map.size() : 0; @@ -686,6 +732,7 @@ class PSGPUWrapper { int month_; int day_; bool slot_info_initialized_ = false; + bool hbm_sparse_table_initialized_ = false; int use_afs_api_ = 0; int optimizer_type_ = 1; std::string accessor_class_; @@ -696,7 +743,7 @@ class PSGPUWrapper { #ifdef PADDLE_WITH_CUDA std::vector mem_pools_; - std::vector hbm_pools_; // in multi mfdim, one table need hbm + std::vector hbm_pools_; // in multi mfdim, one table need hbm // pools of totol dims number #endif @@ -712,11 +759,19 @@ class PSGPUWrapper { paddle::framework::ChannelObject>> gpu_free_channel_ = paddle::framework::MakeChannel>(); + std::shared_ptr< + paddle::framework::ChannelObject>> + buildpull_ready_channel_ = + paddle::framework::MakeChannel>(); + std::vector>> cpu_reday_channels_ ; std::shared_ptr current_task_ = nullptr; std::thread pre_build_threads_; + std::thread buildpull_threads_; bool running_ = false; std::vector> pull_thread_pool_; std::vector> hbm_thread_pool_; + std::vector> cpu_work_pool_; OptimizerConfig optimizer_config_; protected: diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index ceed8cb6bfa63..96a473be1aa8c 100755 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -48,6 +48,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, places_.push_back(place); } #endif + user_define_dump_filename_ = trainer_desc.user_define_dump_filename(); // get filelist from trainer_desc here const std::vector readers = dataset->GetReaders(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 27c7563fee840..d2ef1f238de8a 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -49,7 +49,12 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( : underlying_allocator_(underlying_allocator), alignment_(alignment), chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), - allow_free_idle_chunk_(allow_free_idle_chunk) {} + allow_free_idle_chunk_(allow_free_idle_chunk) { + total_alloc_times_ = 0; + total_alloc_size_ = 0; + total_free_times_ = 0; + total_free_size_ = 0; + } phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( size_t unaligned_size) { @@ -112,6 +117,8 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( VLOG(2) << "Not found and reallocate " << realloc_size << "(" << static_cast(p) << "), and remaining " << remaining_size; } + ++total_alloc_times_; + total_alloc_size_ += size; VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_; return new BlockAllocation(block_it); } @@ -126,6 +133,9 @@ void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) { auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; + total_free_times_ += 1; + total_free_size_ += block_it->size_; + block_it->is_free_ = true; if (block_it != blocks.begin()) { @@ -176,9 +186,28 @@ uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() { ++chunk_it; } } + + Trace(); return bytes; } +void AutoGrowthBestFitAllocator::Trace() const { + size_t cur_idle_bytes = 0; + auto it = free_blocks_.begin(); + for (; it != free_blocks_.end(); ++it) { + cur_idle_bytes += it->second->size_; + } + + VLOG(0) << "alloc:" << total_alloc_size_ / double(1024*1024) + << "m free:" << total_free_size_ / double(1024*1024) + << "m busy:" << 
(total_alloc_size_ - total_free_size_) / double(1024*1024) + << "m idle:" << cur_idle_bytes / double(1024*1024) + << "m alloc_times:" << total_alloc_times_ + << " free_times:" << total_free_times_ + << " free_blocks_num:" << free_blocks_.size() + << " curr_chunks_num:" << chunks_.size(); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index dadf751bdfa41..138f4a98c4db5 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -49,6 +49,7 @@ class AutoGrowthBestFitAllocator : public Allocator { private: uint64_t FreeIdleChunks(); + void Trace() const; template using List = std::list; @@ -93,6 +94,12 @@ class AutoGrowthBestFitAllocator : public Allocator { size_t chunk_size_; bool allow_free_idle_chunk_; + // stat info + size_t total_alloc_times_; + size_t total_alloc_size_; + size_t total_free_times_; + size_t total_free_size_; + SpinLock spinlock_; }; diff --git a/paddle/fluid/operators/shuffle_batch_op.cu b/paddle/fluid/operators/shuffle_batch_op.cu index 6b70b8d37d79c..19dc830cc430f 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cu +++ b/paddle/fluid/operators/shuffle_batch_op.cu @@ -27,6 +27,37 @@ namespace paddle { namespace operators { +struct CacheAllocator { + typedef char value_type; + CacheAllocator(platform::Place place) { + VLOG(2) << "construct allocator"; + place_ = place; + } + + ~CacheAllocator() { VLOG(2) << "destroy allocator"; } + + char *allocate(std::ptrdiff_t num_bytes) { + VLOG(2) << "allocate " << num_bytes << " bytes"; + auto storage = memory::AllocShared(place_, num_bytes); + char *ptr = reinterpret_cast(storage->ptr()); + busy_allocation_.emplace(std::make_pair(ptr, storage)); + return ptr; + } + + void deallocate(char *ptr, size_t) { + VLOG(2) << "deallocate "; + allocation_map_type::iterator iter = busy_allocation_.find(ptr); + CHECK(iter != busy_allocation_.end()); + busy_allocation_.erase(iter); + } + + private: + typedef std::unordered_map> + allocation_map_type; + allocation_map_type busy_allocation_; + platform::Place place_; +}; + template struct ReorderFunctor { ReorderFunctor(const T *x, const int64_t *shuffle_idx, T *y, int64_t stride) @@ -90,7 +121,8 @@ class ShuffleBatchCUDAKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); #ifdef PADDLE_WITH_CUDA - const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); + CacheAllocator allocator(ctx.GetPlace()); + const auto &exec_policy = thrust::cuda::par(allocator).on(dev_ctx.stream()); #else const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); #endif diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 4933cbb6cf74b..79bb41fd01f55 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -833,6 +833,18 @@ PADDLE_DEFINE_EXPORTED_bool( false, "It controls whether exit trainer when an worker has no ins."); +/** + * Distributed related FLAG + * Name: gpugraph_storage_mode + * Since Version: 2.2.0 + * Value Range: int32, default=1 + * Example: + * Note: represents the gpugraph storage mode, 1 for full hbm, 2 for hbm + mem + ssd. 
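+ * Example (illustrative addition, not part of the original change): exported
+ * Paddle flags are typically set through the environment, e.g.
+ *   export FLAGS_gpugraph_storage_mode=2   # select the hbm + mem + ssd mode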
+ */ +PADDLE_DEFINE_EXPORTED_int32(gpugraph_storage_mode, + 1, + "gpugraph storage mode, default 1"); + /** * KP kernel related FLAG * Name: FLAGS_run_kp_kernel @@ -968,7 +980,6 @@ PADDLE_DEFINE_EXPORTED_int32( PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm, true, "enable load_node_list_into_hbm, default true"); - /** * ProcessGroupNCCL related FLAG * Name: nccl_blocking_wait diff --git a/paddle/fluid/platform/monitor.cc b/paddle/fluid/platform/monitor.cc index ea6240b649cad..dd38ce7956309 100644 --- a/paddle/fluid/platform/monitor.cc +++ b/paddle/fluid/platform/monitor.cc @@ -19,6 +19,7 @@ namespace platform {} // namespace platform } // namespace paddle DEFINE_INT_STATUS(STAT_total_feasign_num_in_mem) +DEFINE_INT_STATUS(STAT_epoch_finish) DEFINE_INT_STATUS(STAT_gpu0_mem_size) DEFINE_INT_STATUS(STAT_gpu1_mem_size) DEFINE_INT_STATUS(STAT_gpu2_mem_size) diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index e902baa13532e..dc381e6a033e0 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -368,6 +368,9 @@ void BindDataset(py::module *m) { py::call_guard()) .def("set_gpu_graph_mode", &framework::Dataset::SetGpuGraphMode, + py::call_guard()) + .def("set_pass_id", + &framework::Dataset::SetPassId, py::call_guard()); py::class_(*m, "IterableDatasetWrapper") diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc old mode 100755 new mode 100644 index b11f5832d8c8a..db2ab38530a2c --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -64,6 +64,7 @@ void BindDistFleetWrapper(py::module* m) { .def("save_one_model", &FleetWrapper::SaveModelOneTable) .def("recv_and_save_model", &FleetWrapper::RecvAndSaveTable) .def("sparse_table_stat", &FleetWrapper::PrintTableStat) + .def("save_cache_table", &FleetWrapper::SaveCacheTable) .def("stop_server", &FleetWrapper::StopServer) .def("stop_worker", &FleetWrapper::FinalizeWorker) .def("barrier", &FleetWrapper::BarrierWithTable) @@ -371,7 +372,12 @@ void BindGraphGpuWrapper(py::module* m) { .def("set_up_types", &GraphGpuWrapper::set_up_types) .def("query_node_list", &GraphGpuWrapper::query_node_list) .def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf) - .def("load_edge_file", &GraphGpuWrapper::load_edge_file) + .def("load_edge_file", + py::overload_cast( + &GraphGpuWrapper::load_edge_file)) + .def("load_edge_file", + py::overload_cast( + &GraphGpuWrapper::load_edge_file)) .def("load_node_and_edge", &GraphGpuWrapper::load_node_and_edge) .def("upload_batch", py::overload_cast( @@ -395,7 +401,15 @@ void BindGraphGpuWrapper(py::module* m) { .def("get_partition", &GraphGpuWrapper::get_partition) .def("load_node_weight", &GraphGpuWrapper::load_node_weight) .def("export_partition_files", &GraphGpuWrapper::export_partition_files) - .def("load_node_file", &GraphGpuWrapper::load_node_file) + .def("load_node_file", + py::overload_cast( + &GraphGpuWrapper::load_node_file)) + .def("load_node_file", + py::overload_cast( + &GraphGpuWrapper::load_node_file)) + .def("release_graph", &GraphGpuWrapper::release_graph) + .def("release_graph_edge", &GraphGpuWrapper::release_graph_edge) + .def("release_graph_node", &GraphGpuWrapper::release_graph_node) .def("finalize", &GraphGpuWrapper::finalize); } #endif diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index e9c993d3ee128..4d7d17463e4fe 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ 
-64,6 +64,9 @@ void BindPSGPUWrapper(py::module* m) { .def("begin_pass", &framework::PSGPUWrapper::BeginPass, py::call_guard()) + .def("dump_to_mem", + &framework::PSGPUWrapper::DumpToMem, + py::call_guard()) .def("load_into_memory", &framework::PSGPUWrapper::LoadIntoMemory, py::call_guard()) diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 0cfb946d3d8ca..83f60a6e26b40 100755 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -88,3 +88,4 @@ shrink = fleet.shrink get_hybrid_communicate_group = fleet.get_hybrid_communicate_group distributed_scaler = fleet.distributed_scaler +save_cache_table = fleet.save_cache_table diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index d58770dd714ff..93acfb8042fbe 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -531,7 +531,9 @@ def fleet_desc_configs(self, configs): 'embed_sparse_beta2_decay_rate', 'embedx_sparse_optimizer', 'embedx_sparse_learning_rate', \ 'embedx_sparse_weight_bounds', 'embedx_sparse_initial_range', 'embedx_sparse_initial_g2sum', \ 'embedx_sparse_beta1_decay_rate', 'embedx_sparse_beta2_decay_rate', 'feature_learning_rate', 'nodeid_slot'] - support_sparse_table_class = ['DownpourSparseTable'] + support_sparse_table_class = [ + 'DownpourSparseTable', 'DownpourSparseSSDTable' + ] support_sparse_accessor_class = [ 'DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', @@ -623,9 +625,12 @@ def set_sparse_table_config(table_data, config): "DownpourSparseTable") if table_class not in support_sparse_table_class: raise ValueError( - "support sparse_table_class: ['DownpourSparseTable'], but actual %s" + "support sparse_table_class: ['DownpourSparseTable, DownpourSparseSSDTable'], but actual %s" % (table_class)) - table_data.table_class = 'MemorySparseTable' + if table_class == "DownpourSparseSSDTable": + table_data.table_class = 'SSDSparseTable' + else: + table_data.table_class = 'MemorySparseTable' table_data.shard_num = config.get('sparse_shard_num', 1000) accessor_class = config.get("sparse_accessor_class", diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 1a9b3f565b77a..4b9037795e067 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -906,6 +906,15 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): def save_cache_model(self, dirname, **configs): return self._runtime_handle._save_cache_model(dirname, **configs) + @is_non_distributed_check + @inited_runtime_handler + def save_cache_table(self, + table_id, + pass_id, + mem_cache_key_threshold=4000000000): + return self._runtime_handle._save_cache_table(table_id, pass_id, + mem_cache_key_threshold) + def shrink(self, threshold=None): self._runtime_handle._shrink(threshold) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index abf7eec73b8fe..5c7c1b11a27a7 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1420,6 +1420,12 @@ def _save_cache_model(self, dirname, **kwargs): fleet.util.barrier() return feasign_num + def _save_cache_table(self, table_id, pass_id, 
mem_cache_key_threshold): + if self.role_maker._is_first_worker(): + self._worker.save_cache_table(table_id, pass_id, + mem_cache_key_threshold) + fleet.util.barrier() + def _load_sparse_params(self, dirname, context, main_program, mode): distributed_varnames = get_sparse_tablenames(self.origin_main_programs, True) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 9fba7bb70f189..d84d66741344d 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -388,6 +388,7 @@ def __init__(self): self.merge_by_lineid = False self.fleet_send_sleep_seconds = None self.trainer_num = -1 + self.pass_id = 0 @deprecated(since="2.0.0", update_to="paddle.distributed.InMemoryDataset._set_feed_type") @@ -1080,8 +1081,25 @@ def set_graph_config(self, config): self.proto_desc.graph_config.meta_path = config.get("meta_path", "") self.proto_desc.graph_config.gpu_graph_training = config.get( "gpu_graph_training", True) + self.proto_desc.graph_config.train_table_cap = config.get( + "train_table_cap", 800000) + self.proto_desc.graph_config.infer_table_cap = config.get( + "infer_table_cap", 800000) self.dataset.set_gpu_graph_mode(True) + def set_pass_id(self, pass_id): + """ + set_pass_id + """ + self.pass_id = pass_id + self.dataset.set_pass_id(pass_id) + + def get_pass_id(self): + """ + get_pass_id + """ + return self.pass_id + class QueueDataset(DatasetBase): """ diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index 3ba9f9eea46d1..945b28aac88de 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -73,6 +73,9 @@ def _create_trainer(self, opt_info=None): if opt_info.get("dump_fields_path") is not None and len( opt_info.get("dump_fields_path")) != 0: trainer._set_dump_fields_path(opt_info["dump_fields_path"]) + if opt_info.get("user_define_dump_filename") is not None and len( + opt_info.get("user_define_dump_filename")) != 0: + trainer._set_user_define_dump_filename(opt_info["user_define_dump_filename"]) if opt_info.get("dump_file_num") is not None: trainer._set_dump_file_num(opt_info["dump_file_num"]) if opt_info.get("dump_converter") is not None: