Reduce the buffer when using high dimensional data in distributed mode. #2485

Merged: 17 commits, Oct 15, 2019
src/io/dataset_loader.cpp (33 changes: 31 additions & 2 deletions)
@@ -594,7 +594,22 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,

   const data_size_t filter_cnt = static_cast<data_size_t>(
       static_cast<double>(config_.min_data_in_leaf * total_sample_size) / num_data);
-  if (Network::num_machines() == 1) {
+
+  bool force_findbin_in_single_machine = false;
+  if (Network::num_machines() > 1) {
+    int total_num_feature = Network::GlobalSyncUpByMin(num_col);
+    size_t esimate_sync_size = BinMapper::SizeForSpecificBin(config_.max_bin) * total_num_feature;
Review comment: This will still lead to overflow; the operands need to be cast before the product is assigned to the wider type (a standalone illustration follows this hunk).

Suggested change:
-    size_t esimate_sync_size = BinMapper::SizeForSpecificBin(config_.max_bin) * total_num_feature;
+    size_t esimate_sync_size = (size_t)BinMapper::SizeForSpecificBin(config_.max_bin) * (size_t)total_num_feature;

+    const size_t max_buf_size = 2 << 31;
+    if (esimate_sync_size >= max_buf_size) {
+      if (config_.pre_partition) {
+        Log::Warning("Too many features for distributed model, it is better to pass categorical feature directly instead of sparse high dimensional feature vectors.");
+      } else {
+        force_findbin_in_single_machine = true;
+      }
+    }
+  }
+
+  if (Network::num_machines() == 1 || force_findbin_in_single_machine) {
     // if only one machine, find bin locally
     OMP_INIT_EX();
     #pragma omp parallel for schedule(guided)
@@ -933,8 +948,22 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
   const data_size_t filter_cnt = static_cast<data_size_t>(
       static_cast<double>(config_.min_data_in_leaf* sample_data.size()) / dataset->num_data_);

+  bool force_findbin_in_single_machine = false;
+  if (Network::num_machines() > 1) {
+    int total_num_feature = Network::GlobalSyncUpByMin(dataset->num_total_features_);
+    size_t esimate_sync_size = BinMapper::SizeForSpecificBin(config_.max_bin) * total_num_feature;
Review comment: Same as above; casting the operands avoids the overflow.

Suggested change:
-    size_t esimate_sync_size = BinMapper::SizeForSpecificBin(config_.max_bin) * total_num_feature;
+    size_t esimate_sync_size = (size_t)BinMapper::SizeForSpecificBin(config_.max_bin) * (size_t)total_num_feature;

+    const size_t max_buf_size = 2 << 31;
+    if (esimate_sync_size >= max_buf_size) {
+      if (config_.pre_partition) {
+        Log::Warning("Too many features for distributed model, it is better to pass categorical feature directly instead of sparse high dimensional feature vectors.");
+      } else {
+        force_findbin_in_single_machine = true;
+      }
+    }
+  }

   // start find bins
-  if (num_machines == 1) {
+  if (num_machines == 1 || force_findbin_in_single_machine) {
     // if only one machine, find bin locally
     OMP_INIT_EX();
     #pragma omp parallel for schedule(guided)