diff --git a/doc/env_var.md b/doc/env_var.md
index d1467d535f14..16a9ee4ff3bc 100644
--- a/doc/env_var.md
+++ b/doc/env_var.md
@@ -16,6 +16,10 @@ Usually you do not need to change these settings, but they are listed here for r
 * MXNET_EXEC_MATCH_RANGE (default=10)
   - The rough matching scale in symbolic execution memory allocator.
   - Set this to 0 if we do not want to enable memory sharing between graph nodes(for debug purpose).
+* MXNET_EXEC_NUM_TEMP (default=4)
+  - The maximum number of temporary workspaces to allocate to each device.
+  - Setting this to a small number can save GPU memory.
+  - It will also likely decrease the level of parallelism, which is usually acceptable.
 * MXNET_ENGINE_TYPE (default=ThreadedEnginePerDevice)
   - The type of underlying execution engine of MXNet.
   - List of choices
@@ -27,3 +31,15 @@ Usually you do not need to change these settings, but they are listed here for r
 * MXNET_KVSTORE_BIGARRAY_BOUND (default=1e6)
   - The minimum size of "big array".
   - When the array size is bigger than this threshold, MXNET_KVSTORE_REDUCTION_NTHREADS threads will be used for reduction.
+
+Settings for Minimum Memory Usage
+---------------------------------
+- Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
+  - The default setting satisfies this.
+
+Settings for More GPU Parallelism
+---------------------------------
+- Set ```MXNET_GPU_WORKER_NTHREADS``` to a larger number (e.g. 2).
+  - You may want to set ```MXNET_EXEC_NUM_TEMP``` to a smaller value to reduce memory usage.
+- This may not speed things up, as the GPU can already be fully occupied by serialized jobs.
+
diff --git a/src/common/utils.h b/src/common/utils.h
index 24b67f41aabf..fbaf5f4fdb55 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <algorithm>
 #endif  // DMLC_USE_CXX11
 
 #include
@@ -27,6 +28,14 @@ inline int GetNumThreadPerGPU() {
   return dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", 1);
 }
 
+// heuristic to get number of matching colors.
+// this decides how much parallelism we can get in each GPU.
+inline int GetExecNumMatchColor() {
+  // This is the resource-efficient option.
+  int num_match_color = dmlc::GetEnv("MXNET_EXEC_NUM_TEMP", 4);
+  return std::min(num_match_color, GetNumThreadPerGPU());
+}
+
 /*!
  * \brief Random Engine
  */
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index aa98f8ac57b6..9c2bed3071b3 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -456,8 +456,6 @@ void GraphExecutor::InitDataEntryMemory() {
 }
 
 void GraphExecutor::InitResources() {
-  // maximum amount of color allowed in coloring algorithm
-  const uint32_t kMaxNumColor = 8;
   // prepare for temp space allocation
   std::vector<uint32_t> req_temp_cnt(topo_order_.size(), 0);
   for (size_t i = 0; i < topo_order_.size(); ++i) {
@@ -471,9 +469,8 @@ void GraphExecutor::InitResources() {
     CHECK_LE(cnt, 1) << "Node can only have one temp space request";
     req_temp_cnt[nid] = cnt;
   }
-  // restrict allocation to maximum number of parallelism per device
-  uint32_t num_color = std::min(static_cast<uint32_t>(common::GetNumThreadPerGPU()),
-                                kMaxNumColor);
+
+  uint32_t num_color = static_cast<uint32_t>(common::GetExecNumMatchColor());
   std::vector<uint32_t> req_temp_color;
   // use graph coloring to find node that won't run in parallel
   num_color = graph::ColorNodeGroup(graph_, topo_order_, req_temp_cnt,
diff --git a/src/symbol/graph_memory_allocator.h b/src/symbol/graph_memory_allocator.h
index c969fc1aaa86..dba317fbd376 100644
--- a/src/symbol/graph_memory_allocator.h
+++ b/src/symbol/graph_memory_allocator.h
@@ -119,9 +119,7 @@ GraphStorageAllocator::GraphStorageAllocator(
   // if we set this to 1, this means no color based match.
   // color based match will cost a bit more memory usually
   // but also enables more parallelization.
-  num_match_color_ = dmlc::GetEnv("MXNET_EXEC_MATCH_NUM_COLOR", 4);
-  num_match_color_ = std::min(static_cast<size_t>(common::GetNumThreadPerGPU()),
-                              num_match_color_);
+  num_match_color_ = static_cast<size_t>(common::GetExecNumMatchColor());
   this->InitColor(topo_order);
 }
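
For reference, here is a minimal standalone sketch (not part of the patch) of the heuristic this diff centralizes in `common::GetExecNumMatchColor()`. It uses `std::getenv` instead of `dmlc::GetEnv` so it compiles without dmlc-core; the variable names and the `main` driver are illustrative only. With the defaults it yields `min(4, 1) = 1`, the minimum-memory setting described in `doc/env_var.md`; raising `MXNET_GPU_WORKER_NTHREADS` to 2 raises the number of match colors to 2 unless `MXNET_EXEC_NUM_TEMP` is lowered.

```cpp
// Standalone sketch of the GetExecNumMatchColor() heuristic introduced in this diff.
// Assumption: std::getenv + std::atoi stand in for dmlc::GetEnv.
#include <algorithm>
#include <cstdlib>
#include <iostream>

// Read an integer environment variable, falling back to a default when unset.
int GetEnvInt(const char* name, int default_value) {
  const char* val = std::getenv(name);
  return val == nullptr ? default_value : std::atoi(val);
}

int main() {
  // MXNET_GPU_WORKER_NTHREADS defaults to 1, MXNET_EXEC_NUM_TEMP defaults to 4,
  // so by default the number of match colors is min(4, 1) = 1.
  int num_threads = GetEnvInt("MXNET_GPU_WORKER_NTHREADS", 1);
  int num_temp = GetEnvInt("MXNET_EXEC_NUM_TEMP", 4);
  int num_match_color = std::min(num_temp, num_threads);
  std::cout << "num_match_color = " << num_match_color << std::endl;
  return 0;
}
```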