diff --git a/doc/env_var.md b/doc/env_var.md
index d1467d535f14..16a9ee4ff3bc 100644
--- a/doc/env_var.md
+++ b/doc/env_var.md
@@ -16,6 +16,10 @@ Usually you do not need to change these settings, but they are listed here for r
 * MXNET_EXEC_MATCH_RANGE (default=10)
   - The rough matching scale in symbolic execution memory allocator.
   - Set this to 0 if we do not want to enable memory sharing between graph nodes(for debug purpose).
+* MXNET_EXEC_NUM_TEMP (default=4)
+  - The maximum number of temporary workspaces to allocate to each device.
+  - Setting this to a small number can save GPU memory.
+  - It will also likely decrease the level of parallelism, which is usually acceptable.
 * MXNET_ENGINE_TYPE (default=ThreadedEnginePerDevice)
   - The type of underlying execution engine of MXNet.
   - List of choices
@@ -27,3 +31,15 @@ Usually you do not need to change these settings, but they are listed here for r
 * MXNET_KVSTORE_BIGARRAY_BOUND (default=1e6)
   - The minimum size of "big array".
   - When the array size is bigger than this threshold, MXNET_KVSTORE_REDUCTION_NTHREADS threads will be used for reduction.
+
+Settings for Minimum Memory Usage
+---------------------------------
+- Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
+  - The default setting satisfies this.
+
+Settings for More GPU Parallelism
+---------------------------------
+- Set ```MXNET_GPU_WORKER_NTHREADS``` to a larger number (e.g. 2).
+  - You may want to set ```MXNET_EXEC_NUM_TEMP``` to a smaller value to reduce memory usage.
+- This may not speed things up, as the GPU can already be fully occupied by serialized jobs.
+
diff --git a/src/common/utils.h b/src/common/utils.h
index 24b67f41aabf..fbaf5f4fdb55 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <algorithm>
 #endif  // DMLC_USE_CXX11
 
 #include
@@ -27,6 +28,14 @@ inline int GetNumThreadPerGPU() {
   return dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", 1);
 }
 
+// heuristic to get number of matching colors.
+// this decides how much parallelism we can get in each GPU.
+inline int GetExecNumMatchColor() {
+  // This is the resource-efficient option.
+  int num_match_color = dmlc::GetEnv("MXNET_EXEC_NUM_TEMP", 4);
+  return std::min(num_match_color, GetNumThreadPerGPU());
+}
+
 /*!
  * \brief Random Engine
  */
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index aa98f8ac57b6..9c2bed3071b3 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -456,8 +456,6 @@ void GraphExecutor::InitDataEntryMemory() {
 }
 
 void GraphExecutor::InitResources() {
-  // maximum amount of color allowed in coloring algorithm
-  const uint32_t kMaxNumColor = 8;
   // prepare for temp space allocation
   std::vector<uint32_t> req_temp_cnt(topo_order_.size(), 0);
   for (size_t i = 0; i < topo_order_.size(); ++i) {
@@ -471,9 +469,8 @@ void GraphExecutor::InitResources() {
     CHECK_LE(cnt, 1) << "Node can only have one temp space request";
     req_temp_cnt[nid] = cnt;
   }
-  // restrict allocation to maximum number of parallelism per device
-  uint32_t num_color = std::min(static_cast<uint32_t>(common::GetNumThreadPerGPU()),
-                                kMaxNumColor);
+
+  uint32_t num_color = static_cast<uint32_t>(common::GetExecNumMatchColor());
   std::vector<uint32_t> req_temp_color;
   // use graph coloring to find node that won't run in parallel
   num_color = graph::ColorNodeGroup(graph_, topo_order_, req_temp_cnt,
diff --git a/src/symbol/graph_memory_allocator.h b/src/symbol/graph_memory_allocator.h
index c969fc1aaa86..dba317fbd376 100644
--- a/src/symbol/graph_memory_allocator.h
+++ b/src/symbol/graph_memory_allocator.h
@@ -119,9 +119,7 @@ GraphStorageAllocator::GraphStorageAllocator(
   // if we set this to 1, this means no color based match.
   // color based match will cost a bit more memory usually
   // but also enables more parallelization.
-  num_match_color_ = dmlc::GetEnv("MXNET_EXEC_MATCH_NUM_COLOR", 4);
-  num_match_color_ = std::min(static_cast<size_t>(common::GetNumThreadPerGPU()),
-                              num_match_color_);
+  num_match_color_ = static_cast<size_t>(common::GetExecNumMatchColor());
   this->InitColor(topo_order);
 }
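
For reference, here is a minimal standalone sketch (not part of the patch) of the heuristic this diff centralizes in `common::GetExecNumMatchColor()`. It uses `std::getenv` instead of `dmlc::GetEnv` so it compiles without dmlc-core; the variable names and the `main` driver are illustrative only. With the defaults it yields `min(4, 1) = 1`, the minimum-memory setting described in `doc/env_var.md`; raising `MXNET_GPU_WORKER_NTHREADS` to 2 raises the number of match colors to 2 unless `MXNET_EXEC_NUM_TEMP` is lowered.

```cpp
// Standalone sketch of the GetExecNumMatchColor() heuristic introduced in this diff.
// Assumption: std::getenv + std::atoi stand in for dmlc::GetEnv.
#include <algorithm>
#include <cstdlib>
#include <iostream>

// Read an integer environment variable, falling back to a default when unset.
int GetEnvInt(const char* name, int default_value) {
  const char* val = std::getenv(name);
  return val == nullptr ? default_value : std::atoi(val);
}

int main() {
  // MXNET_GPU_WORKER_NTHREADS defaults to 1, MXNET_EXEC_NUM_TEMP defaults to 4,
  // so by default the number of match colors is min(4, 1) = 1.
  int num_threads = GetEnvInt("MXNET_GPU_WORKER_NTHREADS", 1);
  int num_temp = GetEnvInt("MXNET_EXEC_NUM_TEMP", 4);
  int num_match_color = std::min(num_temp, num_threads);
  std::cout << "num_match_color = " << num_match_color << std::endl;
  return 0;
}
```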