diff --git a/cmake/external/jemalloc.cmake b/cmake/external/jemalloc.cmake new file mode 100644 index 0000000000000..efce686b20929 --- /dev/null +++ b/cmake/external/jemalloc.cmake @@ -0,0 +1,35 @@ +include(ExternalProject) + +set(JEMALLOC_PROJECT "extern_jemalloc") +set(JEMALLOC_URL + https://github.com/jemalloc/jemalloc/releases/download/5.1.0/jemalloc-5.1.0.tar.bz2 +) +set(JEMALLOC_BUILD ${THIRD_PARTY_PATH}/jemalloc/src/extern_jemalloc) +set(JEMALLOC_SOURCE_DIR "${THIRD_PARTY_PATH}/jemalloc") +set(JEMALLOC_INSTALL ${THIRD_PARTY_PATH}/install/jemalloc) +set(JEMALLOC_INCLUDE_DIR ${JEMALLOC_INSTALL}/include) +set(JEMALLOC_DOWNLOAD_DIR "${JEMALLOC_SOURCE_DIR}/src/${JEMALLOC_PROJECT}") + +set(JEMALLOC_STATIC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) +set(JEMALLOC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) + +ExternalProject_Add( + extern_jemalloc + PREFIX ${JEMALLOC_SOURCE_DIR} + URL ${JEMALLOC_URL} + INSTALL_DIR ${JEMALLOC_INSTALL} + DOWNLOAD_DIR "${JEMALLOC_DOWNLOAD_DIR}" + BUILD_COMMAND $(MAKE) + BUILD_IN_SOURCE 1 + INSTALL_COMMAND $(MAKE) install + CONFIGURE_COMMAND "${JEMALLOC_DOWNLOAD_DIR}/configure" + --prefix=${JEMALLOC_INSTALL} --disable-initial-exec-tls) + +add_library(jemalloc STATIC IMPORTED GLOBAL) +set_property(TARGET jemalloc PROPERTY IMPORTED_LOCATION + ${JEMALLOC_STATIC_LIBRARIES}) + +include_directories(${JEMALLOC_INCLUDE_DIR}) +add_dependencies(jemalloc extern_jemalloc) diff --git a/cmake/external/rocksdb.cmake b/cmake/external/rocksdb.cmake index 41a1916dc3308..0084247461b74 100644 --- a/cmake/external/rocksdb.cmake +++ b/cmake/external/rocksdb.cmake @@ -14,6 +14,13 @@ include(ExternalProject) +# find_package(jemalloc REQUIRED) + +set(JEMALLOC_INCLUDE_DIR ${THIRD_PARTY_PATH}/install/jemalloc/include) +set(JEMALLOC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) +message(STATUS "rocksdb jemalloc:" ${JEMALLOC_LIBRARIES}) + set(ROCKSDB_PREFIX_DIR ${THIRD_PARTY_PATH}/rocksdb) set(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb) set(ROCKSDB_INCLUDE_DIR @@ -22,22 +29,41 @@ set(ROCKSDB_INCLUDE_DIR set(ROCKSDB_LIBRARIES "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a" CACHE FILEPATH "rocksdb library." 
FORCE) -set(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +set(ROCKSDB_COMMON_FLAGS + "-g -pipe -O2 -W -Wall -Wno-unused-parameter -fPIC -fno-builtin-memcmp -fno-omit-frame-pointer" +) +set(ROCKSDB_FLAGS + "-DNDEBUG -DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DOS_LINUX -DROCKSDB_FALLOCATE_PRESENT -DHAVE_SSE42 -DHAVE_PCLMUL -DZLIB -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX -DROCKSDB_BACKTRACE -DROCKSDB_SUPPORT_THREAD_LOCAL -DROCKSDB_USE_RTTI -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_AUXV_GETAUXVAL_PRESENT" +) +set(ROCKSDB_CMAKE_CXX_FLAGS + "${ROCKSDB_COMMON_FLAGS} -DROCKSDB_LIBAIO_PRESENT -msse -msse4.2 -mpclmul ${ROCKSDB_FLAGS} -fPIC -I${JEMALLOC_INCLUDE_DIR}" +) +set(ROCKSDB_CMAKE_C_FLAGS + "${ROCKSDB_COMMON_FLAGS} ${ROCKSDB_FLAGS} -DROCKSDB_LIBAIO_PRESENT -fPIC -I${JEMALLOC_INCLUDE_DIR}" +) include_directories(${ROCKSDB_INCLUDE_DIR}) +set(CMAKE_CXX_LINK_EXECUTABLE + "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz") ExternalProject_Add( extern_rocksdb ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${ROCKSDB_PREFIX_DIR} - GIT_REPOSITORY "https://github.com/facebook/rocksdb" - GIT_TAG v6.10.1 + GIT_REPOSITORY "https://github.com/Thunderbrook/rocksdb" + GIT_TAG 6.19.fb UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DWITH_BZ2=OFF -DWITH_GFLAGS=OFF + -DWITH_TESTS=OFF + -DWITH_JEMALLOC=ON + -DWITH_BENCHMARK_TOOLS=OFF + -DJeMalloc_LIBRARIES=${JEMALLOC_LIBRARIES} + -DJeMalloc_INCLUDE_DIRS=${JEMALLOC_INCLUDE_DIR} -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS=${ROCKSDB_CMAKE_C_FLAGS} + -DCMAKE_CXX_LINK_EXECUTABLE=${CMAKE_CXX_LINK_EXECUTABLE} # BUILD_BYPRODUCTS ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a INSTALL_COMMAND mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/ && cp diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 94fb1b4d838f9..5455ddadfdea4 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -422,6 +422,9 @@ if(WITH_PSCORE) include(external/rocksdb) # download, build, install rocksdb list(APPEND third_party_deps extern_rocksdb) + + include(external/jemalloc) # download, build, install jemalloc + list(APPEND third_party_deps extern_jemalloc) endif() if(WITH_XBYAK) diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 5654669d76fdb..74f946b2253aa 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -148,10 +148,12 @@ class PSClient { return fut; } - virtual ::std::future PullSparsePtr(char **select_values, + virtual ::std::future PullSparsePtr(int shard_id, + char **select_values, size_t table_id, const uint64_t *keys, - size_t num) { + size_t num, + uint16_t pass_id) { VLOG(0) << "Did not implement"; std::promise promise; std::future fut = promise.get_future(); @@ -160,6 +162,15 @@ class PSClient { } virtual std::future PrintTableStat(uint32_t table_id) = 0; + virtual std::future SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold) { + VLOG(0) << "Did not implement"; + std::promise promise; + std::future fut = promise.get_future(); + promise.set_value(-1); + return fut; + } // 确保所有积攒中的请求都发起发送 virtual std::future Flush() = 0; diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc index e8bf426710bc3..5466e9cd95bd0 100644 --- 
a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -260,10 +260,12 @@ ::std::future PsLocalClient::PushDense(const Region* regions, // return done(); //} -::std::future PsLocalClient::PullSparsePtr(char** select_values, +::std::future PsLocalClient::PullSparsePtr(int shard_id, + char** select_values, size_t table_id, const uint64_t* keys, - size_t num) { + size_t num, + uint16_t pass_id) { // FIXME // auto timer = // std::make_shared("pslib_downpour_client_pull_sparse"); @@ -278,6 +280,8 @@ ::std::future PsLocalClient::PullSparsePtr(char** select_values, table_context.pull_context.ptr_values = select_values; table_context.use_ptr = true; table_context.num = num; + table_context.shard_id = shard_id; + table_context.pass_id = pass_id; // table_ptr->PullSparsePtr(select_values, keys, num); table_ptr->Pull(table_context); @@ -285,6 +289,28 @@ ::std::future PsLocalClient::PullSparsePtr(char** select_values, return done(); } +::std::future PsLocalClient::PrintTableStat(uint32_t table_id) { + auto* table_ptr = GetTable(table_id); + std::pair ret = table_ptr->PrintTableStat(); + VLOG(0) << "table id: " << table_id << ", feasign size: " << ret.first + << ", mf size: " << ret.second; + return done(); +} + +::std::future PsLocalClient::SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold) { + auto* table_ptr = GetTable(table_id); + std::pair ret = table_ptr->PrintTableStat(); + VLOG(0) << "table id: " << table_id << ", feasign size: " << ret.first + << ", mf size: " << ret.second; + if (ret.first > threshold) { + VLOG(0) << "run cache table"; + table_ptr->CacheTable(pass_id); + } + return done(); +} + ::std::future PsLocalClient::PushSparseRawGradient( size_t table_id, const uint64_t* keys, diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h index 593805547af84..583ea8052eb01 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -76,18 +76,19 @@ class PsLocalClient : public PSClient { return fut; } - virtual ::std::future PullSparsePtr(char** select_values, + virtual ::std::future PullSparsePtr(int shard_id, + char** select_values, size_t table_id, const uint64_t* keys, - size_t num); + size_t num, + uint16_t pass_id); - virtual ::std::future PrintTableStat(uint32_t table_id) { - std::promise prom; - std::future fut = prom.get_future(); - prom.set_value(0); + virtual ::std::future PrintTableStat(uint32_t table_id); + + virtual ::std::future SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold); - return fut; - } virtual ::std::future PushSparse(size_t table_id, const uint64_t* keys, const float** update_values, diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index b55c77bf52d84..9f6baf3189fb8 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -162,6 +162,15 @@ class ValueAccessor { return 0; } + virtual bool SaveMemCache(float* value, + int param, + double global_cache_threshold, + uint16_t pass_id) { + return true; + } + + virtual void UpdatePassId(float* value, uint16_t pass_id) {} + virtual float GetField(float* value, const std::string& name) { return 0.0; } #define DEFINE_GET_INDEX(class, field) \ virtual int get_##field##_index() override { return class ::field##_index(); } diff --git 
a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 08bef6261cf34..995eafcdd61eb 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -24,6 +24,7 @@ #include "gflags/gflags.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/platform/timer.h" @@ -32,6 +33,7 @@ DECLARE_bool(graph_load_in_parallel); DECLARE_bool(graph_get_neighbor_id); +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace distributed { @@ -311,7 +313,7 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, std::string str; if (_db->get(i, ch, sizeof(int) * 2 + sizeof(uint64_t), str) == 0) { count[i] += (int64_t)str.size(); - for (size_t j = 0; j < (int)str.size(); j += sizeof(uint64_t)) { + for (size_t j = 0; j < str.size(); j += sizeof(uint64_t)) { uint64_t id = *(uint64_t *)(str.c_str() + j); add_comm_edge(idx, v, id); } @@ -381,7 +383,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { score[i] = 0; } } - for (size_t j = 0; j < (int)value.size(); j += sizeof(uint64_t)) { + for (size_t j = 0; j < value.size(); j += sizeof(uint64_t)) { uint64_t v = *((uint64_t *)(value.c_str() + j)); int index = -1; if (id_map.find(v) != id_map.end()) { @@ -472,6 +474,71 @@ void GraphTable::clear_graph(int idx) { edge_shards[idx].push_back(new GraphShard()); } } + +void GraphTable::release_graph() { + // Before releasing graph, prepare for sampling ids and embedding keys. 
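+  // Note (added comment): the type keys (and, in WHOLE_HBM storage mode, the
+  // total keys) must be collected before clear_graph() below frees the edge
+  // and feature shards, otherwise the ids needed later for sampling and
+  // embedding pulls would be lost.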
+ build_graph_type_keys(); + + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::WHOLE_HBM) { + build_graph_total_keys(); + } + // clear graph + clear_graph(); +} + +void GraphTable::release_graph_edge() { + build_graph_total_keys(); + clear_edge_shard(); +} + +void GraphTable::release_graph_node() { + build_graph_type_keys(); + clear_feature_shard(); +} + +void GraphTable::clear_edge_shard() { + VLOG(0) << "begin clear edge shard"; + std::vector> tasks; + for (auto &type_shards : edge_shards) { + for (auto &shard : type_shards) { + tasks.push_back( + load_node_edge_task_pool->enqueue([&shard, this]() -> int { + delete shard; + return 0; + })); + } + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + for (auto &shards : edge_shards) shards.clear(); + edge_shards.clear(); + VLOG(0) << "finish clear edge shard"; +} + +void GraphTable::clear_feature_shard() { + VLOG(0) << "begin clear feature shard"; + std::vector> tasks; + for (auto &type_shards : feature_shards) { + for (auto &shard : type_shards) { + tasks.push_back( + load_node_edge_task_pool->enqueue([&shard, this]() -> int { + delete shard; + return 0; + })); + } + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + for (auto &shards : feature_shards) shards.clear(); + feature_shards.clear(); + VLOG(0) << "finish clear feature shard"; +} + +void GraphTable::clear_graph() { + VLOG(0) << "begin clear_graph"; + clear_edge_shard(); + clear_feature_shard(); + VLOG(0) << "finish clear_graph"; +} + int32_t GraphTable::load_next_partition(int idx) { if (next_partition >= (int)partitions[idx].size()) { VLOG(0) << "partition iteration is done"; @@ -538,7 +605,7 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) { std::vector &v = shards[i]->get_bucket(); for (size_t j = 0; j < v.size(); j++) { std::vector s; - for (size_t k = 0; k < (int)v[j]->get_neighbor_size(); k++) { + for (size_t k = 0; k < v[j]->get_neighbor_size(); k++) { s.push_back(v[j]->get_neighbor_id(k)); } cost += v[j]->get_neighbor_size() * sizeof(uint64_t); @@ -1037,21 +1104,7 @@ Node *GraphShard::find_node(uint64_t id) { return iter == node_location.end() ? 
nullptr : bucket[iter->second]; } -GraphTable::~GraphTable() { - for (int i = 0; i < (int)edge_shards.size(); i++) { - for (auto p : edge_shards[i]) { - delete p; - } - edge_shards[i].clear(); - } - - for (int i = 0; i < (int)feature_shards.size(); i++) { - for (auto p : feature_shards[i]) { - delete p; - } - feature_shards[i].clear(); - } -} +GraphTable::~GraphTable() { clear_graph(); } int32_t GraphTable::Load(const std::string &path, const std::string ¶m) { bool load_edge = (param[0] == 'e'); @@ -1079,14 +1132,133 @@ std::string GraphTable::get_inverse_etype(std::string &etype) { return res; } -int32_t GraphTable::load_node_and_edge_file(std::string etype, - std::string ntype, - std::string epath, - std::string npath, +int32_t GraphTable::parse_type_to_typepath( + std::string &type2files, + std::string graph_data_local_path, + std::vector &res_type, + std::unordered_map &res_type2path) { + auto type2files_split = + paddle::string::split_string(type2files, ","); + if (type2files_split.size() == 0) { + return -1; + } + for (auto one_type2file : type2files_split) { + auto one_type2file_split = + paddle::string::split_string(one_type2file, ":"); + auto type = one_type2file_split[0]; + auto type_dir = one_type2file_split[1]; + res_type.push_back(type); + res_type2path[type] = graph_data_local_path + "/" + type_dir; + } + return 0; +} + +int32_t GraphTable::parse_edge_and_load(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse) { + std::vector etypes; + std::unordered_map edge_to_edgedir; + int res = parse_type_to_typepath( + etype2files, graph_data_local_path, etypes, edge_to_edgedir); + if (res != 0) { + VLOG(0) << "parse edge type and edgedir failed!"; + return -1; + } + VLOG(0) << "etypes size: " << etypes.size(); + VLOG(0) << "whether reverse: " << reverse; + is_load_reverse_edge = reverse; + std::string delim = ";"; + size_t total_len = etypes.size(); + + std::vector> tasks; + for (size_t i = 0; i < total_len; i++) { + tasks.push_back( + _shards_task_pool[i % task_pool_size_]->enqueue([&, i, this]() -> int { + std::string etype_path = edge_to_edgedir[etypes[i]]; + auto etype_path_list = paddle::framework::localfs_list(etype_path); + std::string etype_path_str; + if (part_num > 0 && part_num < (int)etype_path_list.size()) { + std::vector sub_etype_path_list( + etype_path_list.begin(), etype_path_list.begin() + part_num); + etype_path_str = + paddle::string::join_strings(sub_etype_path_list, delim); + } else { + etype_path_str = + paddle::string::join_strings(etype_path_list, delim); + } + this->load_edges(etype_path_str, false, etypes[i]); + if (reverse) { + std::string r_etype = get_inverse_etype(etypes[i]); + this->load_edges(etype_path_str, true, r_etype); + } + return 0; + })); + } + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); + return 0; +} + +int32_t GraphTable::parse_node_and_load(std::string ntype2files, + std::string graph_data_local_path, + int part_num) { + std::vector ntypes; + std::unordered_map node_to_nodedir; + int res = parse_type_to_typepath( + ntype2files, graph_data_local_path, ntypes, node_to_nodedir); + if (res != 0) { + VLOG(0) << "parse node type and nodedir failed!"; + return -1; + } + + std::string delim = ";"; + std::string npath = node_to_nodedir[ntypes[0]]; + auto npath_list = paddle::framework::localfs_list(npath); + std::string npath_str; + if (part_num > 0 && part_num < (int)npath_list.size()) { + std::vector sub_npath_list( + npath_list.begin(), npath_list.begin() + part_num); + npath_str = 
paddle::string::join_strings(sub_npath_list, delim); + } else { + npath_str = paddle::string::join_strings(npath_list, delim); + } + if (ntypes.size() == 0) { + VLOG(0) << "node_type not specified, nothing will be loaded "; + return 0; + } + + if (FLAGS_graph_load_in_parallel) { + this->load_nodes(npath_str, ""); + } else { + for (size_t j = 0; j < ntypes.size(); j++) { + this->load_nodes(npath_str, ntypes[j]); + } + } + return 0; +} + +int32_t GraphTable::load_node_and_edge_file(std::string etype2files, + std::string ntype2files, + std::string graph_data_local_path, int part_num, bool reverse) { - auto etypes = paddle::string::split_string(etype, ","); - auto ntypes = paddle::string::split_string(ntype, ","); + std::vector etypes; + std::unordered_map edge_to_edgedir; + int res = parse_type_to_typepath( + etype2files, graph_data_local_path, etypes, edge_to_edgedir); + if (res != 0) { + VLOG(0) << "parse edge type and edgedir failed!"; + return -1; + } + std::vector ntypes; + std::unordered_map node_to_nodedir; + res = parse_type_to_typepath( + ntype2files, graph_data_local_path, ntypes, node_to_nodedir); + if (res != 0) { + VLOG(0) << "parse node type and nodedir failed!"; + return -1; + } + VLOG(0) << "etypes size: " << etypes.size(); VLOG(0) << "whether reverse: " << reverse; is_load_reverse_edge = reverse; @@ -1098,7 +1270,7 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype, tasks.push_back( _shards_task_pool[i % task_pool_size_]->enqueue([&, i, this]() -> int { if (i < etypes.size()) { - std::string etype_path = epath + "/" + etypes[i]; + std::string etype_path = edge_to_edgedir[etypes[i]]; auto etype_path_list = paddle::framework::localfs_list(etype_path); std::string etype_path_str; if (part_num > 0 && part_num < (int)etype_path_list.size()) { @@ -1116,6 +1288,7 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype, this->load_edges(etype_path_str, true, r_etype); } } else { + std::string npath = node_to_nodedir[ntypes[0]]; auto npath_list = paddle::framework::localfs_list(npath); std::string npath_str; if (part_num > 0 && part_num < (int)npath_list.size()) { @@ -1414,7 +1587,6 @@ int32_t GraphTable::load_edges(const std::string &path, const std::string &edge_type) { #ifdef PADDLE_WITH_HETERPS if (search_level == 2) total_memory_cost = 0; - const uint64_t fixed_load_edges = 1000000; #endif int idx = 0; if (edge_type == "") { @@ -1436,7 +1608,7 @@ int32_t GraphTable::load_edges(const std::string &path, VLOG(0) << "Begin GraphTable::load_edges() edge_type[" << edge_type << "]"; if (FLAGS_graph_load_in_parallel) { std::vector>> tasks; - for (int i = 0; i < paths.size(); i++) { + for (size_t i = 0; i < paths.size(); i++) { tasks.push_back(load_node_edge_task_pool->enqueue( [&, i, idx, this]() -> std::pair { return parse_edge_file(paths[i], idx, reverse_edge); @@ -1895,8 +2067,8 @@ int GraphTable::get_all_id(int type_id, MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? 
edge_shards : feature_shards; std::vector> tasks; - for (int idx = 0; idx < search_shards.size(); idx++) { - for (int j = 0; j < search_shards[idx].size(); j++) { + for (size_t idx = 0; idx < search_shards.size(); idx++) { + for (size_t j = 0; j < search_shards[idx].size(); j++) { tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( [&search_shards, idx, j, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -1919,8 +2091,8 @@ int GraphTable::get_all_neighbor_id( MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? edge_shards : feature_shards; std::vector> tasks; - for (int idx = 0; idx < search_shards.size(); idx++) { - for (int j = 0; j < search_shards[idx].size(); j++) { + for (size_t idx = 0; idx < search_shards.size(); idx++) { + for (size_t j = 0; j < search_shards[idx].size(); j++) { tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( [&search_shards, idx, j, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -1972,7 +2144,7 @@ int GraphTable::get_all_neighbor_id( auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; std::vector> tasks; VLOG(3) << "begin task, task_pool_size_[" << task_pool_size_ << "]"; - for (int i = 0; i < search_shards.size(); i++) { + for (size_t i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [&search_shards, i, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -1998,7 +2170,7 @@ int GraphTable::get_all_feature_ids( MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; std::vector> tasks; - for (int i = 0; i < search_shards.size(); i++) { + for (size_t i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [&search_shards, i, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -2233,5 +2405,43 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { return 0; } +void GraphTable::build_graph_total_keys() { + VLOG(0) << "begin insert edge to graph_total_keys"; + // build node embedding id + std::vector> keys; + this->get_node_embedding_ids(1, &keys); + graph_total_keys_.insert( + graph_total_keys_.end(), keys[0].begin(), keys[0].end()); + + VLOG(0) << "finish insert edge to graph_total_keys"; +} + +void GraphTable::build_graph_type_keys() { + VLOG(0) << "begin build_graph_type_keys"; + graph_type_keys_.clear(); + graph_type_keys_.resize(this->feature_to_id.size()); + + int cnt = 0; + for (auto &it : this->feature_to_id) { + auto node_idx = it.second; + std::vector> keys; + this->get_all_id(1, node_idx, 1, &keys); + type_to_index_[node_idx] = cnt; + graph_type_keys_[cnt++] = std::move(keys[0]); + } + VLOG(0) << "finish build_graph_type_keys"; + + VLOG(0) << "begin insert feature into graph_total_keys"; + // build feature embedding id + for (auto &it : this->feature_to_id) { + auto node_idx = it.second; + std::vector> keys; + this->get_all_feature_ids(1, node_idx, 1, &keys); + graph_total_keys_.insert( + graph_total_keys_.end(), keys[0].begin(), keys[0].end()); + } + VLOG(0) << "finish insert feature into graph_total_keys"; +} + } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 0855babec83c1..9b416b30d9788 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ 
b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -271,7 +271,6 @@ class RandomSampleLRU { remove(node_head); remove_count--; } - // std::cerr<<"after remove_count = "< *node) { @@ -536,20 +535,27 @@ class GraphTable : public Table { const FsClientParameter &fs_config); virtual int32_t Initialize(const GraphParameter &config); int32_t Load(const std::string &path, const std::string ¶m); - - int32_t load_node_and_edge_file(std::string etype, - std::string ntype, - std::string epath, - std::string npath, + int32_t load_node_and_edge_file(std::string etype2files, + std::string ntype2files, + std::string graph_data_local_path, int part_num, bool reverse); - + int32_t parse_edge_and_load(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse); + int32_t parse_node_and_load(std::string ntype2files, + std::string graph_data_local_path, + int part_num); std::string get_inverse_etype(std::string &etype); - + int32_t parse_type_to_typepath( + std::string &type2files, + std::string graph_data_local_path, + std::vector &res_type, + std::unordered_map &res_type2path); int32_t load_edges(const std::string &path, bool reverse, const std::string &edge_type); - int get_all_id(int type, int slice_num, std::vector> *output); @@ -631,7 +637,13 @@ class GraphTable : public Table { const std::vector> &res); size_t get_server_num() { return server_num; } + void clear_graph(); void clear_graph(int idx); + void clear_edge_shard(); + void clear_feature_shard(); + void release_graph(); + void release_graph_edge(); + void release_graph_node(); virtual int32_t make_neighbor_sample_cache(size_t size_limit, size_t ttl) { { std::unique_lock lock(mutex_); @@ -694,6 +706,14 @@ class GraphTable : public Table { virtual int32_t add_comm_edge(int idx, uint64_t src_id, uint64_t dst_id); virtual int32_t build_sampler(int idx, std::string sample_type = "random"); void set_feature_separator(const std::string &ch); + + void build_graph_total_keys(); + void build_graph_type_keys(); + + std::vector graph_total_keys_; + std::vector> graph_type_keys_; + std::unordered_map type_to_index_; + std::vector> edge_shards, feature_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; int task_pool_size_ = 24; diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index 4feee70fed751..3e4f4d68f49ca 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -292,7 +292,8 @@ std::string CtrDymfAccessor::ParseToString(const float* v, int param) { thread_local std::ostringstream os; os.clear(); os.str(""); - os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4]; + os << common_feature_value.UnseenDays(const_cast(v)) << " " << v[1] + << " " << v[2] << " " << v[3] << " " << v[4]; // << v[5] << " " << v[6]; for (int i = common_feature_value.EmbedG2SumIndex(); i < common_feature_value.EmbedxG2SumIndex(); @@ -320,5 +321,18 @@ int CtrDymfAccessor::ParseFromString(const std::string& str, float* value) { return ret; } +bool CtrDymfAccessor::SaveMemCache(float* value, + int param, + double global_cache_threshold, + uint16_t pass_id) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + return common_feature_value.Show(value) > global_cache_threshold || + common_feature_value.PassId(value) >= pass_id; +} + +void CtrDymfAccessor::UpdatePassId(float* value, uint16_t pass_id) { + 
common_feature_value.PassId(value) = pass_id;
+}
+
 }  // namespace distributed
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h
index b820d617d06ae..047bafedd9d7b 100644
--- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h
+++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h
@@ -30,7 +30,7 @@ namespace distributed {
 class CtrDymfAccessor : public ValueAccessor {
  public:
   struct CtrDymfFeatureValue {
-    /*
+    /*v1: old version
     float unseen_days;
     float delta_score;
     float show;
@@ -44,6 +44,20 @@ class CtrDymfAccessor : public ValueAccessor {
     // float embedx_g2sum;
     std::vector<float> embedx_w;
     */
+    /* V2: support pass_id
+    uint16_t pass_id;
+    uint16_t unseen_days;
+    float show;
+    float click;
+    float embed_w;
+    // float embed_g2sum;
+    std::vector<float> embed_g2sum;
+    float slot;
+    float mf_dim;
+    std::vector<float> embedx_g2sum;
+    // float embedx_g2sum;
+    std::vector<float> embedx_w;
+    */
     int Dim() { return 7 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; }
     int DimSize(size_t dim, int embedx_dim) { return sizeof(float); }
@@ -73,7 +87,17 @@ class CtrDymfAccessor : public ValueAccessor {
     // total byte size computed from mf_dim
     int Size(int& mf_dim) { return (Dim(mf_dim)) * sizeof(float); }
-    float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; }
+    uint16_t& PassId(float* val) {
+      uint16_t* int16_val =
+          reinterpret_cast<uint16_t*>(val + UnseenDaysIndex());
+      return int16_val[0];
+    }
+
+    uint16_t& UnseenDays(float* val) {
+      uint16_t* int16_val =
+          reinterpret_cast<uint16_t*>(val + UnseenDaysIndex());
+      return int16_val[1];
+    }
     float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; }
     float& Show(float* val) { return val[ShowIndex()]; }
     float& Click(float* val) { return val[ClickIndex()]; }
@@ -217,6 +241,14 @@ class CtrDymfAccessor : public ValueAccessor {
     return 0.0;
   }
 
+  // decide whether to cache a value to SSD based on pass_id and the show threshold
+  bool SaveMemCache(float* value,
+                    int param,
+                    double global_cache_threshold,
+                    uint16_t pass_id);
+  // update pass_id
+  void UpdatePassId(float* value, uint16_t pass_id);
+
  private:
   // float ShowClickScore(float show, float click);
diff --git a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
index eb3ff2e254f56..5b5a0057f3707 100644
--- a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
+++ b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
@@ -16,8 +16,12 @@
 #include
 #include
 #include
+#include
 #include
+#include
+#include
 #include
+#include
 #include
 #include
@@ -27,6 +31,55 @@
 namespace paddle {
 namespace distributed {
 
+class Uint64Comparator : public rocksdb::Comparator {
+  int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const {
+    uint64_t A = *((uint64_t*)const_cast<char*>(a.data()));
+    uint64_t B = *((uint64_t*)const_cast<char*>(b.data()));
+    if (A < B) {
+      return -1;
+    }
+    if (A > B) {
+      return 1;
+    }
+    return 0;
+  }
+  const char* Name() const { return "Uint64Comparator"; }
+  void FindShortestSeparator(std::string*, const rocksdb::Slice&) const {}
+  void FindShortSuccessor(std::string*) const {}
+};
+
+class RocksDBItem {
+ public:
+  RocksDBItem() {}
+  ~RocksDBItem() {}
+  void reset() {
+    batch_keys.clear();
+    batch_index.clear();
+    batch_values.clear();
+    status.clear();
+  }
+  std::vector<rocksdb::Slice> batch_keys;
+  std::vector<int> batch_index;
+  std::vector<rocksdb::PinnableSlice> batch_values;
+  std::vector<rocksdb::Status> status;
+};
+
+class RocksDBCtx {
+ public:
+  RocksDBCtx() {
+    items[0].reset();
+    items[1].reset();
+    cur_index = 0;
+  }
+  ~RocksDBCtx() {}
+  RocksDBItem* switch_item()
{ + cur_index = (cur_index + 1) % 2; + return &items[cur_index]; + } + RocksDBItem items[2]; + int cur_index; +}; + class RocksDBHandler { public: RocksDBHandler() {} @@ -38,55 +91,69 @@ class RocksDBHandler { } int initialize(const std::string& db_path, const int colnum) { - VLOG(3) << "db path: " << db_path << " colnum: " << colnum; - rocksdb::Options options; - rocksdb::BlockBasedTableOptions bbto; - bbto.block_size = 4 * 1024; - bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024); - bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024); - bbto.cache_index_and_filter_blocks = false; - bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(20, false)); - bbto.whole_key_filtering = true; - options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); - - options.keep_log_file_num = 100; - options.max_log_file_size = 50 * 1024 * 1024; // 50MB - options.create_if_missing = true; - options.use_direct_reads = true; - options.max_background_flushes = 5; - options.max_background_compactions = 5; - options.base_background_compactions = 10; - options.write_buffer_size = 256 * 1024 * 1024; // 256MB - options.max_write_buffer_number = 8; - options.max_bytes_for_level_base = - options.max_write_buffer_number * options.write_buffer_size; - options.min_write_buffer_number_to_merge = 1; - options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB - options.memtable_prefix_bloom_size_ratio = 0.02; - options.num_levels = 4; - options.max_open_files = -1; - - options.compression = rocksdb::kNoCompression; - options.level0_file_num_compaction_trigger = 8; - options.level0_slowdown_writes_trigger = - 1.8 * options.level0_file_num_compaction_trigger; - options.level0_stop_writes_trigger = - 3.6 * options.level0_file_num_compaction_trigger; - - if (!db_path.empty()) { - std::string rm_cmd = "rm -rf " + db_path; - system(rm_cmd.c_str()); - } - - rocksdb::Status s = rocksdb::DB::Open(options, db_path, &_db); - assert(s.ok()); - _handles.resize(colnum); + VLOG(0) << "db path: " << db_path << " colnum: " << colnum; + _dbs.resize(colnum); for (int i = 0; i < colnum; i++) { - s = _db->CreateColumnFamily( - options, "shard_" + std::to_string(i), &_handles[i]); + rocksdb::Options options; + options.comparator = &_comparator; + rocksdb::BlockBasedTableOptions bbto; + // options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(65536)); + // options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(2)); + bbto.format_version = 5; + bbto.use_delta_encoding = false; + bbto.block_size = 4 * 1024; + bbto.block_restart_interval = 6; + bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024); + // bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024); + bbto.cache_index_and_filter_blocks = false; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(15, false)); + bbto.whole_key_filtering = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); + + // options.IncreaseParallelism(); + options.OptimizeLevelStyleCompaction(); + options.keep_log_file_num = 100; + // options.db_log_dir = "./log/rocksdb"; + options.max_log_file_size = 50 * 1024 * 1024; // 50MB + // options.threads = 8; + options.create_if_missing = true; + options.use_direct_reads = true; + options.max_background_flushes = 37; + options.max_background_compactions = 64; + options.base_background_compactions = 10; + options.write_buffer_size = 256 * 1024 * 1024; // 256MB + options.max_write_buffer_number = 8; + 
options.max_bytes_for_level_base = + options.max_write_buffer_number * options.write_buffer_size; + options.min_write_buffer_number_to_merge = 1; + options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB + // options.verify_checksums_in_compaction = false; + // options.disable_auto_compactions = true; + options.memtable_prefix_bloom_size_ratio = 0.02; + options.num_levels = 4; + options.max_open_files = -1; + + options.compression = rocksdb::kNoCompression; + // options.compaction_options_fifo = rocksdb::CompactionOptionsFIFO(); + // options.compaction_style = + // rocksdb::CompactionStyle::kCompactionStyleFIFO; + options.level0_file_num_compaction_trigger = 5; + options.level0_slowdown_writes_trigger = + 1.8 * options.level0_file_num_compaction_trigger; + options.level0_stop_writes_trigger = + 3.6 * options.level0_file_num_compaction_trigger; + + std::string shard_path = db_path + "_" + std::to_string(i); + if (!shard_path.empty()) { + std::string rm_cmd = "rm -rf " + shard_path; + system(rm_cmd.c_str()); + } + + rocksdb::Status s = rocksdb::DB::Open(options, shard_path, &_dbs[i]); assert(s.ok()); } - LOG(INFO) << "DB initialize success, colnum:" << colnum; + VLOG(0) << "DB initialize success, colnum:" << colnum; return 0; } @@ -94,10 +161,9 @@ class RocksDBHandler { int id, const char* key, int key_len, const char* value, int value_len) { rocksdb::WriteOptions options; options.disableWAL = true; - rocksdb::Status s = _db->Put(options, - _handles[id], - rocksdb::Slice(key, key_len), - rocksdb::Slice(value, value_len)); + rocksdb::Status s = _dbs[id]->Put(options, + rocksdb::Slice(key, key_len), + rocksdb::Slice(value, value_len)); assert(s.ok()); return 0; } @@ -110,20 +176,17 @@ class RocksDBHandler { options.disableWAL = true; rocksdb::WriteBatch batch(n * 128); for (int i = 0; i < n; i++) { - batch.Put(_handles[id], - rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second), + batch.Put(rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second), rocksdb::Slice(ssd_values[i].first, ssd_values[i].second)); } - rocksdb::Status s = _db->Write(options, &batch); + rocksdb::Status s = _dbs[id]->Write(options, &batch); assert(s.ok()); return 0; } int get(int id, const char* key, int key_len, std::string& value) { - rocksdb::Status s = _db->Get(rocksdb::ReadOptions(), - _handles[id], - rocksdb::Slice(key, key_len), - &value); + rocksdb::Status s = _dbs[id]->Get( + rocksdb::ReadOptions(), rocksdb::Slice(key, key_len), &value); if (s.IsNotFound()) { return 1; } @@ -131,33 +194,58 @@ class RocksDBHandler { return 0; } + void multi_get(int id, + const size_t num_keys, + const rocksdb::Slice* keys, + rocksdb::PinnableSlice* values, + rocksdb::Status* status, + const bool sorted_input = true) { + rocksdb::ColumnFamilyHandle* handle = _dbs[id]->DefaultColumnFamily(); + auto read_opt = rocksdb::ReadOptions(); + read_opt.fill_cache = false; + _dbs[id]->MultiGet( + read_opt, handle, num_keys, keys, values, status, sorted_input); + } + int del_data(int id, const char* key, int key_len) { rocksdb::WriteOptions options; options.disableWAL = true; - rocksdb::Status s = - _db->Delete(options, _handles[id], rocksdb::Slice(key, key_len)); + rocksdb::Status s = _dbs[id]->Delete(options, rocksdb::Slice(key, key_len)); assert(s.ok()); return 0; } int flush(int id) { - rocksdb::Status s = _db->Flush(rocksdb::FlushOptions(), _handles[id]); + rocksdb::Status s = _dbs[id]->Flush(rocksdb::FlushOptions()); assert(s.ok()); return 0; } rocksdb::Iterator* get_iterator(int id) { - return 
_db->NewIterator(rocksdb::ReadOptions(), _handles[id]); + return _dbs[id]->NewIterator(rocksdb::ReadOptions()); } int get_estimate_key_num(uint64_t& num_keys) { - _db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys); + // _db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys); + return 0; + } + + Uint64Comparator* get_comparator() { return &_comparator; } + + int ingest_externel_file(int id, + const std::vector& sst_filelist) { + rocksdb::IngestExternalFileOptions ifo; + ifo.move_files = true; + rocksdb::Status s = _dbs[id]->IngestExternalFile(sst_filelist, ifo); + assert(s.ok()); return 0; } private: std::vector _handles; - rocksdb::DB* _db; + // rocksdb::DB* _db; + std::vector _dbs; + Uint64Comparator _comparator; }; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index a46244265ef20..9a69433c6104a 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -172,7 +172,6 @@ int32_t MemorySparseTable::Load(const std::string& path, value.resize(feature_value_size); int parse_size = _value_accesor->ParseFromString(++end, value.data()); value.resize(parse_size); - } read_channel->close(); if (err_no == -1) { @@ -725,7 +724,8 @@ int32_t MemorySparseTable::Pull(TableContext& context) { if (context.use_ptr) { char** pull_values = context.pull_context.ptr_values; const uint64_t* keys = context.pull_context.keys; - return PullSparsePtr(pull_values, keys, context.num); + return PullSparsePtr( + context.shard_id, pull_values, keys, context.num, context.pass_id); } else { float* pull_values = context.pull_context.values; const PullSparseValue& pull_value = context.pull_context.pull_value; @@ -822,9 +822,11 @@ int32_t MemorySparseTable::PullSparse(float* pull_values, return 0; } -int32_t MemorySparseTable::PullSparsePtr(char** pull_values, +int32_t MemorySparseTable::PullSparsePtr(int shard_id, // fake num + char** pull_values, const uint64_t* keys, - size_t num) { + size_t num, + uint16_t pass_id) { CostTimer timer("pscore_sparse_select_all"); size_t value_size = _value_accesor->GetAccessorInfo().size / sizeof(float); size_t mf_value_size = diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index 17018d5e5dfc3..658446d770c71 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -90,7 +90,11 @@ class MemorySparseTable : public Table { std::pair PrintTableStat() override; int32_t PullSparse(float* values, const PullSparseValue& pull_value); - int32_t PullSparsePtr(char** pull_values, const uint64_t* keys, size_t num); + int32_t PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* keys, + size_t num, + uint16_t pass_id); int32_t PushSparse(const uint64_t* keys, const float* values, size_t num); diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 3e0f631ed41bc..b2bd0b36795ad 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -24,8 +24,10 @@ DECLARE_bool(pserver_print_missed_key_num_every_push); DECLARE_bool(pserver_create_value_when_push); DECLARE_bool(pserver_enable_create_feasign_randomly); DEFINE_bool(pserver_open_strict_check, false, 
"pserver_open_strict_check"); -DEFINE_string(rocksdb_path, "database", "path of sparse table rocksdb file"); DEFINE_int32(pserver_load_batch_size, 5000, "load batch size for ssd"); +PADDLE_DEFINE_EXPORTED_string(rocksdb_path, + "database", + "path of sparse table rocksdb file"); namespace paddle { namespace distributed { @@ -34,6 +36,9 @@ int32_t SSDSparseTable::Initialize() { MemorySparseTable::Initialize(); _db = paddle::distributed::RocksDBHandler::GetInstance(); _db->initialize(FLAGS_rocksdb_path, _real_local_shard_num); + VLOG(0) << "initalize SSDSparseTable succ"; + VLOG(0) << "SSD FLAGS_pserver_print_missed_key_num_every_push:" + << FLAGS_pserver_print_missed_key_num_every_push; return 0; } @@ -44,7 +49,8 @@ int32_t SSDSparseTable::Pull(TableContext& context) { if (context.use_ptr) { char** pull_values = context.pull_context.ptr_values; const uint64_t* keys = context.pull_context.keys; - return PullSparsePtr(pull_values, keys, context.num); + return PullSparsePtr( + context.shard_id, pull_values, keys, context.num, context.pass_id); } else { float* pull_values = context.pull_context.values; const PullSparseValue& pull_value = context.pull_context.pull_value; @@ -171,90 +177,139 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, return 0; } -int32_t SSDSparseTable::PullSparsePtr(char** pull_values, - const uint64_t* keys, - size_t num) { +int32_t SSDSparseTable::PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* pull_keys, + size_t num, + uint16_t pass_id) { CostTimer timer("pserver_ssd_sparse_select_all"); size_t value_size = _value_accesor->GetAccessorInfo().size / sizeof(float); size_t mf_value_size = _value_accesor->GetAccessorInfo().mf_size / sizeof(float); { // 从table取值 or create - std::vector> tasks(_real_local_shard_num); - std::vector>> task_keys( - _real_local_shard_num); - for (size_t i = 0; i < num; ++i) { - int shard_id = (keys[i] % _sparse_table_shard_num) % _avg_local_shard_num; - task_keys[shard_id].push_back({keys[i], i}); - } + RocksDBCtx context; + std::vector> tasks; + RocksDBItem* cur_ctx = context.switch_item(); + cur_ctx->reset(); + FixedFeatureValue* ret = NULL; + auto& local_shard = _local_shards[shard_id]; + float data_buffer[value_size]; + float* data_buffer_ptr = data_buffer; - std::atomic missed_keys{0}; - for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { - tasks[shard_id] = + for (int i = 0; i < num; ++i) { + uint64_t key = pull_keys[i]; + auto itr = local_shard.find(key); + if (itr == local_shard.end()) { + cur_ctx->batch_index.push_back(i); + cur_ctx->batch_keys.push_back( + rocksdb::Slice((char*)&(pull_keys[i]), sizeof(uint64_t))); + if (cur_ctx->batch_keys.size() == 1024) { + cur_ctx->batch_values.resize(cur_ctx->batch_keys.size()); + cur_ctx->status.resize(cur_ctx->batch_keys.size()); + auto fut = + _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [this, shard_id, cur_ctx]() -> int { + _db->multi_get(shard_id, + cur_ctx->batch_keys.size(), + cur_ctx->batch_keys.data(), + cur_ctx->batch_values.data(), + cur_ctx->status.data()); + return 0; + }); + cur_ctx = context.switch_item(); + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); + for (size_t idx = 0; idx < cur_ctx->status.size(); idx++) { + uint64_t cur_key = *((uint64_t*)const_cast( + cur_ctx->batch_keys[idx].data())); + if (cur_ctx->status[idx].IsNotFound()) { + auto& feature_value = local_shard[cur_key]; + int init_size = value_size - mf_value_size; + feature_value.resize(init_size); + 
_value_accesor->Create(&data_buffer_ptr, 1); + memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + init_size * sizeof(float)); + ret = &feature_value; + } else { + int data_size = + cur_ctx->batch_values[idx].size() / sizeof(float); + // from rocksdb to mem + auto& feature_value = local_shard[cur_key]; + feature_value.resize(data_size); + memcpy(const_cast(feature_value.data()), + paddle::string::str_to_float( + cur_ctx->batch_values[idx].data()), + data_size * sizeof(float)); + _db->del_data(shard_id, (char*)&cur_key, sizeof(uint64_t)); + ret = &feature_value; + } + _value_accesor->UpdatePassId(ret->data(), pass_id); + int pull_data_idx = cur_ctx->batch_index[idx]; + pull_values[pull_data_idx] = (char*)ret; + } + } + cur_ctx->reset(); + tasks.clear(); + tasks.push_back(std::move(fut)); + } + } else { + ret = itr.value_ptr(); + // int pull_data_idx = keys[i].second; + _value_accesor->UpdatePassId(ret->data(), pass_id); + pull_values[i] = (char*)ret; + } + } + if (cur_ctx->batch_keys.size() != 0) { + cur_ctx->batch_values.resize(cur_ctx->batch_keys.size()); + cur_ctx->status.resize(cur_ctx->batch_keys.size()); + auto fut = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( - [this, - shard_id, - &task_keys, - value_size, - mf_value_size, - pull_values, - &missed_keys]() -> int { - auto& keys = task_keys[shard_id]; - auto& local_shard = _local_shards[shard_id]; - float data_buffer[value_size]; // NOLINT - float* data_buffer_ptr = data_buffer; - for (size_t i = 0; i < keys.size(); ++i) { - uint64_t key = keys[i].first; - auto itr = local_shard.find(key); - size_t data_size = value_size - mf_value_size; - FixedFeatureValue* ret = NULL; - if (itr == local_shard.end()) { - // pull rocksdb - std::string tmp_string(""); - if (_db->get(shard_id, - reinterpret_cast(&key), - sizeof(uint64_t), - tmp_string) > 0) { - ++missed_keys; - auto& feature_value = local_shard[key]; - feature_value.resize(data_size); - float* data_ptr = - const_cast(feature_value.data()); - _value_accesor->Create(&data_buffer_ptr, 1); - memcpy( - data_ptr, data_buffer_ptr, data_size * sizeof(float)); - ret = &feature_value; - } else { - data_size = tmp_string.size() / sizeof(float); - memcpy(data_buffer_ptr, - paddle::string::str_to_float(tmp_string), - data_size * sizeof(float)); - // from rocksdb to mem - auto& feature_value = local_shard[key]; - feature_value.resize(data_size); - memcpy(const_cast(feature_value.data()), - data_buffer_ptr, - data_size * sizeof(float)); - _db->del_data(shard_id, - reinterpret_cast(&key), - sizeof(uint64_t)); - ret = &feature_value; - } - } else { - ret = itr.value_ptr(); - } - int pull_data_idx = keys[i].second; - pull_values[pull_data_idx] = reinterpret_cast(ret); - } + [this, shard_id, cur_ctx]() -> int { + _db->multi_get(shard_id, + cur_ctx->batch_keys.size(), + cur_ctx->batch_keys.data(), + cur_ctx->batch_values.data(), + cur_ctx->status.data()); return 0; }); + tasks.push_back(std::move(fut)); } - for (int i = 0; i < _real_local_shard_num; ++i) { - tasks[i].wait(); + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); } - if (FLAGS_pserver_print_missed_key_num_every_push) { - LOG(WARNING) << "total pull keys:" << num - << " missed_keys:" << missed_keys.load(); + for (size_t x = 0; x < 2; x++) { + cur_ctx = context.switch_item(); + for (size_t idx = 0; idx < cur_ctx->status.size(); idx++) { + uint64_t cur_key = + *((uint64_t*)const_cast(cur_ctx->batch_keys[idx].data())); + if (cur_ctx->status[idx].IsNotFound()) { + auto& feature_value = 
local_shard[cur_key]; + int init_size = value_size - mf_value_size; + feature_value.resize(init_size); + _value_accesor->Create(&data_buffer_ptr, 1); + memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + init_size * sizeof(float)); + ret = &feature_value; + } else { + int data_size = cur_ctx->batch_values[idx].size() / sizeof(float); + // from rocksdb to mem + auto& feature_value = local_shard[cur_key]; + feature_value.resize(data_size); + memcpy( + const_cast(feature_value.data()), + paddle::string::str_to_float(cur_ctx->batch_values[idx].data()), + data_size * sizeof(float)); + _db->del_data(shard_id, (char*)&cur_key, sizeof(uint64_t)); + ret = &feature_value; + } + _value_accesor->UpdatePassId(ret->data(), pass_id); + int pull_data_idx = cur_ctx->batch_index[idx]; + pull_values[pull_data_idx] = (char*)ret; + } + cur_ctx->reset(); } } return 0; @@ -536,157 +591,165 @@ int32_t SSDSparseTable::Save(const std::string& path, // } // LOG(INFO) << "table cache rate is: " << _config.sparse_table_cache_rate(); - LOG(INFO) << "table cache rate is: " << _config.sparse_table_cache_rate(); - LOG(INFO) << "enable_sparse_table_cache: " - << _config.enable_sparse_table_cache(); - LOG(INFO) << "LocalSize: " << LocalSize(); + VLOG(0) << "table cache rate is: " << _config.sparse_table_cache_rate(); + VLOG(0) << "enable_sparse_table_cache: " + << _config.enable_sparse_table_cache(); + VLOG(0) << "LocalSize: " << LocalSize(); if (_config.enable_sparse_table_cache()) { - LOG(INFO) << "Enable sparse table cache, top n:" << _cache_tk_size; + VLOG(0) << "Enable sparse table cache, top n:" << _cache_tk_size; } _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); TopkCalculator tk(_real_local_shard_num, _cache_tk_size); + VLOG(0) << "TopkCalculator top n:" << _cache_tk_size; size_t file_start_idx = _avg_local_shard_num * _shard_idx; std::string table_path = TableDir(path); _afs_client.remove(paddle::string::format_string( "%s/part-%03d-*", table_path.c_str(), _shard_idx)); +#ifdef PADDLE_WITH_GPU_GRAPH + int thread_num = _real_local_shard_num; +#else int thread_num = _real_local_shard_num < 20 ? 
_real_local_shard_num : 20; +#endif // std::atomic feasign_size; std::atomic feasign_size_all{0}; // feasign_size = 0; - omp_set_num_threads(thread_num); -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < _real_local_shard_num; ++i) { + std::vector< + paddle::framework::Channel>>> + fs_channel; + for (int i = 0; i < _real_local_shard_num; i++) { + fs_channel.push_back( + paddle::framework::MakeChannel>>( + 10240)); + } + std::vector threads; + threads.resize(_real_local_shard_num); + + auto save_func = [this, + &save_param, + &table_path, + &file_start_idx, + &fs_channel](int file_num) { + int err_no = 0; FsChannelConfig channel_config; if (_config.compress_in_save() && (save_param == 0 || save_param == 3)) { channel_config.path = paddle::string::format_string("%s/part-%03d-%05d.gz", table_path.c_str(), _shard_idx, - file_start_idx + i); + file_start_idx + file_num); } else { - channel_config.path = paddle::string::format_string("%s/part-%03d-%05d", - table_path.c_str(), - _shard_idx, - file_start_idx + i); + channel_config.path = + paddle::string::format_string("%s/part-%03d-%05d", + table_path.c_str(), + _shard_idx, + file_start_idx + file_num); } channel_config.converter = _value_accesor->Converter(save_param).converter; channel_config.deconverter = _value_accesor->Converter(save_param).deconverter; - int err_no = 0; - int retry_num = 0; - bool is_write_failed = false; + auto write_channel = + _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no); + paddle::framework::ChannelReader>> + reader(fs_channel[file_num].get()); + std::pair> out_str; + while (reader >> out_str) { + std::string format_value = _value_accesor->ParseToString( + out_str.second.data(), out_str.second.size()); + if (0 != write_channel->write_line(paddle::string::format_string( + "%lu %s", out_str.first, format_value.c_str()))) { + LOG(FATAL) << "SSDSparseTable save failed, retry it! path:" + << channel_config.path; + } + } + write_channel->close(); + }; + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(save_func, i); + } + + std::vector< + paddle::framework::ChannelWriter>>> + writers(_real_local_shard_num); + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < _real_local_shard_num; ++i) { int feasign_size = 0; auto& shard = _local_shards[i]; - do { - err_no = 0; - feasign_size = 0; - is_write_failed = false; - auto write_channel = - _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no); + auto& writer = writers[i]; + writer.Reset(fs_channel[i].get()); + { for (auto it = shard.begin(); it != shard.end(); ++it) { if (_config.enable_sparse_table_cache() && - (save_param == 1 || save_param == 2) && - _value_accesor->Save(it.value().data(), 4)) { - // tk.push(i, it.value().data()[2]); + (save_param == 1 || save_param == 2)) { + // get_field get right decayed show tk.push(i, _value_accesor->GetField(it.value().data(), "show")); } if (_value_accesor->Save(it.value().data(), save_param)) { - std::string format_value = _value_accesor->ParseToString( - it.value().data(), it.value().size()); - if (0 != write_channel->write_line(paddle::string::format_string( - "%lu %s", it.key(), format_value.c_str()))) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed, retry it! 
path:" - << channel_config.path << ", retry_num=" << retry_num; - break; - } + std::vector feature_value; + feature_value.resize(it.value().size()); + memcpy(const_cast(feature_value.data()), + it.value().data(), + it.value().size() * sizeof(float)); + writer << std::make_pair(it.key(), std::move(feature_value)); ++feasign_size; } } + } - if (err_no == -1 && !is_write_failed) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed after write, retry it! " - << "path:" << channel_config.path - << " , retry_num=" << retry_num; - } - if (is_write_failed) { - _afs_client.remove(channel_config.path); - continue; - } - - // delta and cache and revert is all in mem, base in rocksdb - if (save_param != 1) { - auto* it = _db->get_iterator(i); - for (it->SeekToFirst(); it->Valid(); it->Next()) { - bool need_save = _value_accesor->Save( - paddle::string::str_to_float(it->value().data()), save_param); - _value_accesor->UpdateStatAfterSave( - paddle::string::str_to_float(it->value().data()), save_param); - if (need_save) { - std::string format_value = _value_accesor->ParseToString( - paddle::string::str_to_float(it->value().data()), - it->value().size() / sizeof(float)); - if (0 != write_channel->write_line(paddle::string::format_string( - "%lu %s", - *((uint64_t*)const_cast(it->key().data())), - format_value.c_str()))) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed, retry it! path:" - << channel_config.path << ", retry_num=" << retry_num; - break; - } - if (save_param == 3) { - _db->put(i, - it->key().data(), - it->key().size(), - it->value().data(), - it->value().size()); - } - ++feasign_size; - } + if (save_param != 1) { + auto* it = _db->get_iterator(i); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + bool need_save = _value_accesor->Save( + paddle::string::str_to_float(it->value().data()), save_param); + _value_accesor->UpdateStatAfterSave( + paddle::string::str_to_float(it->value().data()), save_param); + if (need_save) { + std::vector feature_value; + feature_value.resize(it->value().size() / sizeof(float)); + memcpy(const_cast(feature_value.data()), + paddle::string::str_to_float(it->value().data()), + it->value().size()); + writer << std::make_pair( + *((uint64_t*)const_cast(it->key().data())), + std::move(feature_value)); + ++feasign_size; } - delete it; } + delete it; + } - write_channel->close(); - if (err_no == -1) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed after write, retry it! 
" - << "path:" << channel_config.path - << " , retry_num=" << retry_num; - } - if (is_write_failed) { - _afs_client.remove(channel_config.path); - } - } while (is_write_failed); + writer.Flush(); + fs_channel[i]->Close(); feasign_size_all += feasign_size; for (auto it = shard.begin(); it != shard.end(); ++it) { _value_accesor->UpdateStatAfterSave(it.value().data(), save_param); } } + for (int i = 0; i < threads.size(); i++) { + threads[i].join(); + } + for (int i = 0; i < fs_channel.size(); i++) { + fs_channel[i].reset(); + } + fs_channel.clear(); + if (save_param == 3) { - UpdateTable(); + // UpdateTable(); _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); - LOG(INFO) << "SSDSparseTable update success."; - } - LOG(INFO) << "SSDSparseTable save success, path:" - << paddle::string::format_string("%s/%03d/part-%03d-", - path.c_str(), - _config.table_id(), - _shard_idx) - << " from " << file_start_idx << " to " - << file_start_idx + _real_local_shard_num - 1; - // return feasign_size_all; + VLOG(0) << "SSDSparseTable update success."; + } + VLOG(0) << "SSDSparseTable save success, feasign size:" << feasign_size_all + << ", path:" + << paddle::string::format_string("%s/%03d/part-%03d-", + path.c_str(), + _config.table_id(), + _shard_idx) + << " from " << file_start_idx << " to " + << file_start_idx + _real_local_shard_num - 1; _local_show_threshold = tk.top(); - LOG(INFO) << "local cache threshold: " << _local_show_threshold; - // int32 may overflow need to change return value + VLOG(0) << "local cache threshold: " << _local_show_threshold; return 0; } @@ -857,7 +920,167 @@ int32_t SSDSparseTable::SaveCache( int32_t SSDSparseTable::Load(const std::string& path, const std::string& param) { - return MemorySparseTable::Load(path, param); + VLOG(0) << "LOAD FLAGS_rocksdb_path:" << FLAGS_rocksdb_path; + std::string table_path = TableDir(path); + auto file_list = _afs_client.list(table_path); + + // std::sort(file_list.begin(), file_list.end()); + for (auto file : file_list) { + VLOG(1) << "SSDSparseTable::Load() file list: " << file; + } + + int load_param = atoi(param.c_str()); + size_t expect_shard_num = _sparse_table_shard_num; + if (file_list.size() != expect_shard_num) { + LOG(WARNING) << "SSDSparseTable file_size:" << file_list.size() + << " not equal to expect_shard_num:" << expect_shard_num; + return -1; + } + if (file_list.size() == 0) { + LOG(WARNING) << "SSDSparseTable load file is empty, path:" << path; + return -1; + } + + size_t file_start_idx = _shard_idx * _avg_local_shard_num; + + if (file_start_idx >= file_list.size()) { + return 0; + } + + size_t feature_value_size = + _value_accesor->GetAccessorInfo().size / sizeof(float); + size_t mf_value_size = + _value_accesor->GetAccessorInfo().mf_size / sizeof(float); + +#ifdef PADDLE_WITH_HETERPS + int thread_num = _real_local_shard_num; +#else + int thread_num = _real_local_shard_num < 15 ? 
_real_local_shard_num : 15; +#endif + + for (int i = 0; i < _real_local_shard_num; i++) { + _fs_channel.push_back(paddle::framework::MakeChannel(30000)); + } + + std::vector threads; + threads.resize(thread_num); + auto load_func = [this, &file_start_idx, &file_list, &load_param]( + int file_num) { + int err_no = 0; + FsChannelConfig channel_config; + channel_config.path = file_list[file_num + file_start_idx]; + VLOG(1) << "SSDSparseTable::load begin load " << channel_config.path + << " into local shard " << file_num; + channel_config.converter = _value_accesor->Converter(load_param).converter; + channel_config.deconverter = + _value_accesor->Converter(load_param).deconverter; + + std::string line_data; + auto read_channel = _afs_client.open_r(channel_config, 0, &err_no); + paddle::framework::ChannelWriter writer( + _fs_channel[file_num].get()); + while (read_channel->read_line(line_data) == 0 && line_data.size() > 1) { + writer << line_data; + } + writer.Flush(); + read_channel->close(); + _fs_channel[file_num]->Close(); + }; + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(load_func, i); + } + + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < _real_local_shard_num; ++i) { + std::vector> ssd_keys; + std::vector> ssd_values; + std::vector tmp_key; + ssd_keys.reserve(FLAGS_pserver_load_batch_size); + ssd_values.reserve(FLAGS_pserver_load_batch_size); + tmp_key.reserve(FLAGS_pserver_load_batch_size); + ssd_keys.clear(); + ssd_values.clear(); + tmp_key.clear(); + std::string line_data; + char* end = NULL; + int local_shard_id = i % _avg_local_shard_num; + auto& shard = _local_shards[local_shard_id]; + float data_buffer[FLAGS_pserver_load_batch_size * feature_value_size]; + float* data_buffer_ptr = data_buffer; + uint64_t mem_count = 0; + uint64_t ssd_count = 0; + uint64_t mem_mf_count = 0; + uint64_t ssd_mf_count = 0; + + paddle::framework::ChannelReader reader(_fs_channel[i].get()); + + while (reader >> line_data) { + uint64_t key = std::strtoul(line_data.data(), &end, 10); + if (FLAGS_pserver_open_strict_check) { + if (key % _sparse_table_shard_num != (i + file_start_idx)) { + LOG(WARNING) << "SSDSparseTable key:" << key << " not match shard," + << " file_idx:" << i + << " shard num:" << _sparse_table_shard_num; + continue; + } + } + size_t value_size = + _value_accesor->ParseFromString(++end, data_buffer_ptr); + // ssd or mem + if (_value_accesor->SaveSSD(data_buffer_ptr)) { + tmp_key.emplace_back(key); + ssd_keys.emplace_back( + std::make_pair((char*)&tmp_key.back(), sizeof(uint64_t))); + ssd_values.emplace_back( + std::make_pair((char*)data_buffer_ptr, value_size * sizeof(float))); + data_buffer_ptr += feature_value_size; + if (static_cast(ssd_keys.size()) == + FLAGS_pserver_load_batch_size) { + _db->put_batch(local_shard_id, ssd_keys, ssd_values, ssd_keys.size()); + ssd_keys.clear(); + ssd_values.clear(); + tmp_key.clear(); + data_buffer_ptr = data_buffer; + } + ssd_count++; + if (value_size > feature_value_size - mf_value_size) { + ssd_mf_count++; + } + } else { + auto& value = shard[key]; + value.resize(value_size); + _value_accesor->ParseFromString(end, value.data()); + mem_count++; + if (value_size > feature_value_size - mf_value_size) { + mem_mf_count++; + } + } + } + // last batch + if (ssd_keys.size() > 0) { + _db->put_batch(local_shard_id, ssd_keys, ssd_values, ssd_keys.size()); + } + + _db->flush(local_shard_id); + VLOG(0) << "Table>> load done. 
ALL[" << mem_count + ssd_count << "] MEM[" + << mem_count << "] MEM_MF[" << mem_mf_count << "] SSD[" << ssd_count + << "] SSD_MF[" << ssd_mf_count << "]."; + } + for (int i = 0; i < threads.size(); i++) { + threads[i].join(); + } + for (int i = 0; i < _fs_channel.size(); i++) { + _fs_channel[i].reset(); + } + _fs_channel.clear(); + LOG(INFO) << "load num:" << LocalSize(); + LOG(INFO) << "SSDSparseTable load success, path from " + << file_list[file_start_idx] << " to " + << file_list[file_start_idx + _real_local_shard_num - 1]; + + _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); + return 0; } //加载path目录下数据[start_idx, end_idx) @@ -877,7 +1100,11 @@ int32_t SSDSparseTable::Load(size_t start_idx, end_idx = static_cast(end_idx) < _sparse_table_shard_num ? end_idx : _sparse_table_shard_num; +#ifdef PADDLE_WITH_HETERPS + int thread_num = end_idx - start_idx; +#else int thread_num = (end_idx - start_idx) < 20 ? (end_idx - start_idx) : 20; +#endif omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) for (size_t i = start_idx; i < end_idx; ++i) { @@ -994,5 +1221,164 @@ int32_t SSDSparseTable::Load(size_t start_idx, return 0; } +std::pair SSDSparseTable::PrintTableStat() { + int64_t feasign_size = LocalSize(); + return {feasign_size, -1}; +} + +int32_t SSDSparseTable::CacheTable(uint16_t pass_id) { + // acquire_table_mutex(); + VLOG(0) << "cache_table"; + std::atomic count{0}; + auto thread_num = _real_local_shard_num; + std::vector> tasks; + + double show_threshold = 10000000; + + //保证cache数据不被淘汰掉 + if (_config.enable_sparse_table_cache()) { + if (_local_show_threshold < show_threshold) { + show_threshold = _local_show_threshold; + } + } + + if (show_threshold < 500) { + show_threshold = 500; + } + VLOG(0) << " show_threshold:" << show_threshold + << " ; local_show_threshold:" << _local_show_threshold; + VLOG(0) << "Table>> origin mem feasign size:" << LocalSize(); + static int cache_table_count = 0; + ++cache_table_count; + for (size_t shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + // from mem to ssd + auto fut = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [shard_id, this, &count, show_threshold, pass_id]() -> int { + rocksdb::Options options; + options.comparator = _db->get_comparator(); + rocksdb::BlockBasedTableOptions bbto; + bbto.format_version = 5; + bbto.use_delta_encoding = false; + bbto.block_size = 4 * 1024; + bbto.block_restart_interval = 6; + bbto.cache_index_and_filter_blocks = false; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(15, false)); + bbto.whole_key_filtering = true; + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); + options.OptimizeLevelStyleCompaction(); + options.keep_log_file_num = 100; + options.max_log_file_size = 50 * 1024 * 1024; // 50MB + options.create_if_missing = true; + options.use_direct_reads = true; + options.write_buffer_size = 64 * 1024 * 1024; // 256MB + options.max_write_buffer_number = 4; + options.max_bytes_for_level_base = + options.max_write_buffer_number * options.write_buffer_size; + options.min_write_buffer_number_to_merge = 1; + options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB + options.memtable_prefix_bloom_size_ratio = 0.02; + options.num_levels = 4; + options.max_open_files = -1; + + options.compression = rocksdb::kNoCompression; + + auto& shard = _local_shards[shard_id]; + if (1) { + using DataType = shard_type::map_type::iterator; + std::vector datas; + datas.reserve(shard.size() * 0.8); + size_t idx = 0; + for (auto 
it = shard.begin(); it != shard.end(); ++it) { + if (!_value_accesor->SaveMemCache( + it.value().data(), 0, show_threshold, pass_id)) { + datas.emplace_back(it.it); + } + } + count.fetch_add(datas.size(), std::memory_order_relaxed); + VLOG(0) << "datas size: " << datas.size(); + { + // sst文件写入必须有序 + uint64_t show_begin = butil::gettimeofday_ms(); + std::sort(datas.begin(), + datas.end(), + [](const DataType& a, const DataType& b) { + return a->first < b->first; + }); + VLOG(0) << "sort shard " << shard_id << ": " + << butil::gettimeofday_ms() - show_begin + << " ms, num: " << datas.size(); + } + + //必须做空判断,否则sst_writer.Finish会core掉 + if (datas.size() != 0) { + rocksdb::SstFileWriter sst_writer(rocksdb::EnvOptions(), options); + std::string filename = + paddle::string::format_string("%s_%d/cache-%05d.sst", + FLAGS_rocksdb_path.c_str(), + shard_id, + cache_table_count); + rocksdb::Status status = sst_writer.Open(filename); + if (!status.ok()) { + VLOG(0) << "sst writer open " << filename << "failed" + << ", " << status.getState(); + abort(); + } + VLOG(0) << "sst writer open " << filename; + + uint64_t show_begin = butil::gettimeofday_ms(); + for (auto& data : datas) { + uint64_t tmp_key = data->first; + FixedFeatureValue& tmp_value = + *((FixedFeatureValue*)(void*)(data->second)); + status = sst_writer.Put( + rocksdb::Slice((char*)(&(tmp_key)), sizeof(uint64_t)), + rocksdb::Slice((char*)(tmp_value.data()), + tmp_value.size() * sizeof(float))); + if (!status.ok()) { + VLOG(0) << "fatal in Put file: " << filename << ", " + << status.getState(); + abort(); + } + } + status = sst_writer.Finish(); + if (!status.ok()) { + VLOG(0) << "fatal in finish file: " << filename << ", " + << status.getState(); + abort(); + } + VLOG(0) << "write sst_file shard " << shard_id << ": " + << butil::gettimeofday_ms() - show_begin << " ms"; + int ret = _db->ingest_externel_file(shard_id, {filename}); + if (ret) { + VLOG(0) << "ingest file failed" + << ", " << status.getState(); + abort(); + } + } + + for (auto it = shard.begin(); it != shard.end();) { + if (!_value_accesor->SaveMemCache( + it.value().data(), 0, show_threshold, pass_id)) { + it = shard.erase(it); + } else { + ++it; + } + } + } + return 0; + }); + tasks.push_back(std::move(fut)); + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + tasks.clear(); + + VLOG(0) << "Table>> cache ssd count: " << count.load(); + VLOG(0) << "Table>> after update, mem feasign size:" << LocalSize(); + // release_table_mutex(); + return 0; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index 55a05bbab5ec2..6fe13082240d7 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -38,7 +38,11 @@ class SSDSparseTable : public MemorySparseTable { int32_t Push(TableContext& context) override; int32_t PullSparse(float* pull_values, const uint64_t* keys, size_t num); - int32_t PullSparsePtr(char** pull_values, const uint64_t* keys, size_t num); + int32_t PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* keys, + size_t num, + uint16_t pass_id); int32_t PushSparse(const uint64_t* keys, const float* values, size_t num); int32_t PushSparse(const uint64_t* keys, const float** values, size_t num); @@ -77,10 +81,15 @@ class SSDSparseTable : public MemorySparseTable { const std::string& param); int64_t LocalSize(); + std::pair PrintTableStat() override; + + 
int32_t CacheTable(uint16_t pass_id) override; + private: RocksDBHandler* _db; int64_t _cache_tk_size; double _local_show_threshold{0.0}; + std::vector> _fs_channel; }; } // namespace distributed diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index aee707712f662..f07a3f2132217 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ b/paddle/fluid/distributed/ps/table/table.h @@ -62,6 +62,8 @@ struct TableContext { size_t num; bool use_ptr = false; uint32_t trainer_id; // for GEO and global step + int shard_id; // for gpups + uint16_t pass_id; // for gpups ssd }; class Table { @@ -147,6 +149,7 @@ class Table { virtual void *GetShard(size_t shard_idx) = 0; virtual std::pair PrintTableStat() { return {0, 0}; } + virtual int32_t CacheTable(uint16_t pass_id) { return 0; } // for patch model virtual void Revert() {} diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index 5df74883f9247..db0dcf0605dc7 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -748,6 +748,17 @@ void FleetWrapper::PrintTableStat(const uint64_t table_id) { } } +void FleetWrapper::SaveCacheTable(const uint64_t table_id, + uint16_t pass_id, + size_t threshold) { + auto ret = worker_ptr_->SaveCacheTable(table_id, pass_id, threshold); + ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "save cache table stat failed"; + } +} + void FleetWrapper::ShrinkSparseTable(int table_id, int threshold) { auto ret = worker_ptr_->Shrink(table_id, std::to_string(threshold)); ret.wait(); diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h old mode 100755 new mode 100644 index 28347b3502707..5065fb380a346 --- a/paddle/fluid/distributed/ps/wrapper/fleet.h +++ b/paddle/fluid/distributed/ps/wrapper/fleet.h @@ -242,6 +242,9 @@ class FleetWrapper { void BarrierWithTable(uint32_t barrier_type); void PrintTableStat(const uint64_t table_id); + void SaveCacheTable(const uint64_t table_id, + uint16_t pass_id, + size_t threshold); // mode = 0, load all feature // mode = 1, load delta feature, which means load diff void LoadModel(const std::string& path, const int mode); diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 47dc8abf402b9..fe6c51b87228a 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -353,7 +353,7 @@ class ChannelReader { } if (cursor_ >= buffer_.size()) { cursor_ = 0; - if (channel_->read(buffer_) == 0) { + if (channel_->Read(buffer_) == 0) { failed_ = true; return *this; } diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index c88c91f166112..0bad79fdcb3d2 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -2118,6 +2118,10 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { } } +void SlotRecordInMemoryDataFeed::InitGraphResource() { + gpu_graph_data_generator_.AllocResource(thread_id_, feed_vec_); +} + void SlotRecordInMemoryDataFeed::LoadIntoMemory() { VLOG(3) << "SlotRecord LoadIntoMemory() begin, thread_id=" << thread_id_; if (!so_parser_name_.empty()) { @@ -2654,7 +2658,7 @@ bool SlotRecordInMemoryDataFeed::Start() { pack_ = BatchGpuPackMgr().get(this->GetPlace(), used_slots_info_); #endif #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) - 
gpu_graph_data_generator_.AllocResource(this->place_, feed_vec_); + gpu_graph_data_generator_.SetFeedVec(feed_vec_); #endif return true; } @@ -2696,6 +2700,12 @@ int SlotRecordInMemoryDataFeed::Next() { #endif } +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) +void SlotRecordInMemoryDataFeed::DoWalk() { + gpu_graph_data_generator_.DoWalk(); +} +#endif + #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) { int offset_cols_size = (ins_num + 1); diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index 3c4f2c5bbc74d..074fa407c5b51 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -24,9 +24,14 @@ limitations under the License. */ #include #include "cub/cub.cuh" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" +#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" DECLARE_bool(enable_opt_get_features); +DECLARE_int32(gpugraph_storage_mode); +DECLARE_double(gpugraph_hbm_table_load_factor); namespace paddle { namespace framework { @@ -35,6 +40,11 @@ namespace framework { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) +#define DEBUG_STATE(state) \ + VLOG(2) << "left: " << state->left << " right: " << state->right \ + << " central_word: " << state->central_word \ + << " step: " << state->step << " cursor: " << state->cursor \ + << " len: " << state->len << " row_num: " << state->row_num; \ // CUDA: use 512 threads per block const int CUDA_NUM_THREADS = 512; // CUDA: number of blocks for threads. 
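For context on the data_feed.cu hunks that follow: CUDA_NUM_THREADS fixes the block size at 512 and CUDA_KERNEL_LOOP (visible in the context above) is a grid-stride loop, so a kernel stays correct for any element count regardless of how many blocks are launched. Below is a minimal, self-contained sketch of that launch pattern, assuming nothing beyond the CUDA runtime; GetBlocks, FillOnesKernel and FillOnes are illustrative names, not symbols from this patch, and the constant fill merely mirrors the spirit of kernels such as GraphFillCVMKernel.

#include <cstdint>
#include <cuda_runtime.h>

constexpr int kCudaNumThreads = 512;   // mirrors CUDA_NUM_THREADS above

// Illustrative helper: the usual ceiling division to pick a grid size.
inline int GetBlocks(int n) {
  return (n + kCudaNumThreads - 1) / kCudaNumThreads;
}

// Grid-stride loop: every thread strides over the range, so the kernel
// covers all n elements even when n exceeds gridDim.x * blockDim.x.
__global__ void FillOnesKernel(int64_t* out, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[i] = 1;
  }
}

void FillOnes(int64_t* d_out, int n, cudaStream_t stream) {
  FillOnesKernel<<<GetBlocks(n), kCudaNumThreads, 0, stream>>>(d_out, n);
}

Taking the stream as an explicit launch argument is the same design choice that lets this patch keep batch assembly on train_stream_ and graph sampling on sample_stream_ without implicit synchronization between the two.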
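Stepping back to the SSDSparseTable Save/Load rework earlier in this patch: the per-shard write_channel retry loops are replaced with a pipelined design in which threads stream serialized records through paddle::framework::Channel objects (ChannelWriter's operator<< on the producer side, ChannelReader's operator>> on the consumer side, with Flush/Close to finish), while OMP workers parse each line and route it either into the in-memory shard or into RocksDB via put_batch. The following is a hedged, framework-free sketch of that producer/consumer shape using a plain bounded queue rather than the Paddle Channel API; BoundedQueue, Put, Get, Close and the driver in main are all illustrative, not patch symbols.

#include <condition_variable>
#include <cstddef>
#include <mutex>
#include <queue>
#include <string>
#include <thread>

// Minimal bounded queue standing in for the channel: a producer thread
// pushes serialized lines, a consumer thread drains and parses them.
class BoundedQueue {
 public:
  explicit BoundedQueue(size_t cap) : cap_(cap) {}

  void Put(std::string line) {
    std::unique_lock<std::mutex> lk(mu_);
    not_full_.wait(lk, [this] { return q_.size() < cap_; });
    q_.push(std::move(line));
    not_empty_.notify_one();
  }

  // Returns false once the queue is closed and fully drained.
  bool Get(std::string* out) {
    std::unique_lock<std::mutex> lk(mu_);
    not_empty_.wait(lk, [this] { return !q_.empty() || closed_; });
    if (q_.empty()) return false;
    *out = std::move(q_.front());
    q_.pop();
    not_full_.notify_one();
    return true;
  }

  void Close() {
    std::lock_guard<std::mutex> lk(mu_);
    closed_ = true;
    not_empty_.notify_all();
  }

 private:
  std::mutex mu_;
  std::condition_variable not_full_, not_empty_;
  std::queue<std::string> q_;
  const size_t cap_;
  bool closed_ = false;
};

int main() {
  BoundedQueue ch(30000);  // capacity mirrors the MakeChannel(30000) calls

  // Producer: in the patch, the per-shard load thread reading lines from AFS.
  std::thread producer([&ch] {
    for (int i = 0; i < 100000; ++i) ch.Put(std::to_string(i) + "\t1 2 3");
    ch.Close();
  });

  // Consumer: in the patch, the OMP worker that parses each line and routes
  // it to the in-memory shard or to RocksDB with put_batch.
  size_t parsed = 0;
  std::string line;
  while (ch.Get(&line)) ++parsed;

  producer.join();
  return parsed == 100000 ? 0 : 1;
}

The point of the bounded capacity is back-pressure: the file readers cannot run arbitrarily far ahead of the parsers, so memory stays bounded while I/O and parsing overlap.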
@@ -207,13 +217,13 @@ __global__ void CopyDuplicateKeys(int64_t *dist_tensor, int GraphDataGenerator::AcquireInstance(BufState *state) { // if (state->GetNextStep()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } else if (state->GetNextCentrolWord()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } else if (state->GetNextBatch()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } return 0; @@ -338,64 +348,152 @@ __global__ void GraphFillSlotLodKernel(int64_t *id_tensor, int len) { CUDA_KERNEL_LOOP(idx, len) { id_tensor[idx] = idx; } } -int GraphDataGenerator::FillInsBuf() { - if (ins_buf_pair_len_ >= batch_size_) { - return batch_size_; +int GraphDataGenerator::FillIdShowClkTensor(int total_instance, + bool gpu_graph_training, + size_t cursor) { + id_tensor_ptr_ = + feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); + show_tensor_ptr_ = + feed_vec_[1]->mutable_data({total_instance}, this->place_); + clk_tensor_ptr_ = + feed_vec_[2]->mutable_data({total_instance}, this->place_); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + if (gpu_graph_training) { + uint64_t *ins_cursor, *ins_buf; + ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; + cudaMemcpyAsync(id_tensor_ptr_, + ins_cursor, + sizeof(uint64_t) * total_instance, + cudaMemcpyDeviceToDevice, + train_stream_); + } else { + uint64_t *d_type_keys = + reinterpret_cast(d_device_keys_[cursor]->ptr()); + d_type_keys += infer_node_start_; + infer_node_start_ += total_instance / 2; + CopyDuplicateKeys<<>>( + id_tensor_ptr_, d_type_keys, total_instance / 2); } - int total_instance = AcquireInstance(&buf_state_); - VLOG(2) << "total_ins: " << total_instance; - buf_state_.Debug(); + GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); + GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); + return 0; +} - if (total_instance == 0) { - int res = FillWalkBuf(d_walk_); - if (!res) { - // graph iterate complete - return -1; - } else { - total_instance = buf_state_.len; - VLOG(2) << "total_ins: " << total_instance; - buf_state_.Debug(); - // if (total_instance == 0) { - // return -1; - //} +int GraphDataGenerator::FillGraphSlotFeature(int total_instance, + bool gpu_graph_training) { + int64_t *slot_tensor_ptr_[slot_num_]; + int64_t *slot_lod_tensor_ptr_[slot_num_]; + for (int i = 0; i < slot_num_; ++i) { + slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data( + {total_instance, 1}, this->place_); + slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data( + {total_instance + 1}, this->place_); + } + uint64_t *ins_cursor, *ins_buf; + if (gpu_graph_training) { + ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; + } else { + id_tensor_ptr_ = + feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); + ins_cursor = (uint64_t *)id_tensor_ptr_; + } + + cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(), + slot_tensor_ptr_, + sizeof(uint64_t *) * slot_num_, + cudaMemcpyHostToDevice, + train_stream_); + cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(), + slot_lod_tensor_ptr_, + sizeof(uint64_t *) * slot_num_, + cudaMemcpyHostToDevice, + train_stream_); + uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); + FillFeatureBuf(ins_cursor, feature_buf, total_instance); + GraphFillSlotKernel<<>>((uint64_t *)d_slot_tensor_ptr_->ptr(), + feature_buf, + total_instance * slot_num_, + total_instance, + slot_num_); + GraphFillSlotLodKernelOpt<<>>( + (uint64_t 
*)d_slot_lod_tensor_ptr_->ptr(), + (total_instance + 1) * slot_num_, + total_instance + 1); + if (debug_mode_) { + uint64_t h_walk[total_instance]; + cudaMemcpy(h_walk, + ins_cursor, + total_instance * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + uint64_t h_feature[total_instance * slot_num_]; + cudaMemcpy(h_feature, + feature_buf, + total_instance * slot_num_ * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int i = 0; i < total_instance; ++i) { + std::stringstream ss; + for (int j = 0; j < slot_num_; ++j) { + ss << h_feature[i * slot_num_ + j] << " "; + } + VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i + << "] = " << (uint64_t)h_walk[i] << " feature[" << i * slot_num_ + << ".." << (i + 1) * slot_num_ << "] = " << ss.str(); } - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - FillFeatureBuf(d_walk_, d_feature_); - if (debug_mode_) { - int len = buf_size_ > 5000 ? 5000 : buf_size_; - uint64_t h_walk[len]; - cudaMemcpy(h_walk, - d_walk_->ptr(), - len * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - uint64_t h_feature[len * slot_num_]; - cudaMemcpy(h_feature, - d_feature_->ptr(), - len * slot_num_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int i = 0; i < len; ++i) { - std::stringstream ss; - for (int j = 0; j < slot_num_; ++j) { - ss << h_feature[i * slot_num_ + j] << " "; - } - VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i - << "] = " << (uint64_t)h_walk[i] << " feature[" - << i * slot_num_ << ".." << (i + 1) * slot_num_ - << "] = " << ss.str(); - } + uint64_t h_slot_tensor[slot_num_][total_instance]; + uint64_t h_slot_lod_tensor[slot_num_][total_instance + 1]; + for (int i = 0; i < slot_num_; ++i) { + cudaMemcpy(h_slot_tensor[i], + slot_tensor_ptr_[i], + total_instance * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + int len = total_instance > 5000 ? 5000 : total_instance; + for (int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i << "][" << j + << "] = " << h_slot_tensor[i][j]; + } + + cudaMemcpy(h_slot_lod_tensor[i], + slot_lod_tensor_ptr_[i], + (total_instance + 1) * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + len = total_instance + 1 > 5000 ? 
5000 : total_instance + 1; + for (int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i << "][" << j + << "] = " << h_slot_lod_tensor[i][j]; } } } + return 0; +} +int GraphDataGenerator::MakeInsPair() { uint64_t *walk = reinterpret_cast(d_walk_->ptr()); uint64_t *ins_buf = reinterpret_cast(d_ins_buf_->ptr()); int *random_row = reinterpret_cast(d_random_row_->ptr()); int *d_pair_num = reinterpret_cast(d_pair_num_->ptr()); - cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); + cudaMemsetAsync(d_pair_num, 0, sizeof(int), train_stream_); int len = buf_state_.len; - GraphFillIdKernel<<>>( + // make pair + GraphFillIdKernel<<>>( ins_buf + ins_buf_pair_len_ * 2, d_pair_num, walk, @@ -405,29 +503,12 @@ int GraphDataGenerator::FillInsBuf() { len, walk_len_); int h_pair_num; - cudaMemcpyAsync( - &h_pair_num, d_pair_num, sizeof(int), cudaMemcpyDeviceToHost, stream_); - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); - uint64_t *feature = reinterpret_cast(d_feature_->ptr()); - cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); - int len = buf_state_.len; - VLOG(2) << "feature_buf start[" << ins_buf_pair_len_ * 2 * slot_num_ - << "] len[" << len << "]"; - GraphFillFeatureKernel<<>>( - feature_buf + ins_buf_pair_len_ * 2 * slot_num_, - d_pair_num, - walk, - feature, - random_row + buf_state_.cursor, - buf_state_.central_word, - window_step_[buf_state_.step], - len, - walk_len_, - slot_num_); - } - - cudaStreamSynchronize(stream_); + cudaMemcpyAsync(&h_pair_num, + d_pair_num, + sizeof(int), + cudaMemcpyDeviceToHost, + train_stream_); + cudaStreamSynchronize(train_stream_); ins_buf_pair_len_ += h_pair_num; if (debug_mode_) { @@ -441,70 +522,41 @@ int GraphDataGenerator::FillInsBuf() { for (int xx = 0; xx < 2 * ins_buf_pair_len_; xx++) { VLOG(2) << "h_ins_buf[" << xx << "]: " << h_ins_buf[xx]; } - delete[] h_ins_buf; - - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - uint64_t *feature_buf = - reinterpret_cast(d_feature_buf_->ptr()); - uint64_t h_feature_buf[(batch_size_ * 2 * 2) * slot_num_]; - cudaMemcpy(h_feature_buf, - feature_buf, - (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int xx = 0; xx < (batch_size_ * 2 * 2) * slot_num_; xx++) { - VLOG(2) << "h_feature_buf[" << xx << "]: " << h_feature_buf[xx]; - } - } } return ins_buf_pair_len_; } +int GraphDataGenerator::FillInsBuf() { + if (ins_buf_pair_len_ >= batch_size_) { + return batch_size_; + } + int total_instance = AcquireInstance(&buf_state_); + + VLOG(2) << "total_ins: " << total_instance; + buf_state_.Debug(); + + if (total_instance == 0) { + return -1; + } + return MakeInsPair(); +} + int GraphDataGenerator::GenerateBatch() { int total_instance = 0; platform::CUDADeviceGuard guard(gpuid_); int res = 0; + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); if (!gpu_graph_training_) { - while (cursor_ < h_device_keys_.size()) { - size_t device_key_size = h_device_keys_[cursor_]->size(); - if (infer_node_type_start_[cursor_] >= device_key_size) { - cursor_++; - continue; - } - total_instance = - (infer_node_type_start_[cursor_] + batch_size_ <= device_key_size) - ? 
batch_size_ - : device_key_size - infer_node_type_start_[cursor_]; - uint64_t *d_type_keys = - reinterpret_cast(d_device_keys_[cursor_]->ptr()); - d_type_keys += infer_node_type_start_[cursor_]; - infer_node_type_start_[cursor_] += total_instance; - VLOG(1) << "in graph_data generator:batch_size = " << batch_size_ - << " instance = " << total_instance; - total_instance *= 2; - id_tensor_ptr_ = feed_vec_[0]->mutable_data({total_instance, 1}, - this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({total_instance}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({total_instance}, this->place_); - CopyDuplicateKeys<<>>( - id_tensor_ptr_, d_type_keys, total_instance / 2); - GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); - GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); - break; - } + total_instance = (infer_node_start_ + batch_size_ <= infer_node_end_) + ? batch_size_ + : infer_node_end_ - infer_node_start_; + VLOG(1) << "in graph_data generator:batch_size = " << batch_size_ + << " instance = " << total_instance; + total_instance *= 2; if (total_instance == 0) { return 0; } + FillIdShowClkTensor(total_instance, gpu_graph_training_, cursor_); } else { while (ins_buf_pair_len_ < batch_size_) { res = FillInsBuf(); @@ -518,132 +570,15 @@ int GraphDataGenerator::GenerateBatch() { } total_instance = ins_buf_pair_len_ < batch_size_ ? ins_buf_pair_len_ : batch_size_; - total_instance *= 2; - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({total_instance}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({total_instance}, this->place_); - } - - int64_t *slot_tensor_ptr_[slot_num_]; - int64_t *slot_lod_tensor_ptr_[slot_num_]; - if (slot_num_ > 0) { - for (int i = 0; i < slot_num_; ++i) { - slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data( - {total_instance, 1}, this->place_); - slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data( - {total_instance + 1}, this->place_); - } - if (FLAGS_enable_opt_get_features || !gpu_graph_training_) { - cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(), - slot_tensor_ptr_, - sizeof(uint64_t *) * slot_num_, - cudaMemcpyHostToDevice, - stream_); - cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(), - slot_lod_tensor_ptr_, - sizeof(uint64_t *) * slot_num_, - cudaMemcpyHostToDevice, - stream_); - } - } - - uint64_t *ins_cursor, *ins_buf; - if (gpu_graph_training_) { VLOG(2) << "total_instance: " << total_instance << ", ins_buf_pair_len = " << ins_buf_pair_len_; - // uint64_t *ins_buf = reinterpret_cast(d_ins_buf_->ptr()); - // uint64_t *ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; - ins_buf = reinterpret_cast(d_ins_buf_->ptr()); - ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; - cudaMemcpyAsync(id_tensor_ptr_, - ins_cursor, - sizeof(uint64_t) * total_instance, - cudaMemcpyDeviceToDevice, - stream_); - - GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); - GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); - } else { - ins_cursor = (uint64_t *)id_tensor_ptr_; + FillIdShowClkTensor(total_instance, gpu_graph_training_); } if (slot_num_ > 0) { - uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); - if (FLAGS_enable_opt_get_features || !gpu_graph_training_) { - FillFeatureBuf(ins_cursor, feature_buf, total_instance); - // FillFeatureBuf(id_tensor_ptr_, feature_buf, total_instance); - if (debug_mode_) { - uint64_t h_walk[total_instance]; - cudaMemcpy(h_walk, - 
ins_cursor, - total_instance * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - uint64_t h_feature[total_instance * slot_num_]; - cudaMemcpy(h_feature, - feature_buf, - total_instance * slot_num_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int i = 0; i < total_instance; ++i) { - std::stringstream ss; - for (int j = 0; j < slot_num_; ++j) { - ss << h_feature[i * slot_num_ + j] << " "; - } - VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i - << "] = " << (uint64_t)h_walk[i] << " feature[" - << i * slot_num_ << ".." << (i + 1) * slot_num_ - << "] = " << ss.str(); - } - } - - GraphFillSlotKernel<<>>((uint64_t *)d_slot_tensor_ptr_->ptr(), - feature_buf, - total_instance * slot_num_, - total_instance, - slot_num_); - GraphFillSlotLodKernelOpt<<>>( - (uint64_t *)d_slot_lod_tensor_ptr_->ptr(), - (total_instance + 1) * slot_num_, - total_instance + 1); - } else { - for (int i = 0; i < slot_num_; ++i) { - int feature_buf_offset = - (ins_buf_pair_len_ * 2 - total_instance) * slot_num_ + i * 2; - for (int j = 0; j < total_instance; j += 2) { - VLOG(2) << "slot_tensor[" << i << "][" << j << "] <- feature_buf[" - << feature_buf_offset + j * slot_num_ << "]"; - VLOG(2) << "slot_tensor[" << i << "][" << j + 1 << "] <- feature_buf[" - << feature_buf_offset + j * slot_num_ + 1 << "]"; - cudaMemcpyAsync(slot_tensor_ptr_[i] + j, - &feature_buf[feature_buf_offset + j * slot_num_], - sizeof(uint64_t) * 2, - cudaMemcpyDeviceToDevice, - stream_); - } - GraphFillSlotLodKernel<<>>(slot_lod_tensor_ptr_[i], - total_instance + 1); - } - } + FillGraphSlotFeature(total_instance, gpu_graph_training_); } - offset_.clear(); offset_.push_back(0); offset_.push_back(total_instance); @@ -655,34 +590,9 @@ int GraphDataGenerator::GenerateBatch() { } } - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(train_stream_); if (!gpu_graph_training_) return 1; ins_buf_pair_len_ -= total_instance / 2; - if (debug_mode_) { - uint64_t h_slot_tensor[slot_num_][total_instance]; - uint64_t h_slot_lod_tensor[slot_num_][total_instance + 1]; - for (int i = 0; i < slot_num_; ++i) { - cudaMemcpy(h_slot_tensor[i], - slot_tensor_ptr_[i], - total_instance * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - int len = total_instance > 5000 ? 5000 : total_instance; - for (int j = 0; j < len; ++j) { - VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i << "][" << j - << "] = " << h_slot_tensor[i][j]; - } - - cudaMemcpy(h_slot_lod_tensor[i], - slot_lod_tensor_ptr_[i], - (total_instance + 1) * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - len = total_instance + 1 > 5000 ? 
5000 : total_instance + 1; - for (int j = 0; j < len; ++j) { - VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i << "][" << j - << "] = " << h_slot_lod_tensor[i][j]; - } - } - } return 1; } @@ -750,6 +660,66 @@ __global__ void GraphFillFirstStepKernel(int *prefix_sum, } } +__global__ void GetUniqueFeaNum(uint64_t *d_in, + uint64_t *unique_num, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ uint64_t local_num; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len - 1) { + if (d_in[i] != d_in[i + 1]) { + atomicAdd(&local_num, 1); + } + } + if (i == len - 1) { + atomicAdd(&local_num, 1); + } + + __syncthreads(); + if (threadIdx.x == 0) { + atomicAdd(unique_num, local_num); + } +} + +__global__ void UniqueFeature(uint64_t *d_in, + uint64_t *d_out, + uint64_t *unique_num, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ uint64_t local_key[CUDA_NUM_THREADS]; + __shared__ uint64_t local_num; + __shared__ uint64_t global_num; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len - 1) { + if (d_in[i] != d_in[i + 1]) { + size_t dst = atomicAdd(&local_num, 1); + local_key[dst] = d_in[i]; + } + } + if (i == len - 1) { + size_t dst = atomicAdd(&local_num, 1); + local_key[dst] = d_in[i]; + } + + __syncthreads(); + + if (threadIdx.x == 0) { + global_num = atomicAdd(unique_num, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + d_out[global_num + threadIdx.x] = local_key[threadIdx.x]; + } +} // Fill sample_res to the stepth column of walk void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, uint64_t *walk, @@ -773,45 +743,50 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, d_actual_sample_size, d_prefix_sum + 1, len, - stream_)); - auto d_temp_storage = memory::Alloc(place_, temp_storage_bytes); + sample_stream_)); + auto d_temp_storage = memory::Alloc( + place_, + temp_storage_bytes, + phi::Stream(reinterpret_cast(sample_stream_))); CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), temp_storage_bytes, d_actual_sample_size, d_prefix_sum + 1, len, - stream_)); + sample_stream_)); - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); if (step == 1) { - GraphFillFirstStepKernel<<>>( - d_prefix_sum, - d_tmp_sampleidx2row, - walk, - d_start_ids, - len, - walk_degree_, - walk_len_, - d_actual_sample_size, - d_neighbors, - d_sample_keys); + GraphFillFirstStepKernel<<>>(d_prefix_sum, + d_tmp_sampleidx2row, + walk, + d_start_ids, + len, + walk_degree_, + walk_len_, + d_actual_sample_size, + d_neighbors, + d_sample_keys); } else { GraphFillSampleKeysKernel<<>>(d_neighbors, - d_sample_keys, - d_prefix_sum, - d_sampleidx2row, - d_tmp_sampleidx2row, - d_actual_sample_size, - cur_degree, - len); - - GraphDoWalkKernel<<>>( + sample_stream_>>>(d_neighbors, + d_sample_keys, + d_prefix_sum, + d_sampleidx2row, + d_tmp_sampleidx2row, + d_actual_sample_size, + cur_degree, + len); + + GraphDoWalkKernel<<>>( d_neighbors, walk, d_prefix_sum, @@ -828,7 +803,6 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, int *h_prefix_sum = new int[len + 1]; int *h_actual_size = new int[len]; int *h_offset2idx = new int[once_max_sample_keynum]; - uint64_t h_sample_keys[once_max_sample_keynum]; cudaMemcpy(h_offset2idx, d_tmp_sampleidx2row, once_max_sample_keynum * sizeof(int), @@ -847,9 +821,8 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, delete[] h_prefix_sum; delete[] h_actual_size; delete[] 
h_offset2idx; - delete[] h_sample_keys; } - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); cur_sampleidx2row_ = 1 - cur_sampleidx2row_; } @@ -878,7 +851,155 @@ int GraphDataGenerator::FillFeatureBuf( return ret; } -int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { +// 尝试插入table, 0表示插入成功 +int GraphDataGenerator::InsertTable( + const unsigned long *d_keys, + unsigned long len, + std::shared_ptr d_uniq_node_num) { + uint64_t h_uniq_node_num = 0; + uint64_t *d_uniq_node_num_ptr = + reinterpret_cast(d_uniq_node_num->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num_ptr, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + // 产生了足够多的node,采样结束 + VLOG(2) << "table capcity: " << train_table_cap_ << ", " << h_uniq_node_num + << " used"; + if (h_uniq_node_num + len >= train_table_cap_) { + return 1; + } + table_->insert(d_keys, len, d_uniq_node_num_ptr, sample_stream_); + CUDA_CHECK(cudaStreamSynchronize(sample_stream_)); + return 0; +} + +std::shared_ptr GraphDataGenerator::GetTableKeys( + std::shared_ptr d_uniq_node_num, + uint64_t &h_uniq_node_num) { + uint64_t *d_uniq_node_num_ptr = + reinterpret_cast(d_uniq_node_num->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num_ptr, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + + auto d_uniq_node = memory::AllocShared( + place_, + h_uniq_node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + uint64_t *d_uniq_node_ptr = reinterpret_cast(d_uniq_node->ptr()); + + auto d_node_cursor = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + + uint64_t *d_node_cursor_ptr = + reinterpret_cast(d_node_cursor->ptr()); + cudaMemsetAsync(d_node_cursor_ptr, 0, sizeof(uint64_t), sample_stream_); + table_->get_keys(d_uniq_node_ptr, d_node_cursor_ptr, sample_stream_); + + cudaStreamSynchronize(sample_stream_); + return d_uniq_node; +} + +void GraphDataGenerator::CopyFeaFromTable( + std::shared_ptr d_uniq_fea_num) { + uint64_t h_uniq_fea_num = 0; + auto d_uniq_fea = GetTableKeys(d_uniq_fea_num, h_uniq_fea_num); + uint64_t *d_uniq_fea_ptr = reinterpret_cast(d_uniq_fea->ptr()); + size_t host_vec_size = host_vec_.size(); + host_vec_.resize(host_vec_size + h_uniq_fea_num); + + VLOG(0) << "uniq feature num: " << h_uniq_fea_num; + + cudaMemcpyAsync(host_vec_.data() + host_vec_size, + d_uniq_fea_ptr, + sizeof(uint64_t) * h_uniq_fea_num, + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); +} + +void GraphDataGenerator::DoWalk() { + int device_id = place_.GetDeviceId(); + debug_gpu_memory_info(device_id, "DoWalk start"); + if (gpu_graph_training_) { + FillWalkBuf(); + } else { + FillInferBuf(); + } + debug_gpu_memory_info(device_id, "DoWalk end"); +} + +void GraphDataGenerator::clear_gpu_mem() { + d_len_per_row_.reset(); + d_sample_keys_.reset(); + d_prefix_sum_.reset(); + for (size_t i = 0; i < d_sampleidx2rows_.size(); i++) { + d_sampleidx2rows_[i].reset(); + } + delete table_; +} + +int GraphDataGenerator::FillInferBuf() { + platform::CUDADeviceGuard guard(gpuid_); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto &global_infer_node_type_start = + gpu_graph_ptr->global_infer_node_type_start_[gpuid_]; + auto &infer_cursor = gpu_graph_ptr->infer_cursor_[thread_id_]; + total_row_ = 0; + if (infer_cursor < h_device_keys_len_.size()) { + if (global_infer_node_type_start[infer_cursor] >= + 
h_device_keys_len_[infer_cursor]) { + infer_cursor++; + if (infer_cursor >= h_device_keys_len_.size()) { + return 0; + } + } + size_t device_key_size = h_device_keys_len_[infer_cursor]; + total_row_ = + (global_infer_node_type_start[infer_cursor] + infer_table_cap_ <= + device_key_size) + ? infer_table_cap_ + : device_key_size - global_infer_node_type_start[infer_cursor]; + + host_vec_.resize(total_row_); + uint64_t *d_type_keys = + reinterpret_cast(d_device_keys_[infer_cursor]->ptr()); + cudaMemcpyAsync(host_vec_.data(), + d_type_keys + global_infer_node_type_start[infer_cursor], + sizeof(uint64_t) * total_row_, + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + VLOG(1) << "cursor: " << infer_cursor + << " start: " << global_infer_node_type_start[infer_cursor] + << " num: " << total_row_; + infer_node_start_ = global_infer_node_type_start[infer_cursor]; + global_infer_node_type_start[infer_cursor] += total_row_; + infer_node_end_ = global_infer_node_type_start[infer_cursor]; + cursor_ = infer_cursor; + } + return 0; +} + +void GraphDataGenerator::ClearSampleState() { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto &finish_node_type = gpu_graph_ptr->finish_node_type_[gpuid_]; + auto &node_type_start = gpu_graph_ptr->node_type_start_[gpuid_]; + finish_node_type.clear(); + for (auto iter = node_type_start.begin(); iter != node_type_start.end(); iter++) { + iter->second = 0; + } +} + +int GraphDataGenerator::FillWalkBuf() { platform::CUDADeviceGuard guard(gpuid_); size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; //////// @@ -896,30 +1017,42 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { } /////// auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - uint64_t *walk = reinterpret_cast(d_walk->ptr()); + uint64_t *walk = reinterpret_cast(d_walk_->ptr()); int *len_per_row = reinterpret_cast(d_len_per_row_->ptr()); uint64_t *d_sample_keys = reinterpret_cast(d_sample_keys_->ptr()); - cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), stream_); - cudaMemsetAsync( - len_per_row, 0, once_max_sample_keynum * sizeof(int), stream_); + cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), sample_stream_); + // cudaMemsetAsync( + // len_per_row, 0, once_max_sample_keynum * sizeof(int), sample_stream_); + int sample_times = 0; int i = 0; - int total_row = 0; - size_t node_type_len = first_node_type_.size(); + total_row_ = 0; + + // 获取全局采样状态 + auto &first_node_type = gpu_graph_ptr->first_node_type_; + auto &meta_path = gpu_graph_ptr->meta_path_; + auto &node_type_start = gpu_graph_ptr->node_type_start_[gpuid_]; + auto &finish_node_type = gpu_graph_ptr->finish_node_type_[gpuid_]; + auto &type_to_index = gpu_graph_ptr->get_graph_type_to_index(); + auto &cursor = gpu_graph_ptr->cursor_[thread_id_]; + size_t node_type_len = first_node_type.size(); int remain_size = buf_size_ - walk_degree_ * once_sample_startid_len_ * walk_len_; + int total_samples = 0; while (i <= remain_size) { - int cur_node_idx = cursor_ % node_type_len; - int node_type = first_node_type_[cur_node_idx]; - auto &path = meta_path_[cur_node_idx]; - size_t start = node_type_start_[node_type]; + int cur_node_idx = cursor % node_type_len; + int node_type = first_node_type[cur_node_idx]; + auto &path = meta_path[cur_node_idx]; + size_t start = node_type_start[node_type]; + VLOG(2) << "cur_node_idx = " << cur_node_idx + << " meta_path.size = " << meta_path.size(); // auto node_query_result = gpu_graph_ptr->query_node_list( - // gpuid_, node_type, start, 
once_sample_startid_len_); + // gpuid_, node_type, start, once_sample_startid_len_); // int tmp_len = node_query_result.actual_sample_size; VLOG(2) << "choose start type: " << node_type; - int type_index = type_to_index_[node_type]; - size_t device_key_size = h_device_keys_[type_index]->size(); + int type_index = type_to_index[node_type]; + size_t device_key_size = h_device_keys_len_[type_index]; VLOG(2) << "type: " << node_type << " size: " << device_key_size << " start: " << start; uint64_t *d_type_keys = @@ -927,21 +1060,19 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { int tmp_len = start + once_sample_startid_len_ > device_key_size ? device_key_size - start : once_sample_startid_len_; - node_type_start_[node_type] = tmp_len + start; + bool update = true; if (tmp_len == 0) { - finish_node_type_.insert(node_type); - if (finish_node_type_.size() == node_type_start_.size()) { + finish_node_type.insert(node_type); + if (finish_node_type.size() == node_type_start.size()) { + cursor = 0; + epoch_finish_ = true; break; } - cursor_ += 1; + cursor += 1; continue; } - // if (tmp_len == 0) { - // break; - //} - VLOG(2) << "i = " << i << " buf_size_ = " << buf_size_ - << " tmp_len = " << tmp_len << " cursor = " << cursor_ - << " once_max_sample_keynum = " << once_max_sample_keynum; + + VLOG(2) << "gpuid = " << gpuid_ << " path[0] = " << path[0]; uint64_t *cur_walk = walk + i; NeighborSampleQuery q; @@ -955,6 +1086,30 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { int step = 1; VLOG(2) << "sample edge type: " << path[0] << " step: " << 1; jump_rows_ = sample_res.total_sample_size; + total_samples += sample_res.total_sample_size; + VLOG(2) << "i = " << i << " start = " << start << " tmp_len = " << tmp_len + << " cursor = " << node_type << " cur_node_idx = " << cur_node_idx + << " jump row: " << jump_rows_; + VLOG(2) << "jump_row: " << jump_rows_; + if (jump_rows_ == 0) { + node_type_start[node_type] = tmp_len + start; + cursor += 1; + continue; + } + + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + if (InsertTable(d_type_keys + start, tmp_len, d_uniq_node_num_) != 0) { + VLOG(2) << "in step 0, insert key stage, table is full"; + update = false; + break; + } + if (InsertTable(sample_res.actual_val, sample_res.total_sample_size, d_uniq_node_num_) != + 0) { + VLOG(2) << "in step 0, insert sample res stage, table is full"; + update = false; + break; + } + } FillOneStep(d_type_keys + start, cur_walk, tmp_len, @@ -962,7 +1117,6 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { walk_degree_, step, len_per_row); - VLOG(2) << "jump_row: " << jump_rows_; ///////// if (debug_mode_) { cudaMemcpy( @@ -971,11 +1125,16 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } + + VLOG(2) << "sample, step=" << step << " sample_keys=" << tmp_len + << " sample_res_len=" << sample_res.total_sample_size; + ///////// step++; size_t path_len = path.size(); for (; step < walk_len_; step++) { if (sample_res.total_sample_size == 0) { + VLOG(2) << "sample finish, step=" << step; break; } auto sample_key_mem = sample_res.actual_val_mem; @@ -988,11 +1147,20 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { (uint64_t)sample_keys_ptr, 1, sample_res.total_sample_size); + int sample_key_len = sample_res.total_sample_size; sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false); - + total_samples += sample_res.total_sample_size; + if (FLAGS_gpugraph_storage_mode != 
GpuGraphStorageMode::WHOLE_HBM) { + if (InsertTable(sample_res.actual_val, sample_res.total_sample_size, d_uniq_node_num_) != + 0) { + VLOG(2) << "in step: " << step << ", table is full"; + update = false; + break; + } + } FillOneStep(d_type_keys + start, cur_walk, - sample_res.total_sample_size, + sample_key_len, sample_res, 1, step, @@ -1004,34 +1172,44 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } + + VLOG(2) << "sample, step=" << step << " sample_keys=" << sample_key_len + << " sample_res_len=" << sample_res.total_sample_size; + } + // 此时更新全局采样状态 + if (update == true) { + node_type_start[node_type] = tmp_len + start; + i += jump_rows_ * walk_len_; + total_row_ += jump_rows_; + cursor += 1; + sample_times++; + } else { + VLOG(2) << "table is full, not update stat!"; + break; } - // cursor_ += tmp_len; - i += jump_rows_ * walk_len_; - total_row += jump_rows_; - cursor_ += 1; } - buf_state_.Reset(total_row); + buf_state_.Reset(total_row_); int *d_random_row = reinterpret_cast(d_random_row_->ptr()); thrust::random::default_random_engine engine(shuffle_seed_); - const auto &exec_policy = thrust::cuda::par.on(stream_); + const auto &exec_policy = thrust::cuda::par.on(sample_stream_); thrust::counting_iterator cnt_iter(0); thrust::shuffle_copy(exec_policy, cnt_iter, - cnt_iter + total_row, + cnt_iter + total_row_, thrust::device_pointer_cast(d_random_row), engine); - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); shuffle_seed_ = engine(); if (debug_mode_) { - int *h_random_row = new int[total_row + 10]; + int *h_random_row = new int[total_row_ + 10]; cudaMemcpy(h_random_row, d_random_row, - total_row * sizeof(int), + total_row_ * sizeof(int), cudaMemcpyDeviceToHost); - for (int xx = 0; xx < total_row; xx++) { + for (int xx = 0; xx < total_row_; xx++) { VLOG(2) << "h_random_row[" << xx << "]: " << h_random_row[xx]; } delete[] h_random_row; @@ -1041,71 +1219,198 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { delete[] h_len_per_row; delete[] h_prefix_sum; } - return total_row != 0; + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + // table_->prefetch(cudaCpuDeviceId, sample_stream_); + // thrust::pair *kv = table_->data(); + // size_t size = table_->size(); + // uint64_t unused_key = std::numeric_limits::max(); + // for (size_t i = 0; i < size; i++) { + // if (kv[i].first == unused_key) { + // continue; + // } + // host_vec_.push_back(kv[i].first); + // } + + uint64_t h_uniq_node_num = 0; + uint64_t *d_uniq_node_num = + reinterpret_cast(d_uniq_node_num_->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + VLOG(2) << "h_uniq_node_num: " << h_uniq_node_num; + // 临时显存, 存储去重后的nodeid + auto d_uniq_node = memory::AllocShared( + place_, + h_uniq_node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + uint64_t *d_uniq_node_ptr = + reinterpret_cast(d_uniq_node->ptr()); + + auto d_node_cursor = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + + uint64_t *d_node_cursor_ptr = + reinterpret_cast(d_node_cursor->ptr()); + cudaMemsetAsync(d_node_cursor_ptr, 0, sizeof(uint64_t), sample_stream_); + // uint64_t unused_key = std::numeric_limits::max(); + table_->get_keys(d_uniq_node_ptr, d_node_cursor_ptr, sample_stream_); + + cudaStreamSynchronize(sample_stream_); + + 
host_vec_.resize(h_uniq_node_num); + cudaMemcpyAsync(host_vec_.data(), + d_uniq_node_ptr, + sizeof(uint64_t) * h_uniq_node_num, + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + VLOG(2) << "slot_num: " << slot_num_; + if (slot_num_ > 0) { + VLOG(2) << "uniq feature"; + auto d_uniq_fea_num = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync( + d_uniq_fea_num->ptr(), 0, sizeof(uint64_t), sample_stream_); + table_->clear(sample_stream_); + size_t cursor = 0; + size_t batch = 0; + d_feature_list_ = memory::AllocShared( + place_, + once_sample_startid_len_ * slot_num_ * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + uint64_t *d_feature_list_ptr = + reinterpret_cast(d_feature_list_->ptr()); + while (cursor < h_uniq_node_num) { + batch = (cursor + once_sample_startid_len_ <= h_uniq_node_num) + ? once_sample_startid_len_ + : h_uniq_node_num - cursor; + + int ret = gpu_graph_ptr->get_feature_of_nodes(gpuid_, + d_uniq_node_ptr + cursor, + d_feature_list_ptr, + batch, + slot_num_); + if (InsertTable( + d_feature_list_ptr, slot_num_ * batch, d_uniq_fea_num)) { + CopyFeaFromTable(d_uniq_fea_num); + table_->clear(sample_stream_); + cudaMemsetAsync( + d_uniq_fea_num->ptr(), 0, sizeof(uint64_t), sample_stream_); + } + cursor += batch; + } + CopyFeaFromTable(d_uniq_fea_num); + } + + VLOG(0) << "sample_times:" << sample_times + << ", d_walk_size:" << buf_size_ + << ", d_walk_offset:" << i + << ", total_rows:" << total_row_ + << ", total_samples:" << total_samples + << ", h_uniq_node_num:" << h_uniq_node_num; + } + return total_row_ != 0; } -void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, - std::vector feed_vec) { - place_ = place; - gpuid_ = place_.GetDeviceId(); - VLOG(3) << "gpuid " << gpuid_; - stream_ = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); +void GraphDataGenerator::SetFeedVec(std::vector feed_vec) { feed_vec_ = feed_vec; - slot_num_ = (feed_vec_.size() - 3) / 2; - - // d_device_keys_.resize(h_device_keys_.size()); - VLOG(2) << "h_device_keys size: " << h_device_keys_.size(); - infer_node_type_start_ = std::vector(h_device_keys_.size(), 0); - for (size_t i = 0; i < h_device_keys_.size(); i++) { - for (size_t j = 0; j < h_device_keys_[i]->size(); j++) { - VLOG(3) << "h_device_keys_[" << i << "][" << j - << "] = " << (*(h_device_keys_[i]))[j]; - } - auto buf = memory::AllocShared( - place_, h_device_keys_[i]->size() * sizeof(uint64_t)); - d_device_keys_.push_back(buf); - CUDA_CHECK(cudaMemcpyAsync(buf->ptr(), - h_device_keys_[i]->data(), - h_device_keys_[i]->size() * sizeof(uint64_t), - cudaMemcpyHostToDevice, - stream_)); +} +void GraphDataGenerator::AllocResource(int thread_id, + std::vector feed_vec) { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + gpuid_ = gpu_graph_ptr->device_id_mapping[thread_id]; + thread_id_ = thread_id; + place_ = platform::CUDAPlace(gpuid_); + debug_gpu_memory_info(gpuid_, "AllocResource start"); + + platform::CUDADeviceGuard guard(gpuid_); + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + table_ = new HashTable( + train_table_cap_ / FLAGS_gpugraph_hbm_table_load_factor); + } + VLOG(1) << "AllocResource gpuid " << gpuid_ + << " feed_vec.size: " << feed_vec.size() + << " table cap: " << train_table_cap_; + sample_stream_ = gpu_graph_ptr->get_local_stream(gpuid_); + train_stream_ = dynamic_cast( + 
platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + // feed_vec_ = feed_vec; + slot_num_ = (feed_vec.size() - 3) / 2; + + // infer_node_type_start_ = std::vector(h_device_keys_.size(), 0); + // for (size_t i = 0; i < h_device_keys_.size(); i++) { + // for (size_t j = 0; j < h_device_keys_[i]->size(); j++) { + // VLOG(3) << "h_device_keys_[" << i << "][" << j + // << "] = " << (*(h_device_keys_[i]))[j]; + // } + // auto buf = memory::AllocShared( + // place_, h_device_keys_[i]->size() * sizeof(uint64_t)); + // d_device_keys_.push_back(buf); + // CUDA_CHECK(cudaMemcpyAsync(buf->ptr(), + // h_device_keys_[i]->data(), + // h_device_keys_[i]->size() * sizeof(uint64_t), + // cudaMemcpyHostToDevice, + // stream_)); + // } + auto &d_graph_all_type_keys = gpu_graph_ptr->d_graph_all_type_total_keys_; + auto &h_graph_all_type_keys_len = gpu_graph_ptr->h_graph_all_type_keys_len_; + + for (size_t i = 0; i < d_graph_all_type_keys.size(); i++) { + d_device_keys_.push_back(d_graph_all_type_keys[i][thread_id]); + h_device_keys_len_.push_back(h_graph_all_type_keys_len[i][thread_id]); } - // h_device_keys_ = h_device_keys; - // device_key_size_ = h_device_keys_->size(); - // d_device_keys_ = - // memory::AllocShared(place_, device_key_size_ * sizeof(int64_t)); - // CUDA_CHECK(cudaMemcpyAsync(d_device_keys_->ptr(), h_device_keys_->data(), - // device_key_size_ * sizeof(int64_t), - // cudaMemcpyHostToDevice, stream_)); + VLOG(2) << "h_device_keys size: " << h_device_keys_len_.size(); size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; - d_prefix_sum_ = - memory::AllocShared(place_, (once_max_sample_keynum + 1) * sizeof(int)); + d_prefix_sum_ = memory::AllocShared( + place_, + (once_max_sample_keynum + 1) * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); int *d_prefix_sum_ptr = reinterpret_cast(d_prefix_sum_->ptr()); - cudaMemsetAsync( - d_prefix_sum_ptr, 0, (once_max_sample_keynum + 1) * sizeof(int), stream_); + cudaMemsetAsync(d_prefix_sum_ptr, + 0, + (once_max_sample_keynum + 1) * sizeof(int), + sample_stream_); cursor_ = 0; jump_rows_ = 0; - d_walk_ = memory::AllocShared(place_, buf_size_ * sizeof(uint64_t)); - cudaMemsetAsync(d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - d_feature_ = - memory::AllocShared(place_, buf_size_ * slot_num_ * sizeof(uint64_t)); - cudaMemsetAsync( - d_feature_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); - } - d_sample_keys_ = - memory::AllocShared(place_, once_max_sample_keynum * sizeof(uint64_t)); + d_uniq_node_num_ = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync(d_uniq_node_num_->ptr(), 0, sizeof(uint64_t), sample_stream_); - d_sampleidx2rows_.push_back( - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); - d_sampleidx2rows_.push_back( - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); + d_walk_ = memory::AllocShared( + place_, + buf_size_ * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync( + d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), sample_stream_); + d_sample_keys_ = memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + + d_sampleidx2rows_.push_back(memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_)))); + d_sampleidx2rows_.push_back(memory::AllocShared( + 
place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_)))); cur_sampleidx2row_ = 0; - d_len_per_row_ = - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int)); + d_len_per_row_ = memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); for (int i = -window_; i < 0; i++) { window_step_.push_back(i); } @@ -1115,7 +1420,8 @@ void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, buf_state_.Init(batch_size_, walk_len_, &window_step_); d_random_row_ = memory::AllocShared( place_, - (once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int)); + (once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); shuffle_seed_ = 0; ins_buf_pair_len_ = 0; @@ -1126,14 +1432,15 @@ void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, place_, (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t)); } d_pair_num_ = memory::AllocShared(place_, sizeof(int)); - if (FLAGS_enable_opt_get_features && slot_num_ > 0) { - d_slot_tensor_ptr_ = - memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); - d_slot_lod_tensor_ptr_ = - memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); - } - cudaStreamSynchronize(stream_); + d_slot_tensor_ptr_ = + memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); + d_slot_lod_tensor_ptr_ = + memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); + + cudaStreamSynchronize(sample_stream_); + + debug_gpu_memory_info(gpuid_, "AllocResource end"); } void GraphDataGenerator::SetConfig( @@ -1153,48 +1460,22 @@ void GraphDataGenerator::SetConfig( repeat_time_ = graph_config.sample_times_one_chunk(); buf_size_ = once_sample_startid_len_ * walk_len_ * walk_degree_ * repeat_time_; - VLOG(2) << "Confirm GraphConfig, walk_degree : " << walk_degree_ + train_table_cap_ = graph_config.train_table_cap(); + infer_table_cap_ = graph_config.infer_table_cap(); + epoch_finish_ = false; + VLOG(0) << "Confirm GraphConfig, walk_degree : " << walk_degree_ << ", walk_len : " << walk_len_ << ", window : " << window_ << ", once_sample_startid_len : " << once_sample_startid_len_ << ", sample_times_one_chunk : " << repeat_time_ - << ", batch_size: " << batch_size_; + << ", batch_size: " << batch_size_ + << ", train_table_cap: " << train_table_cap_ + << ", infer_table_cap: " << infer_table_cap_; std::string first_node_type = graph_config.first_node_type(); std::string meta_path = graph_config.meta_path(); auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto edge_to_id = gpu_graph_ptr->edge_to_id; - auto node_to_id = gpu_graph_ptr->feature_to_id; - // parse first_node_type - auto node_types = - paddle::string::split_string(first_node_type, ";"); - VLOG(2) << "node_types: " << first_node_type; - finish_node_type_.clear(); - node_type_start_.clear(); - for (auto &type : node_types) { - auto iter = node_to_id.find(type); - PADDLE_ENFORCE_NE( - iter, - node_to_id.end(), - platform::errors::NotFound("(%s) is not found in node_to_id.", type)); - VLOG(2) << "node_to_id[" << type << "] = " << iter->second; - first_node_type_.push_back(iter->second); - node_type_start_[iter->second] = 0; - } - meta_path_.resize(first_node_type_.size()); - auto meta_paths = paddle::string::split_string(meta_path, ";"); - - for (size_t i = 0; i < meta_paths.size(); i++) { - auto path = meta_paths[i]; - auto nodes = paddle::string::split_string(path, "-"); - for (auto &node : nodes) { - 
auto iter = edge_to_id.find(node); - PADDLE_ENFORCE_NE( - iter, - edge_to_id.end(), - platform::errors::NotFound("(%s) is not found in edge_to_id.", node)); - VLOG(2) << "edge_to_id[" << node << "] = " << iter->second; - meta_path_[i].push_back(iter->second); - } - } + debug_gpu_memory_info("init_conf start"); + gpu_graph_ptr->init_conf(first_node_type, meta_path); + debug_gpu_memory_info("init_conf end"); }; } // namespace framework diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index ba9d5d0546791..7ae03968598aa 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -60,6 +60,8 @@ class Scope; class Variable; class NeighborSampleResult; class NodeQueryResult; +template +class HashTable; } // namespace framework } // namespace paddle @@ -878,6 +880,9 @@ struct BufState { int GetNextBatch() { cursor += len; + if (row_num - cursor < 0) { + return 0; + } int tmp_len = cursor + batch_size > row_num ? row_num - cursor : batch_size; if (tmp_len == 0) { return 0; @@ -895,11 +900,13 @@ class GraphDataGenerator { GraphDataGenerator(){}; virtual ~GraphDataGenerator(){}; void SetConfig(const paddle::framework::DataFeedDesc& data_feed_desc); - void AllocResource(const paddle::platform::Place& place, - std::vector feed_vec); + void AllocResource(int thread_id, std::vector feed_vec); + void SetFeedVec(std::vector feed_vec); int AcquireInstance(BufState* state); int GenerateBatch(); - int FillWalkBuf(std::shared_ptr d_walk); + int FillWalkBuf(); + int FillInferBuf(); + void DoWalk(); int FillFeatureBuf(uint64_t* d_walk, uint64_t* d_feature, size_t key_num); int FillFeatureBuf(std::shared_ptr d_walk, std::shared_ptr d_feature); @@ -911,12 +918,32 @@ class GraphDataGenerator { int step, int* len_per_row); int FillInsBuf(); + int FillIdShowClkTensor(int total_instance, + bool gpu_graph_training, + size_t cursor = 0); + int FillGraphSlotFeature(int total_instance, bool gpu_graph_training); + int MakeInsPair(); + int GetPathNum() { return total_row_; } + void ResetPathNum() {total_row_ = 0; } + void ResetEpochFinish() {epoch_finish_ = false; } + void ClearSampleState(); void SetDeviceKeys(std::vector* device_keys, int type) { - type_to_index_[type] = h_device_keys_.size(); - h_device_keys_.push_back(device_keys); - } + // type_to_index_[type] = h_device_keys_.size(); + // h_device_keys_.push_back(device_keys); + } + int InsertTable(const unsigned long* d_keys, + unsigned long len, + std::shared_ptr d_uniq_node_num); + std::shared_ptr GetTableKeys( + std::shared_ptr d_uniq_node_num, + uint64_t& h_uniq_node_num); + void CopyFeaFromTable(std::shared_ptr d_uniq_fea_num); + std::vector& GetHostVec() { return host_vec_; } + bool get_epoch_finish() {return epoch_finish_; } + void clear_gpu_mem(); protected: + HashTable* table_; int walk_degree_; int walk_len_; int window_; @@ -924,16 +951,14 @@ class GraphDataGenerator { int gpuid_; // start ids // int64_t* device_keys_; - // size_t device_key_size_; - std::vector*> h_device_keys_; - std::unordered_map type_to_index_; - // point to device_keys_ size_t cursor_; + int thread_id_; size_t jump_rows_; int64_t* id_tensor_ptr_; int64_t* show_tensor_ptr_; int64_t* clk_tensor_ptr_; - cudaStream_t stream_; + cudaStream_t train_stream_; + cudaStream_t sample_stream_; paddle::platform::Place place_; std::vector feed_vec_; std::vector offset_; @@ -941,9 +966,11 @@ class GraphDataGenerator { std::vector> d_device_keys_; std::shared_ptr d_walk_; + std::shared_ptr d_feature_list_; std::shared_ptr 
d_feature_; std::shared_ptr d_len_per_row_; std::shared_ptr d_random_row_; + std::shared_ptr d_uniq_node_num_; // std::vector> d_sampleidx2rows_; int cur_sampleidx2row_; @@ -951,10 +978,6 @@ class GraphDataGenerator { std::shared_ptr d_sample_keys_; int sample_keys_len_; - std::set finish_node_type_; - std::unordered_map node_type_start_; - std::vector infer_node_type_start_; - std::shared_ptr d_ins_buf_; std::shared_ptr d_feature_buf_; std::shared_ptr d_pair_num_; @@ -970,9 +993,15 @@ class GraphDataGenerator { int slot_num_; int shuffle_seed_; int debug_mode_; - std::vector first_node_type_; - std::vector> meta_path_; bool gpu_graph_training_; + bool epoch_finish_; + std::vector host_vec_; + std::vector h_device_keys_len_; + uint64_t train_table_cap_; + uint64_t infer_table_cap_; + int total_row_; + size_t infer_node_start_; + size_t infer_node_end_; }; class DataFeed { @@ -1037,11 +1066,29 @@ class DataFeed { virtual void SetParseLogKey(bool parse_logkey) {} virtual void SetEnablePvMerge(bool enable_pv_merge) {} virtual void SetCurrentPhase(int current_phase) {} + virtual void InitGraphResource() {} virtual void SetDeviceKeys(std::vector* device_keys, int type) { #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) gpu_graph_data_generator_.SetDeviceKeys(device_keys, type); #endif } +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + virtual const std::vector& GetHostVec() { + return gpu_graph_data_generator_.GetHostVec(); + } +#endif + + virtual void clear_gpu_mem() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.clear_gpu_mem(); +#endif + } + virtual bool get_epoch_finish() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + return gpu_graph_data_generator_.get_epoch_finish(); +#endif + } + virtual void SetGpuGraphMode(int gpu_graph_mode) { gpu_graph_mode_ = gpu_graph_mode; } @@ -1058,11 +1105,40 @@ class DataFeed { return ins_content_vec_; } virtual int GetCurBatchSize() { return batch_size_; } + virtual int GetGraphPathNum() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + return gpu_graph_data_generator_.GetPathNum(); +#else + return 0; +#endif + } + virtual void ResetPathNum() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ResetPathNum(); +#endif + } + + virtual void ClearSampleState() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ClearSampleState(); +#endif + } + + virtual void ResetEpochFinish() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ResetEpochFinish(); +#endif +} + virtual bool IsTrainMode() { return train_mode_; } virtual void LoadIntoMemory() { PADDLE_THROW(platform::errors::Unimplemented( "This function(LoadIntoMemory) is not implemented.")); } + virtual void DoWalk() { + PADDLE_THROW(platform::errors::Unimplemented( + "This function(DoWalk) is not implemented.")); + } virtual void SetPlace(const paddle::platform::Place& place) { place_ = place; } @@ -1637,6 +1713,7 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { // CustomParser* parser) {} virtual void PutToFeedVec(const std::vector& ins_vec) {} + virtual void InitGraphResource(void); virtual void LoadIntoMemoryByCommand(void); virtual void LoadIntoMemoryByLib(void); virtual void LoadIntoMemoryByLine(void); @@ -1671,6 +1748,8 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { const int float_slot_size, const 
UsedSlotGpuType* used_slots); #endif + virtual void DoWalk(); + float sample_rate_ = 1.0f; int use_slot_size_ = 0; int float_use_slot_size_ = 0; diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto index a7ab70948795f..38791a124c56e 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -38,6 +38,8 @@ message GraphConfig { optional string first_node_type = 8; optional string meta_path = 9; optional bool gpu_graph_training = 10 [ default = true ]; + optional int64 train_table_cap = 11 [ default = 80000 ]; + optional int64 infer_table_cap = 12 [ default = 80000 ]; } message DataFeedDesc { diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 1d70ef6a1c78b..e22136fdaf707 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -36,7 +36,9 @@ #endif USE_INT_STAT(STAT_total_feasign_num_in_mem); +USE_INT_STAT(STAT_epoch_finish); DECLARE_bool(graph_get_neighbor_id); +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -446,18 +448,6 @@ void MultiSlotDataset::PrepareTrain() { return; } -template -void DatasetImpl::SetGraphDeviceKeys( - const std::vector& h_device_keys) { - // for (size_t i = 0; i < gpu_graph_device_keys_.size(); i++) { - // gpu_graph_device_keys_[i].clear(); - // } - // size_t device_num = gpu_graph_device_keys_.size(); - // for (size_t i = 0; i < h_device_keys.size(); i++) { - // int shard = h_device_keys[i] % device_num; - // gpu_graph_device_keys_[shard].push_back(h_device_keys[i]); - // } -} // load data into memory, Dataset hold this memory, // which will later be fed into readers' channel template @@ -469,63 +459,54 @@ void DatasetImpl::LoadIntoMemory() { if (gpu_graph_mode_) { VLOG(0) << "in gpu_graph_mode"; #ifdef PADDLE_WITH_HETERPS - graph_all_type_total_keys_.clear(); - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto node_to_id = gpu_graph_ptr->feature_to_id; - auto edge_to_id = gpu_graph_ptr->edge_to_id; - graph_all_type_total_keys_.resize(node_to_id.size()); - int cnt = 0; - // set sample start node - for (auto& iter : node_to_id) { - int node_idx = iter.second; - std::vector> gpu_graph_device_keys; - gpu_graph_ptr->get_all_id( - 1, node_idx, thread_num_, &gpu_graph_device_keys); - auto& type_total_key = graph_all_type_total_keys_[cnt]; - type_total_key.resize(thread_num_); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - VLOG(2) << "node type: " << node_idx << ", gpu_graph_device_keys[" << i - << "] = " << gpu_graph_device_keys[i].size(); - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - type_total_key[i].push_back(gpu_graph_device_keys[i][j]); - } - } - + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->SetGpuGraphMode(gpu_graph_mode_); + } + + if (STAT_GET(STAT_epoch_finish) == 1) { + VLOG(0) << "get epoch finish true"; + STAT_RESET(STAT_epoch_finish, 0); for (size_t i = 0; i < readers_.size(); i++) { - readers_[i]->SetDeviceKeys(&type_total_key[i], node_idx); - readers_[i]->SetGpuGraphMode(gpu_graph_mode_); + readers_[i]->ResetPathNum(); + readers_[i]->ResetEpochFinish(); } - cnt++; + return; } - // add node embedding id - std::vector> gpu_graph_device_keys; - gpu_graph_ptr->get_node_embedding_ids(thread_num_, &gpu_graph_device_keys); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); + 
for (int64_t i = 0; i < thread_num_; ++i) { + load_threads.push_back( + std::thread(&paddle::framework::DataFeed::DoWalk, readers_[i].get())); + } + for (std::thread& t : load_threads) { + t.join(); + } + uint64_t node_num = 0; + for (int i = 0; i < thread_num_; i++) { + auto& host_vec = readers_[i]->GetHostVec(); + node_num += host_vec.size(); + } + gpu_graph_total_keys_.reserve(node_num); + for (int i = 0; i < thread_num_; i++) { + auto& host_vec = readers_[i]->GetHostVec(); + for (size_t j = 0; j < host_vec.size(); j++) { + gpu_graph_total_keys_.push_back(host_vec[j]); } } - // add feature embedding id - VLOG(2) << "begin add feature_id into gpu_graph_total_keys_ size[" - << gpu_graph_total_keys_.size() << "]"; - for (auto& iter : node_to_id) { - std::vector> gpu_graph_device_keys; - int node_idx = iter.second; - gpu_graph_ptr->get_all_feature_ids( - 1, node_idx, thread_num_, &gpu_graph_device_keys); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - VLOG(2) << "begin node type: " << node_idx << ", gpu_graph_device_keys[" - << i << "] = " << gpu_graph_device_keys[i].size(); - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); - } - VLOG(2) << "end node type: " << node_idx << ", gpu_graph_device_keys[" - << i << "] = " << gpu_graph_device_keys[i].size(); + if (GetEpochFinish() == true) { + VLOG(0) << "epoch finish, set stat and clear sample stat!"; + STAT_RESET(STAT_epoch_finish, 1); + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->ClearSampleState(); } } - VLOG(2) << "end add feature_id into gpu_graph_total_keys_ size[" + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->clear_gpu_mem(); + } + } + + VLOG(2) << "end add edge into gpu_graph_total_keys_ size[" << gpu_graph_total_keys_.size() << "]"; #endif } else { @@ -1123,7 +1104,26 @@ void DatasetImpl::DestroyPreLoadReaders() { template int64_t DatasetImpl::GetMemoryDataSize() { - return input_channel_->Size(); + if (gpu_graph_mode_) { + int64_t total_path_num = 0; + for (int i = 0; i < thread_num_; i++) { + total_path_num += readers_[i]->GetGraphPathNum(); + } + return total_path_num; + } else { + return input_channel_->Size(); + } +} + +template +bool DatasetImpl::GetEpochFinish() { + bool is_epoch_finish = true; + if (gpu_graph_mode_) { + for (int i = 0; i < thread_num_; i++) { + is_epoch_finish = is_epoch_finish && readers_[i]->get_epoch_finish(); + } + } + return is_epoch_finish; } template @@ -1780,6 +1780,7 @@ void SlotRecordDataset::CreateReaders() { readers_[i]->SetParseLogKey(parse_logkey_); readers_[i]->SetEnablePvMerge(enable_pv_merge_); readers_[i]->SetCurrentPhase(current_phase_); + readers_[i]->InitGraphResource(); if (input_channel_ != nullptr) { readers_[i]->SetInputChannel(input_channel_.get()); } diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 0489c2ece64e8..9e1998a35fd64 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -169,6 +169,10 @@ class Dataset { virtual void SetGpuGraphMode(int is_graph_mode) = 0; virtual int GetGpuGraphMode() = 0; + virtual bool GetEpochFinish() = 0; + + virtual void SetPassId(uint32_t pass_id) = 0; + virtual uint32_t GetPassID() = 0; protected: virtual int ReceiveFromClient(int msg_type, @@ -253,7 +257,7 @@ class DatasetImpl : public Dataset { int read_thread_num, int consume_thread_num, int shard_num) {} - virtual void 
SetGraphDeviceKeys(const std::vector& h_device_keys); + virtual void SetGraphDeviceKeys(const std::vector& h_device_keys) {} virtual void ClearLocalTables() {} virtual void CreatePreLoadReaders(); virtual void DestroyPreLoadReaders(); @@ -263,11 +267,7 @@ class DatasetImpl : public Dataset { virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); virtual std::vector GetSlots(); - /* for enable_heterps_ - virtual void EnableHeterps(bool enable_heterps) { - enable_heterps_ = enable_heterps; - } - */ + virtual bool GetEpochFinish(); std::vector>& GetMultiOutputChannel() { return multi_output_channel_; @@ -280,10 +280,13 @@ class DatasetImpl : public Dataset { return multi_consume_channel_; } } + Channel& GetInputChannelRef() { return input_channel_; } std::vector& GetGpuGraphTotalKeys() { return gpu_graph_total_keys_; } - Channel& GetInputChannelRef() { return input_channel_; } + + virtual void SetPassId(uint32_t pass_id) { pass_id_ = pass_id; } + virtual uint32_t GetPassID() { return pass_id_; } protected: virtual int ReceiveFromClient(int msg_type, @@ -344,9 +347,9 @@ class DatasetImpl : public Dataset { std::vector use_slots_; bool enable_heterps_ = false; int gpu_graph_mode_ = 0; - // std::vector> gpu_graph_device_keys_; - std::vector>> graph_all_type_total_keys_; + std::vector>> gpu_graph_type_keys_; std::vector gpu_graph_total_keys_; + uint32_t pass_id_ = 0; }; // use std::vector or Record as data type diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 4cf3ab8dc1a67..bacb096f751b1 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -29,7 +29,7 @@ if(WITH_HETERPS) nv_library( ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ps_framework_proto ${BRPC_DEPS}) + DEPS heter_ps gloo_wrapper ps_framework_proto graph_gpu_wrapper ${BRPC_DEPS}) else() nv_library( ps_gpu_wrapper diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index ef2e73d6dd5b5..cab31ade6c374 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -86,6 +86,7 @@ class HeterContext { int multi_mf_dim_ = 0; uint32_t shard_num_ = 37; + uint16_t pass_id_ = 0; uint64_t size() { uint64_t total_size = 0; for (auto& keys : feature_keys_) { diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h index 85bf6bb553b22..160de2646d7d9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h @@ -524,6 +524,7 @@ class concurrent_unordered_map : public managed { __forceinline__ __device__ iterator insert(const value_type& x, aggregation_type op, + uint64_t* local_count = NULL, comparison_type keys_equal = key_equal(), bool precomputed_hash = false, hash_value_type precomputed_hash_value = 0) { @@ -580,6 +581,10 @@ class concurrent_unordered_map : public managed { if (m_enable_collision_stat) { atomicAdd(&m_insert_times, 1); } + + if (local_count != NULL && keys_equal(unused_key, old_key)) { + atomicAdd(local_count, 1); + } break; } diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index 08a87b6a84688..c6b3bde4a2bbd 100644 --- 
a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -170,6 +170,7 @@ struct NeighborSampleResult { uint64_t *actual_val; int *actual_sample_size, sample_size, key_size; int total_sample_size; + cudaStream_t stream=0; std::shared_ptr val_mem, actual_sample_size_mem; std::shared_ptr actual_val_mem; uint64_t *get_val() { return val; } @@ -179,18 +180,30 @@ struct NeighborSampleResult { int get_key_size() { return key_size; } void set_total_sample_size(int s) { total_sample_size = s; } int get_len() { return total_sample_size; } + void set_stream(cudaStream_t stream_t) { + stream = stream_t; + } void initialize(int _sample_size, int _key_size, int dev_id) { sample_size = _sample_size; key_size = _key_size; platform::CUDADeviceGuard guard(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); - val_mem = - memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t)); + if (stream != 0) { + val_mem = + memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t), phi::Stream(reinterpret_cast(stream))); + actual_sample_size_mem = + memory::AllocShared(place, _key_size * sizeof(int), phi::Stream(reinterpret_cast(stream))); + } + else { + val_mem = + memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t)); + actual_sample_size_mem = + memory::AllocShared(place, _key_size * sizeof(int)); + } val = (uint64_t *)val_mem->ptr(); - actual_sample_size_mem = - memory::AllocShared(place, _key_size * sizeof(int)); actual_sample_size = (int *)actual_sample_size_mem->ptr(); } + void display() { VLOG(0) << "in node sample result display ------------------"; int64_t *res = new int64_t[sample_size * key_size]; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index aa202fe020fe9..d890b74ff9b2e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -112,8 +112,17 @@ class GpuPsGraphTable } } } + device_mutex_.resize(gpu_num); + for (int i = 0; i < gpu_num; i++) { + device_mutex_[i] = new std::mutex(); + } + } + ~GpuPsGraphTable() { + for (size_t i = 0; i < device_mutex_.size(); ++i) { + delete device_mutex_[i]; + } + device_mutex_.clear(); } - ~GpuPsGraphTable() {} void build_graph_on_single_gpu(const GpuPsCommGraph &g, int gpu_id, int idx); void build_graph_fea_on_single_gpu(const GpuPsCommGraphFea &g, int gpu_id); void clear_graph_info(int gpu_id, int index); @@ -154,7 +163,9 @@ class GpuPsGraphTable uint64_t *src_sample_res, int *actual_sample_size); int init_cpu_table(const paddle::distributed::GraphParameter &graph); - + gpuStream_t get_local_stream(int gpu_id) { + return resource_->local_stream(gpu_id, 0); + } int gpu_num; int graph_table_num_, feature_table_num_; std::vector gpu_graph_list_; @@ -165,6 +176,7 @@ class GpuPsGraphTable std::shared_ptr cpu_graph_table_; std::shared_ptr rw_lock; mutable std::mutex mutex_; + std::vector device_mutex_; std::condition_variable cv_; int cpu_table_status; }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu index 3693277a75d39..eac888383a3de 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu @@ -15,8 +15,8 @@ #include #include #include - #include +#include "cub/cub.cuh" #pragma once #ifdef 
PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" @@ -678,7 +678,10 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( int sample_size, int len, bool cpu_query_switch) { + device_mutex_[gpu_id]->lock(); NeighborSampleResult result; + auto stream = resource_->local_stream(gpu_id, 0); + result.set_stream(stream); result.initialize(sample_size, len, resource_->dev_id(gpu_id)); if (len == 0) { @@ -691,15 +694,20 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( int* actual_sample_size = result.actual_sample_size; uint64_t* val = result.val; int total_gpu = resource_->total_device(); - auto stream = resource_->local_stream(gpu_id, 0); int grid_size = (len - 1) / block_size_ + 1; int h_left[total_gpu]; // NOLINT int h_right[total_gpu]; // NOLINT - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_left = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + auto d_right = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); int default_value = 0; @@ -710,15 +718,26 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream)); // - auto d_idx = memory::Alloc(place, len * sizeof(int)); + auto d_idx = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_shard_keys = memory::Alloc(place, len * sizeof(uint64_t)); + auto d_shard_keys = + memory::Alloc(place, + len * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_vals = - memory::Alloc(place, sample_size * len * sizeof(uint64_t)); + memory::Alloc(place, + sample_size * len * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); + auto d_shard_actual_sample_size = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_actual_sample_size_ptr = reinterpret_cast(d_shard_actual_sample_size->ptr()); @@ -730,10 +749,17 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaMemcpy( - h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy( - h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(h_left, + d_left_ptr, + total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaMemcpyAsync(h_right, + d_right_ptr, + total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { @@ -911,29 +937,50 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( CUDA_CHECK(cudaStreamSynchronize(stream)); platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - - thrust::device_vector t_actual_sample_size(len); - thrust::copy(actual_sample_size, - actual_sample_size + len, - t_actual_sample_size.begin()); - int total_sample_size = thrust::reduce(t_actual_sample_size.begin(), - t_actual_sample_size.end()); - - result.actual_val_mem = - memory::AllocShared(place, total_sample_size * sizeof(uint64_t)); + size_t temp_storage_bytes = 0; + int total_sample_size = 0; + auto cumsum_actual_sample_size = + memory::Alloc(place, + (len + 1) * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + int* cumsum_actual_sample_size_p = + reinterpret_cast(cumsum_actual_sample_size->ptr()); + CUDA_CHECK( + cudaMemsetAsync(cumsum_actual_sample_size_p, 0, sizeof(int), stream)); + CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, + temp_storage_bytes, + actual_sample_size, + cumsum_actual_sample_size_p + 1, + len, + stream)); + auto d_temp_storage = + memory::Alloc(place, + temp_storage_bytes, + phi::Stream(reinterpret_cast(stream))); + CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + actual_sample_size, + cumsum_actual_sample_size_p + 1, + len, + stream)); + CUDA_CHECK(cudaMemcpyAsync(&total_sample_size, + cumsum_actual_sample_size_p + len, + sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + result.actual_val_mem = memory::AllocShared( + place, + total_sample_size * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); result.actual_val = (uint64_t*)(result.actual_val_mem)->ptr(); result.set_total_sample_size(total_sample_size); - thrust::device_vector cumsum_actual_sample_size(len); - thrust::exclusive_scan(t_actual_sample_size.begin(), - t_actual_sample_size.end(), - cumsum_actual_sample_size.begin(), - 0); fill_actual_vals<<>>( val, result.actual_val, actual_sample_size, - thrust::raw_pointer_cast(cumsum_actual_sample_size.data()), + cumsum_actual_sample_size_p, sample_size, len); } @@ -945,6 +992,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( destroy_storage(gpu_id, i); } cudaStreamSynchronize(stream); + device_mutex_[gpu_id]->unlock(); return result; } @@ -1002,6 +1050,7 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, uint64_t* d_feature, int node_num, int slot_num) { + device_mutex_[gpu_id]->lock(); if (node_num == 0) { return -1; } @@ -1011,23 +1060,40 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, int total_gpu = resource_->total_device(); auto stream = resource_->local_stream(gpu_id, 0); - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_left = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + auto d_right = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream)); // - auto d_idx = memory::Alloc(place, node_num * sizeof(int)); + auto d_idx = + memory::Alloc(place, + node_num * sizeof(int), + 
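The hunk above replaces the thrust reduce + exclusive_scan pair with a single cub::DeviceScan::InclusiveSum into a (len + 1)-element buffer whose first slot is zeroed, so the last element doubles as the total sample count. A condensed sketch of that idiom, assuming d_counts (len ints) and d_cumsum (len + 1 ints) already live on the device and stream is a valid CUDA stream:

size_t temp_bytes = 0;  // first call only sizes the temporary storage
cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_counts, d_cumsum + 1, len, stream);
auto d_temp = paddle::memory::Alloc(
    place, temp_bytes, phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
cudaMemsetAsync(d_cumsum, 0, sizeof(int), stream);  // d_cumsum[0] = 0
cub::DeviceScan::InclusiveSum(d_temp->ptr(), temp_bytes,
                              d_counts, d_cumsum + 1, len, stream);
int total = 0;  // d_cumsum[i + 1] = d_counts[0] + ... + d_counts[i]
cudaMemcpyAsync(&total, d_cumsum + len, sizeof(int), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);  // total now holds the grand sum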
phi::Stream(reinterpret_cast(stream))); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_shard_keys = memory::Alloc(place, node_num * sizeof(uint64_t)); + auto d_shard_keys = + memory::Alloc(place, + node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_vals = - memory::Alloc(place, slot_num * node_num * sizeof(uint64_t)); + memory::Alloc(place, + slot_num * node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_size = memory::Alloc(place, node_num * sizeof(int)); + auto d_shard_actual_size = + memory::Alloc(place, + node_num * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_actual_size_ptr = reinterpret_cast(d_shard_actual_size->ptr()); @@ -1131,6 +1197,7 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, } CUDA_CHECK(cudaStreamSynchronize(stream)); + device_mutex_[gpu_id]->unlock(); return 0; } diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index 4cc1b746a558f..6e0ea83d92991 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -28,6 +28,121 @@ void GraphGpuWrapper::set_device(std::vector ids) { } } +void GraphGpuWrapper::init_conf(const std::string &first_node_type, + const std::string &meta_path) { + static std::mutex mutex; + { + std::lock_guard lock(mutex); + if (conf_initialized_) { + return; + } + VLOG(2) << "init path config"; + conf_initialized_ = true; + auto node_types = + paddle::string::split_string(first_node_type, ";"); + VLOG(2) << "node_types: " << first_node_type; + for (auto &type : node_types) { + auto iter = feature_to_id.find(type); + PADDLE_ENFORCE_NE(iter, + feature_to_id.end(), + platform::errors::NotFound( + "(%s) is not found in feature_to_id.", type)); + VLOG(2) << "feature_to_id[" << type << "] = " << iter->second; + first_node_type_.push_back(iter->second); + } + meta_path_.resize(first_node_type_.size()); + auto meta_paths = paddle::string::split_string(meta_path, ";"); + + for (size_t i = 0; i < meta_paths.size(); i++) { + auto path = meta_paths[i]; + auto nodes = paddle::string::split_string(path, "-"); + for (auto &node : nodes) { + auto iter = edge_to_id.find(node); + PADDLE_ENFORCE_NE(iter, + edge_to_id.end(), + platform::errors::NotFound( + "(%s) is not found in edge_to_id.", node)); + VLOG(2) << "edge_to_id[" << node << "] = " << iter->second; + meta_path_[i].push_back(iter->second); + } + } + int max_dev_id = 0; + for (size_t i = 0; i < device_id_mapping.size(); i++) { + if (device_id_mapping[i] > max_dev_id) { + max_dev_id = device_id_mapping[i]; + } + } + finish_node_type_.resize(max_dev_id + 1); + node_type_start_.resize(max_dev_id + 1); + global_infer_node_type_start_.resize(max_dev_id + 1); + for (size_t i = 0; i < device_id_mapping.size(); i++) { + int dev_id = device_id_mapping[i]; + auto &node_type_start = node_type_start_[i]; + auto &infer_node_type_start = global_infer_node_type_start_[i]; + auto &finish_node_type = finish_node_type_[i]; + finish_node_type.clear(); + + for (size_t idx = 0; idx < feature_to_id.size(); idx++) { + infer_node_type_start[idx] = 0; + } + for (auto &type : node_types) { + auto iter = feature_to_id.find(type); + node_type_start[iter->second] = 0; + infer_node_type_start[iter->second] = 0; + } + 
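graph_neighbor_sample_v2 and get_feature_of_nodes are now serialized per device through device_mutex_[gpu_id], taken with explicit lock()/unlock() calls at entry and exit; note that the pre-existing early return on node_num == 0 in get_feature_of_nodes now returns with the mutex still held. A scope-based guard expresses the same serialization without that hazard; a minimal, self-contained sketch (names are illustrative, std::lock_guard is standard C++):

#include <mutex>
#include <vector>

std::vector<std::mutex> device_mutex(8);  // one mutex per GPU; size is illustrative

int query_on_device(int gpu_id, int node_num) {
  std::lock_guard<std::mutex> guard(device_mutex[gpu_id]);  // released on every return path
  if (node_num == 0) {
    return -1;  // early return no longer leaves the device locked
  }
  // ... shard keys, launch kernels, copy results back ...
  return 0;
}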
infer_cursor_.push_back(0); + cursor_.push_back(0); + } + init_type_keys(); + } +} + +void GraphGpuWrapper::init_type_keys() { + size_t thread_num = device_id_mapping.size(); + int cnt = 0; + + auto &graph_all_type_total_keys = get_graph_type_keys(); + auto &type_to_index = get_graph_type_to_index(); + std::vector> tmp_keys; + tmp_keys.resize(thread_num); + d_graph_all_type_total_keys_.resize(graph_all_type_total_keys.size()); + h_graph_all_type_keys_len_.resize(graph_all_type_total_keys.size()); + for (size_t f_idx = 0; f_idx < graph_all_type_total_keys.size(); f_idx++) { + for (size_t j = 0; j < tmp_keys.size(); j++) { + tmp_keys[j].clear(); + } + d_graph_all_type_total_keys_[f_idx].resize(thread_num); + auto &type_total_key = graph_all_type_total_keys[f_idx]; + for (size_t j = 0; j < type_total_key.size(); j++) { + uint64_t shard = type_total_key[j] % thread_num; + tmp_keys[shard].push_back(type_total_key[j]); + } + for (size_t j = 0; j < thread_num; j++) { + h_graph_all_type_keys_len_[f_idx].push_back(tmp_keys[j].size()); + VLOG(1) << "node type: " << type_to_index[f_idx] + << ", gpu_graph_device_keys[" << j + << "] = " << tmp_keys[j].size(); + } + for (size_t j = 0; j < thread_num; j++) { + auto stream = get_local_stream(j); + int gpuid = device_id_mapping[j]; + auto place = platform::CUDAPlace(gpuid); + platform::CUDADeviceGuard guard(gpuid); + d_graph_all_type_total_keys_[f_idx][j] = + memory::AllocShared(place, tmp_keys[j].size() * sizeof(uint64_t)); + cudaMemcpyAsync(d_graph_all_type_total_keys_[f_idx][j]->ptr(), + tmp_keys[j].data(), + sizeof(uint64_t) * tmp_keys[j].size(), + cudaMemcpyHostToDevice, + stream); + } + } + for (int i = 0; i < thread_num; i++) { + auto stream = get_local_stream(i); + cudaStreamSynchronize(stream); + } +} + int GraphGpuWrapper::get_all_id(int type, int slice_num, std::vector> *output) { @@ -144,6 +259,15 @@ void GraphGpuWrapper::load_edge_file(std::string name, } } +void GraphGpuWrapper::load_edge_file(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->parse_edge_and_load( + etype2files, graph_data_local_path, part_num, reverse); +} + void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) { // 'n' means load nodes and 'node_type' follows @@ -155,15 +279,22 @@ void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) { } } -void GraphGpuWrapper::load_node_and_edge(std::string etype, - std::string ntype, - std::string epath, - std::string npath, +void GraphGpuWrapper::load_node_file(std::string ntype2files, + std::string graph_data_local_path, + int part_num) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->parse_node_and_load( + ntype2files, graph_data_local_path, part_num); +} + +void GraphGpuWrapper::load_node_and_edge(std::string etype2files, + std::string ntype2files, + std::string graph_data_local_path, int part_num, bool reverse) { ((GpuPsGraphTable *)graph_table) ->cpu_graph_table_->load_node_and_edge_file( - etype, ntype, epath, npath, part_num, reverse); + etype2files, ntype2files, graph_data_local_path, part_num, reverse); } void GraphGpuWrapper::add_table_feat_conf(std::string table_name, @@ -196,6 +327,10 @@ void GraphGpuWrapper::add_table_feat_conf(std::string table_name, } void GraphGpuWrapper::init_search_level(int level) { search_level = level; } +gpuStream_t GraphGpuWrapper::get_local_stream(int gpuid) { + return ((GpuPsGraphTable *)graph_table)->get_local_stream(gpuid); +} + void 
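init_type_keys above distributes each node type's key list across devices by key % thread_num, uploads every shard on that device's local stream, and synchronizes only once all copies are in flight. A reduced sketch of the shard-and-upload step under those assumptions (shard_and_upload and its parameters are illustrative; the returned allocations must be kept alive, as d_graph_all_type_total_keys_ does in the patch):

std::vector<std::shared_ptr<phi::Allocation>> shard_and_upload(
    const std::vector<uint64_t>& keys,
    const std::vector<int>& device_ids,
    const std::vector<cudaStream_t>& streams) {
  size_t n = device_ids.size();
  std::vector<std::vector<uint64_t>> shards(n);
  for (uint64_t key : keys) shards[key % n].push_back(key);  // deterministic placement
  std::vector<std::shared_ptr<phi::Allocation>> buffers(n);
  for (size_t dev = 0; dev < n; ++dev) {
    if (shards[dev].empty()) continue;
    paddle::platform::CUDADeviceGuard guard(device_ids[dev]);
    buffers[dev] = paddle::memory::AllocShared(
        paddle::platform::CUDAPlace(device_ids[dev]),
        shards[dev].size() * sizeof(uint64_t));
    cudaMemcpyAsync(buffers[dev]->ptr(), shards[dev].data(),
                    shards[dev].size() * sizeof(uint64_t),
                    cudaMemcpyHostToDevice, streams[dev]);
  }
  for (size_t dev = 0; dev < n; ++dev) {
    cudaStreamSynchronize(streams[dev]);  // wait once, after every copy is queued
  }
  return buffers;  // device-resident key shards, one per GPU
}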
GraphGpuWrapper::init_service() { table_proto.set_task_pool_size(24); table_proto.set_shard_num(1000); @@ -357,9 +492,6 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( res.push_back(cpu_key[i * sample_size + j]); } } - /* for(int i = 0;i < res.size();i ++) { */ - /* VLOG(0) << i << " " << res[i]; */ - /* } */ delete[] actual_sample_size; cudaFree(cuda_key); return res; @@ -386,6 +518,31 @@ void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) { return ((GpuPsGraphTable *)graph_table) ->cpu_graph_table_->export_partition_files(idx, file_path); } + +void GraphGpuWrapper::release_graph() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph(); +} + +void GraphGpuWrapper::release_graph_edge() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph_edge(); +} + +void GraphGpuWrapper::release_graph_node() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph_node(); +} + +std::vector &GraphGpuWrapper::get_graph_total_keys() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->graph_total_keys_; +} + +std::vector> &GraphGpuWrapper::get_graph_type_keys() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->graph_type_keys_; +} + +std::unordered_map &GraphGpuWrapper::get_graph_type_to_index() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->type_to_index_; +} + #endif } // namespace framework }; // namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index b41303b85e0df..8f7c29f487f4e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -22,6 +22,14 @@ namespace paddle { namespace framework { #ifdef PADDLE_WITH_HETERPS + +enum GpuGraphStorageMode { + WHOLE_HBM = 1, + MEM_EMB_AND_GPU_GRAPH, + MEM_EMB_FEATURE_AND_GPU_GRAPH, + SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH +}; + class GraphGpuWrapper { public: static std::shared_ptr GetInstance() { @@ -31,6 +39,8 @@ class GraphGpuWrapper { return s_instance_; } static std::shared_ptr s_instance_; + void init_conf(const std::string& first_node_type, + const std::string& meta_path); void initialize(); void finalize(); void set_device(std::vector ids); @@ -47,11 +57,19 @@ class GraphGpuWrapper { std::string feat_dtype, int feat_shape); void load_edge_file(std::string name, std::string filepath, bool reverse); + void load_edge_file(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse); + void load_node_file(std::string name, std::string filepath); - void load_node_and_edge(std::string etype, - std::string ntype, - std::string epath, - std::string npath, + void load_node_file(std::string ntype2files, + std::string graph_data_local_path, + int part_num); + + void load_node_and_edge(std::string etype2files, + std::string ntype2files, + std::string graph_data_local_path, int part_num, bool reverse); int32_t load_next_partition(int idx); @@ -93,6 +111,7 @@ class GraphGpuWrapper { uint64_t* device_keys, int walk_degree, int len); + gpuStream_t get_local_stream(int gpuid); std::vector graph_neighbor_sample(int gpu_id, int idx, std::vector& key, @@ -104,6 +123,14 @@ class GraphGpuWrapper { uint32_t size, int slot_num); + void release_graph(); + void release_graph_edge(); + void release_graph_node(); + void init_type_keys(); + std::vector& get_graph_total_keys(); + std::vector>& get_graph_type_keys(); + std::unordered_map& 
get_graph_type_to_index(); + std::unordered_map edge_to_id, feature_to_id; std::vector id_to_feature, id_to_edge; std::vector> table_feat_mapping; @@ -117,6 +144,19 @@ class GraphGpuWrapper { int upload_num = 8; std::shared_ptr<::ThreadPool> upload_task_pool; std::string feature_separator_ = std::string(" "); + // + bool conf_initialized_ = false; + std::vector first_node_type_; + std::vector> meta_path_; + + std::vector> finish_node_type_; + std::vector> node_type_start_; + std::vector> global_infer_node_type_start_; + std::vector infer_cursor_; + std::vector cursor_; + std::vector>> + d_graph_all_type_total_keys_; + std::vector> h_graph_all_type_keys_len_; }; #endif } // namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 18fb2eca5b752..05c254b2739f2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -124,6 +124,12 @@ class HashTable { size_t len, StreamType stream); + template + void insert(const KeyType* d_keys, + size_t len, + uint64_t* global_num, + StreamType stream); + template void insert(const KeyType* d_keys, size_t len, @@ -153,6 +159,9 @@ class HashTable { template void dump_to_cpu(int devid, StreamType stream); + template + void get_keys(KeyType* d_out, uint64_t* global_cursor, StreamType stream); + #if defined(PADDLE_WITH_CUDA) template @@ -185,7 +194,7 @@ class HashTable { #endif int size() { return container_->size(); } - + thrust::pair* data() { return container_->data(); } void set_feature_value_size(size_t pull_feature_value_size, size_t push_grad_value_size) { pull_feature_value_size_ = pull_feature_value_size; @@ -194,6 +203,12 @@ class HashTable { << " push value size: " << push_grad_value_size_; } + int prefetch(const int dev_id, cudaStream_t stream = 0) { + return container_->prefetch(dev_id, stream); + } + + void clear(cudaStream_t stream = 0) { container_->clear_async(stream); } + void show_collision(int id) { return container_->print_collision(id); } std::unique_ptr rwlock_{nullptr}; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 1fda5a586a2e8..33b50f789a49c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -31,6 +31,35 @@ struct ReplaceOp { } }; +template +__global__ void insert_kernel(Table* table, + const typename Table::key_type* const keys, + size_t len, + uint64_t* global_num) { + ReplaceOp op; + thrust::pair kv; + + __shared__ uint64_t local_num; + + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len) { + kv.first = keys[i]; + kv.second = 1; // fake value + auto it = table->insert(kv, op, &local_num); + assert(it != table->end() && "error: insert fails: table is full"); + } + __syncthreads(); + + if (threadIdx.x == 0) { + atomicAdd(global_num, local_num); + } +} + template __global__ void insert_kernel(Table* table, const typename Table::key_type* const keys, @@ -38,7 +67,6 @@ __global__ void insert_kernel(Table* table, size_t len) { ReplaceOp op; thrust::pair kv; - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { kv.first = keys[i]; @@ -139,6 +167,41 @@ __global__ void dy_mf_update_kernel(Table* table, } } +template +__global__ void get_keys_kernel(Table* table, + typename Table::key_type* d_out, + uint64_t* 
global_cursor, + uint64_t unused_key) { + extern __shared__ typename Table::key_type local_key[]; + __shared__ uint64_t local_num; + __shared__ uint64_t global_num; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + uint64_t len = table->size(); + if (idx < len) { + typename Table::value_type val = *(table->data() + idx); + if (val.first != unused_key) { + uint64_t dst = atomicAdd(&local_num, 1); + local_key[dst] = val.first; + } + } + + __syncthreads(); + + if (threadIdx.x == 0) { + global_num = atomicAdd(global_cursor, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + d_out[global_num + threadIdx.x] = local_key[threadIdx.x]; + } +} + template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); @@ -211,6 +274,20 @@ void HashTable::get(const KeyType* d_keys, container_, d_keys, d_vals, len, pull_feature_value_size_, fv_accessor); } +template +template +void HashTable::insert(const KeyType* d_keys, + size_t len, + uint64_t* global_num, + StreamType stream) { + if (len == 0) { + return; + } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + insert_kernel<<>>( + container_, d_keys, len, global_num); +} + template template void HashTable::insert(const KeyType* d_keys, @@ -225,6 +302,20 @@ void HashTable::insert(const KeyType* d_keys, container_, d_keys, d_vals, len); } +template +template +void HashTable::get_keys(KeyType* d_out, + uint64_t* global_cursor, + StreamType stream) { + size_t len = container_->size(); + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + KeyType unuse_key = std::numeric_limits::max(); + size_t shared_mem_size = sizeof(KeyType) * BLOCK_SIZE_; + get_keys_kernel<<>>( + container_, d_out, global_cursor, unuse_key); +} + + template template void HashTable::insert(const KeyType* d_keys, @@ -436,6 +527,17 @@ template void HashTable::insert( size_t len, cudaStream_t stream); +template void HashTable::get_keys( + unsigned long* d_out, + unsigned long* global_cursor, + cudaStream_t stream); + +template void HashTable::insert( + const unsigned long* d_keys, + unsigned long len, + uint64_t* global_num, + cudaStream_t stream); + template void HashTable::insert( const unsigned long* d_keys, const unsigned long* d_vals, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 82532836b8e22..cf6c4eaf8b99a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -1127,13 +1127,22 @@ void HeterComm::split_input_to_shard( AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); - auto d_idx_tmp = memory::Alloc(place, len * sizeof(int)); + auto d_idx_tmp = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_idx_tmp_ptr = reinterpret_cast(d_idx_tmp->ptr()); - auto d_shard_index = memory::Alloc(place, len * sizeof(int)); + auto d_shard_index = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_index_ptr = reinterpret_cast(d_shard_index->ptr()); - auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); + auto d_shard_index_tmp = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream); @@ -1153,7 +1162,10 @@ void 
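The new insert/get_keys kernels above share one reduction trick: every thread first accumulates into a __shared__ per-block counter, and a single thread per block folds that counter into the global total, cutting global atomics from one per key to one per block. A standalone sketch of the pattern (the predicate stands in for "key was newly inserted"):

__global__ void count_matching(const unsigned long long* keys, size_t len,
                               unsigned long long* global_num) {
  __shared__ unsigned long long local_num;
  if (threadIdx.x == 0) local_num = 0;
  __syncthreads();
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len && keys[i] != 0ULL) {    // illustrative predicate
    atomicAdd(&local_num, 1ULL);       // cheap shared-memory atomic
  }
  __syncthreads();
  if (threadIdx.x == 0) {
    atomicAdd(global_num, local_num);  // one global atomic per block
  }
}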
HeterComm::split_input_to_shard( num_bits, stream); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + auto d_temp_storage = + memory::Alloc(place, + temp_storage_bytes, + phi::Stream(reinterpret_cast(stream))); heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h index 4696a7cc91b5a..22d6199584f58 100644 --- a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -96,6 +96,53 @@ class HBMMemoryPool : public managed { size_t block_size_; }; +class HBMMemoryPoolFix : public managed { + public: + HBMMemoryPoolFix() { + capacity_ = 0; + size_ = 0 ; + block_size_ = 0; + max_byte_capacity_ = 0; + } + + ~HBMMemoryPoolFix() { + VLOG(3) << "delete hbm memory pool"; + cudaFree(mem_); + } + + size_t block_size() { return block_size_; } + + void clear(void) { cudaMemset(mem_, 0, block_size_ * capacity_); } + + void reset(size_t capacity, size_t block_size) { + if (max_byte_capacity_ < capacity * block_size) { + if (mem_ != NULL) { + cudaFree(mem_); + } + max_byte_capacity_ = (block_size * capacity / 8 + 1) * 8; + CUDA_CHECK(cudaMalloc(&mem_, max_byte_capacity_)); + } + size_ = capacity; + block_size_ = block_size; + capacity_ = max_byte_capacity_ / block_size; + } + + char* mem() { return mem_; } + + size_t capacity() { return capacity_; } + size_t size() { return size_; } + __forceinline__ __device__ void* mem_address(const uint32_t& idx) { + return (void*)&mem_[(idx)*block_size_]; + } + + private: + char* mem_ = NULL; + size_t capacity_; + size_t size_; + size_t block_size_; + size_t max_byte_capacity_; +}; + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 91eb11d9dbdbc..a61860c7f3608 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -34,12 +34,14 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/platform/timer.h" #if defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #endif DECLARE_int32(gpugraph_dedup_pull_push_mode); +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -111,6 +113,102 @@ void PSGPUWrapper::InitAfsApi(const std::string& fs_name, use_afs_api_ = 1; } #endif + +void PSGPUWrapper::add_key_to_local(const std::vector& vec_data) { + size_t total_len = vec_data.size(); + size_t len_per_thread = total_len / thread_keys_thread_num_; + size_t begin = 0; + std::vector threads; + + int remain = total_len % thread_keys_thread_num_; + auto gen_graph_data_func = [this](const std::vector& total_data, + int begin_index, + int end_index, + int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; + iter++) { + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); + } + }; + auto gen_graph_dynamic_mf_func = [this]( + const std::vector& total_data, + int begin_index, + int end_index, + int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; + iter++) { + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + // TODO: feasign <-> slot <-> multi_dim + this->thread_dim_keys_[i][shard_id][0].insert(cur_key); + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + if (!multi_mf_dim_) { + threads.push_back( + std::thread(gen_graph_data_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); + } else { + threads.push_back( + std::thread(gen_graph_dynamic_mf_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); + } + begin += len_per_thread + (i < remain ? 
1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } +} + +void PSGPUWrapper::add_key_to_gputask(std::shared_ptr gpu_task) { + std::vector threads; + platform::Timer timeline; + timeline.Start(); + // merge thread_keys to shard_keys + auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys( + shard_num, dim_id, thread_dim_keys_[i][shard_num][dim_id]); + thread_dim_keys_[i][shard_num][dim_id].clear(); + } + }; + for (int i = 0; i < thread_keys_shard_num_; ++i) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); + } + } + for (auto& t : threads) { + t.join(); + } + timeline.Pause(); + + VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() + << " seconds."; + timeline.Start(); + gpu_task->UniqueKeys(); + timeline.Pause(); + VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + if (i == 0 && j == multi_mf_dim_ - 1) { + gpu_task->feature_dim_keys_[i][j].push_back(0); + } + gpu_task->value_dim_ptr_[i][j].resize( + gpu_task->feature_dim_keys_[i][j].size()); + } + } +} + void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin"; platform::Timer timeline; @@ -237,118 +335,24 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(0) << "PreBuild in GpuGraph mode"; SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_); const std::vector& vec_data = dataset->GetGpuGraphTotalKeys(); - - total_len = vec_data.size(); - len_per_thread = total_len / thread_keys_thread_num_; - VLOG(0) << "GpuGraphTotalKeys: " << total_len; - remain = total_len % thread_keys_thread_num_; - auto gen_graph_data_func = [this](const std::vector& total_data, - int begin_index, - int end_index, - int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; - iter++) { - uint64_t cur_key = *iter; - int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(cur_key); - } - }; - auto gen_graph_dynamic_mf_func = - [this](const std::vector& total_data, - int begin_index, - int end_index, - int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; - iter++) { - uint64_t cur_key = *iter; - int shard_id = cur_key % thread_keys_shard_num_; - // TODO: feasign <-> slot <-> multi_dim - this->thread_dim_keys_[i][shard_id][0].insert(cur_key); - } - }; - for (int i = 0; i < thread_keys_thread_num_; i++) { - if (!multi_mf_dim_) { - VLOG(1) << "psgpu graph wrapper genfunc"; - threads.push_back( - std::thread(gen_graph_data_func, - std::ref(vec_data), - begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); - } else { - VLOG(1) << "psgpu graph wrapper genfunc with dynamic mf"; - threads.push_back( - std::thread(gen_graph_dynamic_mf_func, - std::ref(vec_data), - begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); - } - begin += len_per_thread + (i < remain ? 
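add_key_to_local above hands each worker a contiguous range: len_per_thread keys for everyone plus one extra key for the first total_len % thread_num workers, so the ranges cover every key exactly once. For example, 10 keys over 4 threads gives [0,3), [3,6), [6,8), [8,10). A tiny self-contained sketch of the range computation (split_ranges is illustrative):

#include <cstddef>
#include <utility>
#include <vector>

// Returns [begin, end) ranges that partition `total` items over `workers` threads.
std::vector<std::pair<size_t, size_t>> split_ranges(size_t total, size_t workers) {
  size_t per = total / workers, remain = total % workers, begin = 0;
  std::vector<std::pair<size_t, size_t>> ranges;
  for (size_t i = 0; i < workers; ++i) {
    size_t len = per + (i < remain ? 1 : 0);
    ranges.emplace_back(begin, begin + len);
    begin += len;
  }
  return ranges;
}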
1 : 0); - } - for (std::thread& t : threads) { - t.join(); - } + VLOG(0) << "GpuGraphTotalKeys: " << vec_data.size(); + timeline.Start(); + add_key_to_local(vec_data); + timeline.Pause(); + VLOG(0) << "add_key_to_local cost " << timeline.ElapsedSec() << " seconds."; } - timeline.Start(); - - threads.clear(); - // merge thread_keys to shard_keys - auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { - for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys( - shard_num, dim_id, thread_dim_keys_[i][shard_num][dim_id]); - thread_dim_keys_[i][shard_num][dim_id].clear(); - } - }; - for (int i = 0; i < thread_keys_shard_num_; ++i) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); - } - } - for (auto& t : threads) { - t.join(); - } - timeline.Pause(); - - VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() - << " seconds."; - timeline.Start(); - gpu_task->UniqueKeys(); - timeline.Pause(); - - VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - if (i == 0 && j == multi_mf_dim_ - 1) { - gpu_task->feature_dim_keys_[i][j].push_back(0); - } - VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] - << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); - gpu_task->value_dim_ptr_[i][j].resize( - gpu_task->feature_dim_keys_[i][j].size()); - } - } + add_key_to_gputask(gpu_task); } void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { platform::Timer timeline; - std::vector> task_futures; - int device_num = heter_devices_.size(); - auto& local_keys = gpu_task->feature_keys_; - auto& local_ptr = gpu_task->value_ptr_; auto& local_dim_keys = gpu_task->feature_dim_keys_; auto& local_dim_ptr = gpu_task->value_dim_ptr_; - auto& device_keys = gpu_task->device_keys_; - auto& device_vals = gpu_task->device_values_; auto& device_dim_keys = gpu_task->device_dim_keys_; auto& device_dim_ptr = gpu_task->device_dim_ptr_; - auto& device_dim_mutex = gpu_task->dim_mutex_; for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { device_dim_keys[dev].resize(multi_mf_dim_); @@ -380,7 +384,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { timeline.Start(); auto ptl_dynamic_mf_func = - [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr](int i, int j) { + [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr, &gpu_task](int i, + int j) { size_t key_size = local_dim_keys[i][j].size(); int32_t status = -1; int32_t cnt = 0; @@ -421,10 +426,12 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #ifdef PADDLE_WITH_PSCORE while (true) { auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( + i, reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, local_dim_keys[i][j].data(), - key_size); + key_size, + gpu_task->pass_id_); bool flag = true; tt.wait(); @@ -461,7 +468,10 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } }; + // fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + + std::vector> task_futures; for (int i = 0; i < thread_keys_shard_num_; i++) { for (int j = 0; j < multi_mf_dim_; j++) { task_futures.emplace_back( @@ -471,6 +481,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { for (auto& f : task_futures) { f.wait(); } + // fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); task_futures.clear(); 
timeline.Pause(); VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() @@ -483,13 +494,29 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } gloo_wrapper->Barrier(); } +} - timeline.Start(); - std::vector>> pass_values; +void PSGPUWrapper::divide_to_device(std::shared_ptr gpu_task) { + platform::Timer timeline; + int device_num = heter_devices_.size(); + std::vector threads; + std::vector> task_futures; + auto& local_dim_keys = gpu_task->feature_dim_keys_; + auto& local_dim_ptr = gpu_task->value_dim_ptr_; - bool record_status = false; - auto& device_task_keys = gpu_task->device_task_keys_; - auto& device_task_ptrs = gpu_task->device_task_ptr_; + auto& device_dim_keys = gpu_task->device_dim_keys_; + auto& device_dim_ptr = gpu_task->device_dim_ptr_; + auto& device_dim_mutex = gpu_task->dim_mutex_; + // auto& device_mutex = gpu_task->mutex_; + + if (multi_mf_dim_) { + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); + } + } + + timeline.Start(); auto build_pull_dynamic_mf_func = [this, device_num, &local_dim_keys, @@ -526,6 +553,43 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { device_dim_mutex[dev][j]->unlock(); } }; + + if (multi_mf_dim_) { + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i * multi_mf_dim_ + j] = + std::thread(build_pull_dynamic_mf_func, i, j); + } + } + for (std::thread& t : threads) { + t.join(); + } + } + timeline.Pause(); + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + << " seconds."; +} + +void PSGPUWrapper::PrepareGPUTask(std::shared_ptr gpu_task) { + platform::Timer timeline; + int device_num = heter_devices_.size(); + std::vector threads; + std::vector> task_futures; + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; + + auto& device_keys = gpu_task->device_keys_; + auto& device_vals = gpu_task->device_values_; + // auto& device_mutex = gpu_task->mutex_; + + timeline.Start(); + std::vector>> pass_values; + + bool record_status = false; + auto& device_task_keys = gpu_task->device_task_keys_; + auto& device_task_ptrs = gpu_task->device_task_ptr_; + auto build_func = [device_num, record_status, &pass_values, @@ -653,18 +717,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #endif VLOG(3) << "GpuPs build hbmps done"; }; - - if (multi_mf_dim_) { - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i * multi_mf_dim_ + j] = - std::thread(build_pull_dynamic_mf_func, i, j); - } - } - for (std::thread& t : threads) { - t.join(); - } - } else { + if (!multi_mf_dim_) { for (int i = 0; i < thread_keys_shard_num_; i++) { for (int j = 0; j < device_num; j++) { task_futures.emplace_back( @@ -683,8 +736,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { int device_num = heter_devices_.size(); - platform::Timer timeline; - timeline.Start(); + platform::Timer stagetime; + stagetime.Start(); std::vector feature_keys_count(device_num); size_t size_max = 0; @@ -696,7 +749,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { << " dim index: " << j << " contains feasign nums: " << gpu_task->device_dim_ptr_[i][j].size(); } - VLOG(1) << i << " card with dynamic mf contains feasign nums total: " + VLOG(0) << i << " card 
with dynamic mf contains feasign nums total: " << feature_keys_count[i]; size_max = std::max(size_max, feature_keys_count[i]); } @@ -719,87 +772,28 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { HeterPs_->set_sparse_sgd(optimizer_config_); HeterPs_->set_embedx_sgd(optimizer_config_); #endif + stagetime.Pause(); + VLOG(0) << "card: " + << " BuildGPUTask create HeterPs_ costs: " << stagetime.ElapsedSec() + << " s."; + stagetime.Start(); - auto build_dymf_mem_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, - int j) { - this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim - << " feature_value_size:" - << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - size_t feature_value_size = - accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto build_dynamic_mf_func = [this, &gpu_task, &accessor_wrapper_ptr]( + int i, int j, size_t start, size_t end) { + // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - this->mem_pools_[i * this->multi_mf_dim_ + j] = - new MemoryPool(len, feature_value_size); - }; - auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, - int j) { - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - size_t len = device_dim_keys.size(); int mf_dim = this->index_dim_vec_[j]; size_t feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + size_t real_len = end - start; + std::shared_ptr build_values(new char[feature_value_size * real_len], + [](char* p) { delete[] p; }); + char* test_build_values = build_values.get(); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); - auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - - this->HeterPs_->build_ps(i, - device_dim_keys.data(), - cur_pool->mem(), - len, - feature_value_size, - 500000, - 2); - if (device_dim_keys.size() > 0) { - VLOG(3) << "show table: " << i - << " table kv size: " << device_dim_keys.size() - << "dim: " << mf_dim << " len: " << len; - HeterPs_->show_one_table(i); - } - delete mem_pool; - }; - int thread_num = 16; - auto build_dynamic_mf_func = [this, - &gpu_task, - thread_num, - &accessor_wrapper_ptr](int i, int j, int z) { - // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim; - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - // this->mem_pools_[i * this->multi_mf_dim_ + j] = - // new MemoryPool(len, feature_value_size); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - - // ============ add for multi-thread ================ - size_t len_per_thread = len / thread_num; - size_t remain = len % thread_num; - size_t left = 0, right = 0; - - size_t real_len = len_per_thread; - if ((size_t)z < remain) real_len++; - - if ((size_t)z < remain) { - left = z * (len_per_thread + 1); - right = left + real_len; - } else { - left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread; - 
right = left + real_len; - } - // ============ add for multi-thread ================ - - for (size_t k = left; k < right; k++) { + for (size_t k = start; k < end; k++) { #ifdef PADDLE_WITH_PSLIB - float* val = (float*)(mem_pool->mem_address(k)); + float* val = + (float*)(test_build_values + (k - start) * feature_value_size); float* ptr_val = device_dim_ptrs[k]->data(); size_t dim = device_dim_ptrs[k]->size(); val->delta_score = @@ -833,54 +827,124 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { } #endif #ifdef PADDLE_WITH_PSCORE - void* val = mem_pool->mem_address(k); + void* val = + (float*)(test_build_values + (k - start) * feature_value_size); accessor_wrapper_ptr->BuildFill( val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim); #endif } + task_info task; + task.build_values = build_values; + task.offset = start; + task.device_id = i; + task.multi_mf_dim = j; + task.start = 0; + task.end = int(real_len); + cpu_reday_channels_[i]->Put(task); }; - threads.resize(device_num * multi_mf_dim_); - for (int i = 0; i < device_num; i++) { + auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + std::vector threads(multi_mf_dim_); for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_dymf_mem_pool, i, j); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + size_t len = device_dim_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + this->hbm_pools_[i * this->multi_mf_dim_ + j]->reset(len, + feature_value_size); + + auto build_ps_thread = + [this, &gpu_task]( + int i, int j, size_t len, size_t feature_value_size) { + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + this->HeterPs_->build_ps( + i, + device_dim_keys.data(), + this->hbm_pools_[i * this->multi_mf_dim_ + j]->mem(), + len, + feature_value_size, + 500000, + 2); + if (device_dim_keys.size() > 0) { + VLOG(3) << "show table: " << i + << " table kv size: " << device_dim_keys.size() + << "dim: " << this->index_dim_vec_[j] << " len: " << len; + HeterPs_->show_one_table(i); + } + }; + threads[j] = std::thread(build_ps_thread, i, j, len, feature_value_size); } - } - for (std::thread& t : threads) { - t.join(); - } - threads.clear(); + struct task_info task; + while (cpu_reday_channels_[i]->Get(task)) { + auto hbm = this->hbm_pools_[task.device_id * this->multi_mf_dim_ + + task.multi_mf_dim] + ->mem(); + int mf_dim = this->index_dim_vec_[task.multi_mf_dim]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + auto hbm_start = hbm + task.offset * feature_value_size; + CUDA_CHECK( + cudaMemcpy(hbm_start, + task.build_values.get() + task.start * feature_value_size, + (task.end - task.start) * feature_value_size, + cudaMemcpyHostToDevice)); + } + platform::Timer stagetime; + stagetime.Start(); + for (std::thread& t : threads) { + t.join(); + } + stagetime.Pause(); + VLOG(0) << "card: " << i + << " BuildGPUTask build_ps async costs: " << stagetime.ElapsedSec() + << " s."; + }; + + std::vector> cpu_task_futures; + std::vector> gpu_task_futures; - // multi-thread process - threads.resize(device_num * multi_mf_dim_ * thread_num); + int once_gpu_copy = 64 * 1024; + threads.resize(device_num * multi_mf_dim_); for (int i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Open(); + gpu_task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(build_dymf_hbm_pool, i)); for (int j = 0; j < 
multi_mf_dim_; j++) { - for (int k = 0; k < thread_num; k++) { - threads[(i + j * device_num) * thread_num + k] = - std::thread(build_dynamic_mf_func, i, j, k); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + size_t len = device_dim_keys.size(); + size_t start = 0; + size_t end = 0; + while (end < len) { + start = end; + end = end + once_gpu_copy < len ? (end + once_gpu_copy) : len; + cpu_task_futures.emplace_back(cpu_work_pool_[i]->enqueue( + build_dynamic_mf_func, i, j, start, end)); } } } - for (std::thread& t : threads) { - t.join(); + stagetime.Start(); + for (auto& f : cpu_task_futures) { + f.wait(); } - threads.clear(); - threads.resize(device_num * multi_mf_dim_); + cpu_task_futures.clear(); + stagetime.Pause(); + VLOG(0) << " BuildGPUTask build_dynamic_mf_func " + << " cost " << stagetime.ElapsedSec() << " s."; for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_dymf_hbm_pool, i, j); - } + cpu_reday_channels_[i]->Close(); } - for (std::thread& t : threads) { - t.join(); + stagetime.Start(); + for (auto& f : gpu_task_futures) { + f.wait(); } - threads.clear(); - - timeline.Pause(); - VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() - << " s."; + gpu_task_futures.clear(); + stagetime.Pause(); + VLOG(0) << " BuildGPUTask build_dymf_hbm_pool " + << " cost " << stagetime.ElapsedSec() << " s."; } void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { @@ -890,17 +954,25 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { dataset_->LoadIntoMemory(); timer.Pause(); VLOG(0) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s"; - + gpu_graph_mode_ = dataset_->GetGpuGraphMode(); + if (dataset_->GetMemoryDataSize() == 0) { + VLOG(0) << "GetMemoryDataSize == 0"; + return; + } // local shuffle if (is_shuffle) { dataset_->LocalShuffle(); } - InitSlotInfo(); - gpu_graph_mode_ = dataset_->GetGpuGraphMode(); - std::shared_ptr gpu_task = gpu_task_pool_.Get(); - gpu_task->Reset(); - data_ready_channel_->Put(gpu_task); + InitSlotInfo(); + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); + gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); + data_ready_channel_->Put(gpu_task); + } else if (hbm_sparse_table_initialized_ == false) { + SparseTableToHbm(); + } VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; } @@ -909,6 +981,7 @@ void PSGPUWrapper::start_build_thread() { running_ = true; VLOG(3) << "start build CPU ps thread."; pre_build_threads_ = std::thread([this] { pre_build_thread(); }); + buildpull_threads_ = std::thread([this] { build_pull_thread(); }); } void PSGPUWrapper::pre_build_thread() { @@ -931,6 +1004,27 @@ void PSGPUWrapper::pre_build_thread() { VLOG(3) << "build cpu thread end"; } +void PSGPUWrapper::build_pull_thread() { + while (running_) { + std::shared_ptr gpu_task = nullptr; + if (!buildcpu_ready_channel_->Get(gpu_task)) { + continue; + } + VLOG(3) << "thread build pull start."; + platform::Timer timer; + timer.Start(); + // build cpu ps data process + BuildPull(gpu_task); + if (multi_mf_dim_) { + divide_to_device(gpu_task); + } + timer.Pause(); + VLOG(1) << "thread BuildPull end, cost time: " << timer.ElapsedSec() << "s"; + buildpull_ready_channel_->Put(gpu_task); + } + VLOG(3) << "build pull thread end"; +} + void PSGPUWrapper::build_task() { // build_task: build_pull + build_gputask std::shared_ptr gpu_task = nullptr; @@ -939,24 +1033,29 @@ void 
PSGPUWrapper::build_task() { return; } // ins and pre_build end - if (!buildcpu_ready_channel_->Get(gpu_task)) { + if (!buildpull_ready_channel_->Get(gpu_task)) { return; } - VLOG(0) << "BuildPull start."; + VLOG(0) << "PrepareGPUTask start."; platform::Timer timer; timer.Start(); - BuildPull(gpu_task); + if (!multi_mf_dim_) { + PrepareGPUTask(gpu_task); + } BuildGPUTask(gpu_task); timer.Pause(); - VLOG(0) << "BuildPull + BuildGPUTask end, cost time: " << timer.ElapsedSec() - << "s"; + VLOG(0) << "PrepareGPUTask + BuildGPUTask end, cost time: " + << timer.ElapsedSec() << "s"; current_task_ = gpu_task; } void PSGPUWrapper::BeginPass() { platform::Timer timer; + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + return; + } timer.Start(); if (current_task_) { PADDLE_THROW( @@ -982,12 +1081,59 @@ void PSGPUWrapper::BeginPass() { } void PSGPUWrapper::EndPass() { + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + return; + } + platform::Timer stagetime; + stagetime.Start(); + HbmToSparseTable(); + stagetime.Pause(); + VLOG(0) << "EndPass HbmToSparseTable cost time: " << stagetime.ElapsedSec() + << "s"; + + gpu_task_pool_.Push(current_task_); + current_task_ = nullptr; + gpu_free_channel_->Put(current_task_); + // fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); +} + +void PSGPUWrapper::SparseTableToHbm() { + std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); + size_t device_num = heter_devices_.size(); + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); + gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto node_to_id = gpu_graph_ptr->feature_to_id; + auto edge_to_id = gpu_graph_ptr->edge_to_id; + std::vector vec_data = gpu_graph_ptr->get_graph_total_keys(); + + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); + } + } + + add_key_to_local(vec_data); + add_key_to_gputask(gpu_task); + BuildPull(gpu_task); + if (!multi_mf_dim_) { + PrepareGPUTask(gpu_task); + } else { + divide_to_device(gpu_task); + } + BuildGPUTask(gpu_task); + current_task_ = gpu_task; + hbm_sparse_table_initialized_ = true; +} + +void PSGPUWrapper::HbmToSparseTable() { if (!current_task_) { PADDLE_THROW( platform::errors::Fatal("[EndPass] current task has been ended.")); } - platform::Timer timer; - timer.Start(); size_t keysize_max = 0; // in case of feasign_num = 0, skip dump_to_cpu @@ -997,86 +1143,125 @@ void PSGPUWrapper::EndPass() { std::max(keysize_max, current_task_->device_dim_keys_[i][j].size()); } } - int thread_num = 8; auto accessor_wrapper_ptr = GlobalAccessorFactory::GetInstance().GetAccessorWrapper(); - auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr]( - int i, int j, int z) { + // auto fleet_ptr = FleetWrapper::GetInstance(); + // fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); + + int once_cpu_num = 16 * 1024; + int once_gpu_copy = 8 * once_cpu_num; + + auto dump_pool_to_cpu_func = [this, &accessor_wrapper_ptr, once_cpu_num]( + int i, int j, size_t start, size_t end) { PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - auto& device_keys = this->current_task_->device_dim_keys_[i][j]; - size_t len = 
device_keys.size(); - // ====== multi-thread process feasign================ - int len_per_thread = len / thread_num; - int remain = len % thread_num; - int left = -1, right = -1; - int real_len = len_per_thread; - if (z < remain) real_len++; - if (z < remain) { - left = z * (len_per_thread + 1); - right = left + real_len; - } else { - left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread; - right = left + real_len; - } + size_t real_len = end - start; // ============ multi-thread process feasign============ int mf_dim = this->index_dim_vec_[j]; size_t feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim - << " key_len :" << len - << " feature_value_size:" << feature_value_size; - char* test_build_values = (char*)malloc(feature_value_size * real_len); - uint64_t offset = left * feature_value_size; + + std::shared_ptr build_values(new char[feature_value_size * real_len], + [](char* p) { delete[] p; }); + uint64_t offset = start * feature_value_size; + char* test_build_values = build_values.get(); + cudaMemcpy(test_build_values, hbm_pool->mem() + offset, feature_value_size * real_len, cudaMemcpyDeviceToHost); - CHECK(len == hbm_pool->capacity()); - uint64_t unuse_key = std::numeric_limits::max(); - for (int i = left; i < right; ++i) { - if (device_keys[i] == unuse_key) { - continue; - } - size_t local_offset = (i - left) * feature_value_size; - float* gpu_val = (float*)(test_build_values + local_offset); + for (size_t k = 0; k * once_cpu_num < real_len; k++) { + struct task_info task; + task.build_values = build_values; + task.offset = start; + task.device_id = i; + task.multi_mf_dim = j; + task.start = k * once_cpu_num; + task.end = (k + 1) * once_cpu_num < real_len ? 
((k + 1) * once_cpu_num) + : (real_len); + cpu_reday_channels_[i]->Put(task); + } + }; + auto cpu_func = [this, &accessor_wrapper_ptr](int j) { + struct task_info task; + while (cpu_reday_channels_[j]->Get(task)) { + auto& device_keys = + this->current_task_ + ->device_dim_keys_[task.device_id][task.multi_mf_dim]; + char* test_build_values = task.build_values.get(); + int mf_dim = this->index_dim_vec_[task.multi_mf_dim]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + uint64_t unuse_key = std::numeric_limits::max(); + for (int i = task.start; i < task.end; ++i) { + if (device_keys[i + task.offset] == unuse_key) { + continue; + } + size_t local_offset = i * feature_value_size; + float* gpu_val = (float*)(test_build_values + local_offset); #ifdef PADDLE_WITH_PSLIB - // TODO: PSLIB DumpFill + // TODO: PSLIB DumpFill #endif #ifdef PADDLE_WITH_PSCORE - accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); + accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); #endif + } } - free(test_build_values); }; + platform::Timer timer; + timer.Start(); + std::vector> cpu_task_futures; + std::vector> gpu_task_futures; + size_t thread_num = 16; + size_t device_num = heter_devices_.size(); if (multi_mf_dim_) { VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_ * thread_num); for (size_t i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Open(); for (int j = 0; j < multi_mf_dim_; j++) { - for (int k = 0; k < thread_num; k++) { - threads[(i + j * device_num) * thread_num + k] = - std::thread(dump_pool_to_cpu_func, i, j, k); + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; + size_t len = device_keys.size(); + size_t start = 0; + size_t end = 0; + while (end < len) { + start = end; + end = end + once_gpu_copy < len ? (end + once_gpu_copy) : len; + gpu_task_futures.emplace_back(hbm_thread_pool_[i]->enqueue( + dump_pool_to_cpu_func, i, j, start, end)); } } + for (size_t j = 0; j < thread_num; j++) { + cpu_task_futures.emplace_back(cpu_work_pool_[i]->enqueue(cpu_func, i)); + } } - for (std::thread& t : threads) { - t.join(); - } } + for (auto& f : gpu_task_futures) { + f.wait(); + } + timer.Pause(); + VLOG(0) << " EndPass dump_pool_to_cpu_func " + << " cost " << timer.ElapsedSec() << " s."; + for (size_t i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Close(); + } + gpu_task_futures.clear(); + timer.Start(); + for (auto& f : cpu_task_futures) { + f.wait(); + } + cpu_task_futures.clear(); + timer.Pause(); + VLOG(0) << " EndPass cpu_func " + << " cost " << timer.ElapsedSec() << " s."; if (keysize_max != 0) { HeterPs_->end_pass(); } +} - for (size_t i = 0; i < hbm_pools_.size(); i++) { - delete hbm_pools_[i]; +void PSGPUWrapper::DumpToMem() { + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + this->HbmToSparseTable(); } - gpu_task_pool_.Push(current_task_); - current_task_ = nullptr; - gpu_free_channel_->Put(current_task_); - timer.Pause(); - VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index c48cf3347573a..eff966e1f1616 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -36,6 +36,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/fleet/heter_context.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/framework/heter_util.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h" @@ -63,6 +64,7 @@ limitations under the License. */ #include "downpour_accessor.h" // NOLINT #endif #include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -96,6 +98,15 @@ class AfsWrapper { }; #endif +struct task_info { + std::shared_ptr build_values; + size_t offset; + int device_id; + int multi_mf_dim; + int start; + int end; +}; + class PSGPUWrapper { class DCacheBuffer { public: @@ -188,27 +199,45 @@ class PSGPUWrapper { int total_len, int* key2slot); + + void divide_to_device(std::shared_ptr gpu_task); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); void BuildPull(std::shared_ptr gpu_task); + void PrepareGPUTask(std::shared_ptr gpu_task); void LoadIntoMemory(bool is_shuffle); void BeginPass(); void EndPass(); + void add_key_to_local(const std::vector & keys); + void add_key_to_gputask(std::shared_ptr gpu_task); + void SparseTableToHbm(); + void HbmToSparseTable(); void start_build_thread(); void pre_build_thread(); + void build_pull_thread(); void build_task(); + void DumpToMem(); void Finalize() { VLOG(3) << "PSGPUWrapper Begin Finalize."; if (s_instance_ == nullptr) { return; } + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + this->EndPass(); + } + for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; + } data_ready_channel_->Close(); buildcpu_ready_channel_->Close(); + buildpull_ready_channel_->Close(); gpu_free_channel_->Close(); running_ = false; VLOG(3) << "begin stop pre_build_threads_"; pre_build_threads_.join(); + VLOG(3) << "begin stop buildpull_threads_"; + buildpull_threads_.join(); s_instance_ = nullptr; VLOG(3) << "PSGPUWrapper Finalize Finished."; HeterPs_->show_table_collisions(); @@ -278,9 +307,17 @@ class PSGPUWrapper { data_ready_channel_->SetCapacity(3); buildcpu_ready_channel_->Open(); buildcpu_ready_channel_->SetCapacity(3); + buildpull_ready_channel_->Open(); + buildpull_ready_channel_->SetCapacity(1); gpu_free_channel_->Open(); gpu_free_channel_->SetCapacity(1); + cpu_reday_channels_.resize(dev_ids.size()); + for (size_t i = 0; i < dev_ids.size(); i++) { + cpu_reday_channels_[i] = paddle::framework::MakeChannel(); + cpu_reday_channels_[i]->SetCapacity(16); + } + current_task_ = nullptr; gpu_free_channel_->Put(current_task_); @@ -378,6 +415,11 @@ class PSGPUWrapper { hbm_thread_pool_[i].reset(new ::ThreadPool(1)); } + cpu_work_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { + cpu_work_pool_[i].reset(new ::ThreadPool(16)); + } + auto sparse_table_accessor = sparse_table.accessor(); auto sparse_table_accessor_parameter = sparse_table_accessor.ctr_accessor_param(); @@ -589,6 +631,10 @@ class PSGPUWrapper { dim_index_map[index_dim_vec_[i]] = i; } hbm_pools_.resize(resource_->total_device() * num_of_dim); + for (size_t i = 0; i < hbm_pools_.size(); i++) { + hbm_pools_[i] = new HBMMemoryPoolFix(); + } + mem_pools_.resize(resource_->total_device() * num_of_dim); max_mf_dim_ = index_dim_vec_.back(); multi_mf_dim_ = (dim_index_map.size() >= 1) ? 
dim_index_map.size() : 0; @@ -686,6 +732,7 @@ class PSGPUWrapper { int month_; int day_; bool slot_info_initialized_ = false; + bool hbm_sparse_table_initialized_ = false; int use_afs_api_ = 0; int optimizer_type_ = 1; std::string accessor_class_; @@ -696,7 +743,7 @@ class PSGPUWrapper { #ifdef PADDLE_WITH_CUDA std::vector mem_pools_; - std::vector hbm_pools_; // in multi mfdim, one table need hbm + std::vector hbm_pools_; // in multi mfdim, one table need hbm // pools of totol dims number #endif @@ -712,11 +759,19 @@ class PSGPUWrapper { paddle::framework::ChannelObject>> gpu_free_channel_ = paddle::framework::MakeChannel>(); + std::shared_ptr< + paddle::framework::ChannelObject>> + buildpull_ready_channel_ = + paddle::framework::MakeChannel>(); + std::vector>> cpu_reday_channels_ ; std::shared_ptr current_task_ = nullptr; std::thread pre_build_threads_; + std::thread buildpull_threads_; bool running_ = false; std::vector> pull_thread_pool_; std::vector> hbm_thread_pool_; + std::vector> cpu_work_pool_; OptimizerConfig optimizer_config_; protected: diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index ceed8cb6bfa63..96a473be1aa8c 100755 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -48,6 +48,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, places_.push_back(place); } #endif + user_define_dump_filename_ = trainer_desc.user_define_dump_filename(); // get filelist from trainer_desc here const std::vector readers = dataset->GetReaders(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 27c7563fee840..d2ef1f238de8a 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -49,7 +49,12 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( : underlying_allocator_(underlying_allocator), alignment_(alignment), chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), - allow_free_idle_chunk_(allow_free_idle_chunk) {} + allow_free_idle_chunk_(allow_free_idle_chunk) { + total_alloc_times_ = 0; + total_alloc_size_ = 0; + total_free_times_ = 0; + total_free_size_ = 0; + } phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( size_t unaligned_size) { @@ -112,6 +117,8 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( VLOG(2) << "Not found and reallocate " << realloc_size << "(" << static_cast(p) << "), and remaining " << remaining_size; } + ++total_alloc_times_; + total_alloc_size_ += size; VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_; return new BlockAllocation(block_it); } @@ -126,6 +133,9 @@ void AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) { auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; + total_free_times_ += 1; + total_free_size_ += block_it->size_; + block_it->is_free_ = true; if (block_it != blocks.begin()) { @@ -176,9 +186,28 @@ uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() { ++chunk_it; } } + + Trace(); return bytes; } +void AutoGrowthBestFitAllocator::Trace() const { + size_t cur_idle_bytes = 0; + auto it = free_blocks_.begin(); + for (; it != free_blocks_.end(); ++it) { + cur_idle_bytes += it->second->size_; + } + + VLOG(0) << "alloc:" << total_alloc_size_ / double(1024*1024) + << "m free:" << total_free_size_ / double(1024*1024) + << "m busy:" << 
(total_alloc_size_ - total_free_size_) / double(1024*1024) + << "m idle:" << cur_idle_bytes / double(1024*1024) + << "m alloc_times:" << total_alloc_times_ + << " free_times:" << total_free_times_ + << " free_blocks_num:" << free_blocks_.size() + << " curr_chunks_num:" << chunks_.size(); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index dadf751bdfa41..138f4a98c4db5 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -49,6 +49,7 @@ class AutoGrowthBestFitAllocator : public Allocator { private: uint64_t FreeIdleChunks(); + void Trace() const; template using List = std::list; @@ -93,6 +94,12 @@ class AutoGrowthBestFitAllocator : public Allocator { size_t chunk_size_; bool allow_free_idle_chunk_; + // stat info + size_t total_alloc_times_; + size_t total_alloc_size_; + size_t total_free_times_; + size_t total_free_size_; + SpinLock spinlock_; }; diff --git a/paddle/fluid/operators/shuffle_batch_op.cu b/paddle/fluid/operators/shuffle_batch_op.cu index 6b70b8d37d79c..19dc830cc430f 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cu +++ b/paddle/fluid/operators/shuffle_batch_op.cu @@ -27,6 +27,37 @@ namespace paddle { namespace operators { +struct CacheAllocator { + typedef char value_type; + CacheAllocator(platform::Place place) { + VLOG(2) << "construct allocator"; + place_ = place; + } + + ~CacheAllocator() { VLOG(2) << "destroy allocator"; } + + char *allocate(std::ptrdiff_t num_bytes) { + VLOG(2) << "allocate " << num_bytes << " bytes"; + auto storage = memory::AllocShared(place_, num_bytes); + char *ptr = reinterpret_cast(storage->ptr()); + busy_allocation_.emplace(std::make_pair(ptr, storage)); + return ptr; + } + + void deallocate(char *ptr, size_t) { + VLOG(2) << "deallocate "; + allocation_map_type::iterator iter = busy_allocation_.find(ptr); + CHECK(iter != busy_allocation_.end()); + busy_allocation_.erase(iter); + } + + private: + typedef std::unordered_map> + allocation_map_type; + allocation_map_type busy_allocation_; + platform::Place place_; +}; + template struct ReorderFunctor { ReorderFunctor(const T *x, const int64_t *shuffle_idx, T *y, int64_t stride) @@ -90,7 +121,8 @@ class ShuffleBatchCUDAKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); #ifdef PADDLE_WITH_CUDA - const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); + CacheAllocator allocator(ctx.GetPlace()); + const auto &exec_policy = thrust::cuda::par(allocator).on(dev_ctx.stream()); #else const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); #endif diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 4933cbb6cf74b..79bb41fd01f55 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -833,6 +833,18 @@ PADDLE_DEFINE_EXPORTED_bool( false, "It controls whether exit trainer when an worker has no ins."); +/** + * Distributed related FLAG + * Name: gpugraph_storage_mode + * Since Version: 2.2.0 + * Value Range: int32, default=1 + * Example: + * Note: represents the gpugraph storage mode, 1 for full hbm, 2 for hbm + mem + ssd. 
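+ * Example (illustrative addition, not part of the original change): exported
+ * Paddle flags are typically set through the environment, e.g.
+ *   export FLAGS_gpugraph_storage_mode=2   # select the hbm + mem + ssd mode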
+ */ +PADDLE_DEFINE_EXPORTED_int32(gpugraph_storage_mode, + 1, + "gpugraph storage mode, default 1"); + /** * KP kernel related FLAG * Name: FLAGS_run_kp_kernel @@ -968,7 +980,6 @@ PADDLE_DEFINE_EXPORTED_int32( PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm, true, "enable load_node_list_into_hbm, default true"); - /** * ProcessGroupNCCL related FLAG * Name: nccl_blocking_wait diff --git a/paddle/fluid/platform/monitor.cc b/paddle/fluid/platform/monitor.cc index ea6240b649cad..dd38ce7956309 100644 --- a/paddle/fluid/platform/monitor.cc +++ b/paddle/fluid/platform/monitor.cc @@ -19,6 +19,7 @@ namespace platform {} // namespace platform } // namespace paddle DEFINE_INT_STATUS(STAT_total_feasign_num_in_mem) +DEFINE_INT_STATUS(STAT_epoch_finish) DEFINE_INT_STATUS(STAT_gpu0_mem_size) DEFINE_INT_STATUS(STAT_gpu1_mem_size) DEFINE_INT_STATUS(STAT_gpu2_mem_size) diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index e902baa13532e..dc381e6a033e0 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -368,6 +368,9 @@ void BindDataset(py::module *m) { py::call_guard()) .def("set_gpu_graph_mode", &framework::Dataset::SetGpuGraphMode, + py::call_guard()) + .def("set_pass_id", + &framework::Dataset::SetPassId, py::call_guard()); py::class_(*m, "IterableDatasetWrapper") diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc old mode 100755 new mode 100644 index b11f5832d8c8a..db2ab38530a2c --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -64,6 +64,7 @@ void BindDistFleetWrapper(py::module* m) { .def("save_one_model", &FleetWrapper::SaveModelOneTable) .def("recv_and_save_model", &FleetWrapper::RecvAndSaveTable) .def("sparse_table_stat", &FleetWrapper::PrintTableStat) + .def("save_cache_table", &FleetWrapper::SaveCacheTable) .def("stop_server", &FleetWrapper::StopServer) .def("stop_worker", &FleetWrapper::FinalizeWorker) .def("barrier", &FleetWrapper::BarrierWithTable) @@ -371,7 +372,12 @@ void BindGraphGpuWrapper(py::module* m) { .def("set_up_types", &GraphGpuWrapper::set_up_types) .def("query_node_list", &GraphGpuWrapper::query_node_list) .def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf) - .def("load_edge_file", &GraphGpuWrapper::load_edge_file) + .def("load_edge_file", + py::overload_cast( + &GraphGpuWrapper::load_edge_file)) + .def("load_edge_file", + py::overload_cast( + &GraphGpuWrapper::load_edge_file)) .def("load_node_and_edge", &GraphGpuWrapper::load_node_and_edge) .def("upload_batch", py::overload_cast( @@ -395,7 +401,15 @@ void BindGraphGpuWrapper(py::module* m) { .def("get_partition", &GraphGpuWrapper::get_partition) .def("load_node_weight", &GraphGpuWrapper::load_node_weight) .def("export_partition_files", &GraphGpuWrapper::export_partition_files) - .def("load_node_file", &GraphGpuWrapper::load_node_file) + .def("load_node_file", + py::overload_cast( + &GraphGpuWrapper::load_node_file)) + .def("load_node_file", + py::overload_cast( + &GraphGpuWrapper::load_node_file)) + .def("release_graph", &GraphGpuWrapper::release_graph) + .def("release_graph_edge", &GraphGpuWrapper::release_graph_edge) + .def("release_graph_node", &GraphGpuWrapper::release_graph_node) .def("finalize", &GraphGpuWrapper::finalize); } #endif diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index e9c993d3ee128..4d7d17463e4fe 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ 
-64,6 +64,9 @@ void BindPSGPUWrapper(py::module* m) { .def("begin_pass", &framework::PSGPUWrapper::BeginPass, py::call_guard()) + .def("dump_to_mem", + &framework::PSGPUWrapper::DumpToMem, + py::call_guard()) .def("load_into_memory", &framework::PSGPUWrapper::LoadIntoMemory, py::call_guard()) diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 0cfb946d3d8ca..83f60a6e26b40 100755 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -88,3 +88,4 @@ shrink = fleet.shrink get_hybrid_communicate_group = fleet.get_hybrid_communicate_group distributed_scaler = fleet.distributed_scaler +save_cache_table = fleet.save_cache_table diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index d58770dd714ff..93acfb8042fbe 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -531,7 +531,9 @@ def fleet_desc_configs(self, configs): 'embed_sparse_beta2_decay_rate', 'embedx_sparse_optimizer', 'embedx_sparse_learning_rate', \ 'embedx_sparse_weight_bounds', 'embedx_sparse_initial_range', 'embedx_sparse_initial_g2sum', \ 'embedx_sparse_beta1_decay_rate', 'embedx_sparse_beta2_decay_rate', 'feature_learning_rate', 'nodeid_slot'] - support_sparse_table_class = ['DownpourSparseTable'] + support_sparse_table_class = [ + 'DownpourSparseTable', 'DownpourSparseSSDTable' + ] support_sparse_accessor_class = [ 'DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', @@ -623,9 +625,12 @@ def set_sparse_table_config(table_data, config): "DownpourSparseTable") if table_class not in support_sparse_table_class: raise ValueError( - "support sparse_table_class: ['DownpourSparseTable'], but actual %s" + "support sparse_table_class: ['DownpourSparseTable, DownpourSparseSSDTable'], but actual %s" % (table_class)) - table_data.table_class = 'MemorySparseTable' + if table_class == "DownpourSparseSSDTable": + table_data.table_class = 'SSDSparseTable' + else: + table_data.table_class = 'MemorySparseTable' table_data.shard_num = config.get('sparse_shard_num', 1000) accessor_class = config.get("sparse_accessor_class", diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 1a9b3f565b77a..4b9037795e067 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -906,6 +906,15 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): def save_cache_model(self, dirname, **configs): return self._runtime_handle._save_cache_model(dirname, **configs) + @is_non_distributed_check + @inited_runtime_handler + def save_cache_table(self, + table_id, + pass_id, + mem_cache_key_threshold=4000000000): + return self._runtime_handle._save_cache_table(table_id, pass_id, + mem_cache_key_threshold) + def shrink(self, threshold=None): self._runtime_handle._shrink(threshold) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index abf7eec73b8fe..5c7c1b11a27a7 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1420,6 +1420,12 @@ def _save_cache_model(self, dirname, **kwargs): fleet.util.barrier() return feasign_num + def _save_cache_table(self, table_id, pass_id, 
mem_cache_key_threshold): + if self.role_maker._is_first_worker(): + self._worker.save_cache_table(table_id, pass_id, + mem_cache_key_threshold) + fleet.util.barrier() + def _load_sparse_params(self, dirname, context, main_program, mode): distributed_varnames = get_sparse_tablenames(self.origin_main_programs, True) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 9fba7bb70f189..d84d66741344d 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -388,6 +388,7 @@ def __init__(self): self.merge_by_lineid = False self.fleet_send_sleep_seconds = None self.trainer_num = -1 + self.pass_id = 0 @deprecated(since="2.0.0", update_to="paddle.distributed.InMemoryDataset._set_feed_type") @@ -1080,8 +1081,25 @@ def set_graph_config(self, config): self.proto_desc.graph_config.meta_path = config.get("meta_path", "") self.proto_desc.graph_config.gpu_graph_training = config.get( "gpu_graph_training", True) + self.proto_desc.graph_config.train_table_cap = config.get( + "train_table_cap", 800000) + self.proto_desc.graph_config.infer_table_cap = config.get( + "infer_table_cap", 800000) self.dataset.set_gpu_graph_mode(True) + def set_pass_id(self, pass_id): + """ + set_pass_id + """ + self.pass_id = pass_id + self.dataset.set_pass_id(pass_id) + + def get_pass_id(self): + """ + get_pass_id + """ + return self.pass_id + class QueueDataset(DatasetBase): """ diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index 3ba9f9eea46d1..945b28aac88de 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -73,6 +73,9 @@ def _create_trainer(self, opt_info=None): if opt_info.get("dump_fields_path") is not None and len( opt_info.get("dump_fields_path")) != 0: trainer._set_dump_fields_path(opt_info["dump_fields_path"]) + if opt_info.get("user_define_dump_filename") is not None and len( + opt_info.get("user_define_dump_filename")) != 0: + trainer._set_user_define_dump_filename(opt_info["user_define_dump_filename"]) if opt_info.get("dump_file_num") is not None: trainer._set_dump_file_num(opt_info["dump_file_num"]) if opt_info.get("dump_converter") is not None: